diff --git a/PKG-INFO b/PKG-INFO index ee4435e..2c3b3bd 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,71 +1,71 @@ Metadata-Version: 2.1 Name: swh.indexer -Version: 2.5.0 +Version: 2.6.0 Summary: Software Heritage Content Indexer Home-page: https://forge.softwareheritage.org/diffusion/78/ Author: Software Heritage developers Author-email: swh-devel@inria.fr Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-indexer Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-indexer/ Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing License-File: LICENSE License-File: AUTHORS swh-indexer ============ Tools to compute multiple indexes on SWH's raw contents: - content: - mimetype - ctags - language - fossology-license - metadata - revision: - metadata An indexer is in charge of: - looking up objects - extracting information from those objects - storing that information in the swh-indexer db There are multiple indexers working on different object types: - content indexer: works with content sha1 hashes - revision indexer: works with revision sha1 hashes - origin indexer: works with origin identifiers Indexation procedure: - receive a batch of ids - retrieve the associated data depending on the object type - compute some index for that object - store the result in swh's storage Current content indexers: - mimetype (queue swh_indexer_content_mimetype): detect the encoding and mimetype - language (queue swh_indexer_content_language): detect the programming language - ctags (queue swh_indexer_content_ctags): compute tags information - fossology-license (queue swh_indexer_fossology_license): compute the license - metadata: translate a file into a translated_metadata dict Current revision indexers: - metadata: detects files containing metadata and retrieves translated_metadata from the content_metadata table in storage, or runs the content indexer to translate files. diff --git a/debian/changelog b/debian/changelog index bc28259..055db7d 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,1551 +1,1557 @@ -swh-indexer (2.5.0-1~swh1~bpo10+1) buster-swh; urgency=medium +swh-indexer (2.6.0-1~swh1) unstable-swh; urgency=medium - * Rebuild for buster-swh + * New upstream release 2.6.0 - (tagged by Valentin Lorentz + on 2022-09-12 10:55:11 +0200) + * Upstream changes: - v2.6.0 - * Convert SWHID to str before + passing to sentry_sdk.set_tag - * Fix various crashes - * + github: Add support for 'topics' - * npm, maven: ignore + blatantly invalid licenses and URLs - * cli: Pass all + journal_client config keys to the JournalClient - -- Software Heritage autobuilder (on jenkins-debian1) Wed, 31 Aug 2022 16:27:19 +0000 + -- Software Heritage autobuilder (on jenkins-debian1) Mon, 12 Sep 2022 09:07:01 +0000 swh-indexer (2.5.0-1~swh1) unstable-swh; urgency=medium * New upstream release 2.5.0 - (tagged by Antoine R.
Dumont (@ardumont) on 2022-08-31 18:10:38 +0200) * Upstream changes: - v2.5.0 - indexer.cli: Allow batch_size configuration on journal client -- Software Heritage autobuilder (on jenkins-debian1) Wed, 31 Aug 2022 16:20:18 +0000 swh-indexer (2.4.4-1~swh1) unstable-swh; urgency=medium * New upstream release 2.4.4 - (tagged by Valentin Lorentz on 2022-08-31 11:26:51 +0200) * Upstream changes: - v2.4.4 - * Revert "metadata: Drop unsupported key 'type'" - * rehash: Call objstorage.content_get() with a HashDict instead of single hash -- Software Heritage autobuilder (on jenkins-debian1) Wed, 31 Aug 2022 09:36:21 +0000 swh-indexer (2.4.3-1~swh2) unstable-swh; urgency=medium * Drop blocking dependency constraint and bump new version. -- Antoine R. Dumont (@ardumont) Tue, 30 Aug 2022 15:37:01 +0200 swh-indexer (2.4.3-1~swh1) unstable-swh; urgency=medium * New upstream release 2.4.3 - (tagged by Antoine R. Dumont (@ardumont) on 2022-08-30 11:09:04 +0200) * Upstream changes: - v2.4.3 - metadata: Drop unsupported key 'type' -- Software Heritage autobuilder (on jenkins-debian1) Tue, 30 Aug 2022 09:25:28 +0000 swh-indexer (2.4.2-1~swh2) unstable-swh; urgency=medium * Bump new release -- Antoine R. Dumont (@ardumont) Thu, 25 Aug 2022 14:30:54 +0200 swh-indexer (2.4.2-1~swh1) unstable-swh; urgency=medium * New upstream release 2.4.2 - (tagged by Valentin Lorentz on 2022-08-25 13:24:10 +0200) * Upstream changes: - v2.4.2 - * Re-trigger Debian build -- Software Heritage autobuilder (on jenkins-debian1) Thu, 25 Aug 2022 11:33:19 +0000 swh-indexer (2.4.1-1~swh1) unstable-swh; urgency=medium * New upstream release 2.4.1 - (tagged by Valentin Lorentz on 2022-08-25 12:22:48 +0200) * Upstream changes: - v2.4.1 - * metadata_dictionary: Fix crash on null list item in an uri_field. -- Software Heritage autobuilder (on jenkins-debian1) Thu, 25 Aug 2022 10:32:04 +0000 swh-indexer (2.4.0-1~swh1) unstable-swh; urgency=medium * New upstream release 2.4.0 - (tagged by Valentin Lorentz on 2022-08-25 11:58:05 +0200) * Upstream changes: - v2.4.0 - * metadata_dictionary: Add mappings for "*.nuspec" files - * Refactor metadata mappings using rdflib.Graph instead of JSON-LD internally - * Other internal refactorings - * metadata_dictionary: Add mapping for SWORD/Atom with Codemeta -- Software Heritage autobuilder (on jenkins-debian1) Thu, 25 Aug 2022 10:09:34 +0000 swh-indexer (2.3.0-1~swh1) unstable-swh; urgency=medium * New upstream release 2.3.0 - (tagged by Valentin Lorentz on 2022-08-10 12:16:48 +0200) * Upstream changes: - v2.3.0 - * Tag Sentry events with object ids - * Fix crashes on incorrect URLs in `@id` - * Fix crash on null characters in JSON - * Fix support of old RawExtrinsicMetadata objects with no id -- Software Heritage autobuilder (on jenkins-debian1) Wed, 10 Aug 2022 10:26:27 +0000 swh-indexer (2.2.2-1~swh1) unstable-swh; urgency=medium * New upstream release 2.2.2 - (tagged by Antoine R. Dumont (@ardumont) on 2022-07-29 13:41:43 +0200) * Upstream changes: - v2.2.2 - indexer.metadata: Warn and skip incomplete entries from the journal -- Software Heritage autobuilder (on jenkins-debian1) Fri, 29 Jul 2022 11:52:13 +0000 swh-indexer (2.2.1-1~swh1) unstable-swh; urgency=medium * New upstream release 2.2.1 - (tagged by Antoine R. 
Dumont (@ardumont) on 2022-07-29 10:56:57 +0200) * Upstream changes: - v2.2.1 - Normalize journal client indexer type names -- Software Heritage autobuilder (on jenkins-debian1) Fri, 29 Jul 2022 09:07:15 +0000 swh-indexer (2.2.0-1~swh1) unstable-swh; urgency=medium * New upstream release 2.2.0 - (tagged by Antoine R. Dumont (@ardumont) on 2022-07-25 16:23:12 +0200) * Upstream changes: - v2.2.0 - cli: Add content mimetype indexer journal client support - cli: Add fossology license indexer journal client support - cli: Add extrinsic-metadata indexer journal client support - docs: Fix incorrect terminology (term -> property) - mapping: Fix inconsistent name - Drop decommissioned content indexer: ctags, language -- Software Heritage autobuilder (on jenkins-debian1) Mon, 25 Jul 2022 14:33:50 +0000 swh-indexer (2.1.0-1~swh1) unstable-swh; urgency=medium * New upstream release 2.1.0 - (tagged by Valentin Lorentz on 2022-07-21 10:23:44 +0200) * Upstream changes: - v2.1.0 - * DirectoryIndexer: Remove incorrect assumption on object types - * docs: Explain the indexation workflow for extrinsic metadata - * docs: Update description of the metadata workflow - * metadata_dictionary: Add mappings for pubspec.yaml - * Add extrinsic metadata indexer - * Add GitHub metadata mapping - * Refactor Mapping hierarchy - * cff: Add checks for value types -- Software Heritage autobuilder (on jenkins-debian1) Thu, 21 Jul 2022 08:32:23 +0000 swh-indexer (2.0.2-1~swh1) unstable-swh; urgency=medium * New upstream release 2.0.2 - (tagged by Valentin Lorentz on 2022-06-22 12:32:41 +0200) * Upstream changes: - v2.0.2 - * Fix mypy issue with swh- journal>=1.1.0 - * cff: Ignore invalid yaml files - * npm: Add workaround for mangled package descriptions - * npm: Fix crash when npm description is not a string -- Software Heritage autobuilder (on jenkins-debian1) Wed, 22 Jun 2022 10:40:25 +0000 swh-indexer (2.0.1-1~swh1) unstable-swh; urgency=medium * New upstream release 2.0.1 - (tagged by Antoine R. Dumont (@ardumont) on 2022-06-10 10:35:15 +0200) * Upstream changes: - v2.0.1 - upgrades/134: Add missing index creation -- Software Heritage autobuilder (on jenkins-debian1) Fri, 10 Jun 2022 09:17:44 +0000 swh-indexer (2.0.0-1~swh1) unstable-swh; urgency=medium * New upstream release 2.0.0 - (tagged by Antoine R. 
Dumont (@ardumont) on 2022-06-03 15:40:32 +0200) * Upstream changes: - v2.0.0 - Set current_version attribute to postgresql datastore - Add support for indexing from head releases - Replace RevisionMetadataIndexer with DirectoryMetadataIndexer - Add support for running the server with 'postgresql' storage cls - tests: Shorten definition of REVISION - tests: Simplify definition of ORIGINS list - tests: use stock pytest_postgresql factory function - Rewrite origin_head.py as a normal function instead of an indexer - Convert test_origin_head from unittest to pytest -- Software Heritage autobuilder (on jenkins-debian1) Fri, 03 Jun 2022 13:59:59 +0000 swh-indexer (1.2.0-1~swh1) unstable-swh; urgency=medium * New upstream release 1.2.0 - (tagged by Valentin Lorentz on 2022-06-01 16:44:30 +0200) * Upstream changes: - v1.2.0 - * cli: Add support for running "all" indexers in the journal client -- Software Heritage autobuilder (on jenkins-debian1) Wed, 01 Jun 2022 15:08:39 +0000 swh-indexer (1.1.0-1~swh1) unstable-swh; urgency=medium * New upstream release 1.1.0 - (tagged by Valentin Lorentz on 2022-05-30 15:56:19 +0200) * Upstream changes: - v1.1.0 - * Add support for indexing directly from the journal client - * cff: Do not change yaml.SafeLoader globally - * add missing sentry captures - * Change misleading documentation in swh-indexer/cli.py - * test and typing maintenance -- Software Heritage autobuilder (on jenkins-debian1) Mon, 30 May 2022 14:03:54 +0000 swh-indexer (1.0.0-1~swh1) unstable-swh; urgency=medium * New upstream release 1.0.0 - (tagged by David Douard on 2022-02-24 17:35:56 +0100) * Upstream changes: - v1.0.0 -- Software Heritage autobuilder (on jenkins-debian1) Thu, 24 Feb 2022 16:42:39 +0000 swh-indexer (0.8.2-1~swh1) unstable-swh; urgency=medium * New upstream release 0.8.2 - (tagged by Valentin Lorentz on 2022-01-12 13:53:22 +0100) * Upstream changes: - v0.8.2 - * tests: Use TimestampWithTimezone.from_datetime() instead of the constructor - * docs: Use reference instead of absolute link -- Software Heritage autobuilder (on jenkins-debian1) Wed, 12 Jan 2022 12:56:56 +0000 swh-indexer (0.8.1-1~swh1) unstable-swh; urgency=medium * New upstream release 0.8.1 - (tagged by Vincent SELLIER on 2021-12-21 16:23:37 +0100) * Upstream changes: - v0.8.1 - Changelog: - tag frozendict version to avoid segfaults on the ci -- Software Heritage autobuilder (on jenkins-debian1) Tue, 21 Dec 2021 15:28:27 +0000 swh-indexer (0.8.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.8.0 - (tagged by Antoine R. Dumont (@ardumont) on 2021-05-28 16:57:47 +0200) * Upstream changes: - v0.8.0 - metadata_dictionary: Add mapping for CITATION.cff - metadata/maven: Ignore ill-formed xml instead of failing - metadata: Fix UnboundLocalError in edge case - data/codemeta: sync with official codemeta repo - Fix SingleFileMapping case sensitivity - Use swh.core 0.14 - tox: Add sphinx environments to check sane doc build -- Software Heritage autobuilder (on jenkins-debian1) Fri, 28 May 2021 15:05:39 +0000 swh-indexer (0.7.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.7.0 - (tagged by Antoine R. Dumont (@ardumont) on 2021-02-03 14:10:16 +0100) * Upstream changes: - v0.7.0 - Adapt origin_get_latest_visit_status according to latest api change -- Software Heritage autobuilder (on jenkins-debian1) Wed, 03 Feb 2021 13:15:37 +0000 swh-indexer (0.6.4-1~swh1) unstable-swh; urgency=medium * New upstream release 0.6.4 - (tagged by Antoine R. 
Dumont (@ardumont) on 2021-02-01 15:06:04 +0100) * Upstream changes: - v0.6.4 - indexer: Remove pagination logic using stream_results() instead. - ContentPartitionIndexer: Do not index the same content multiple times at once. - Add a cli section in the doc - test_journal_client_cli: Send production objects to journal - test_journal_client: Migrate away from mocks - tests: Use production backends within the indexer tests -- Software Heritage autobuilder (on jenkins-debian1) Mon, 01 Feb 2021 14:10:18 +0000 swh-indexer (0.6.3-1~swh1) unstable-swh; urgency=medium * New upstream release 0.6.3 - (tagged by Antoine R. Dumont (@ardumont) on 2020-11-27 14:42:30 +0100) * Upstream changes: - v0.6.3 - storage.writer: Fix journal writer sanitizer function -- Software Heritage autobuilder (on jenkins-debian1) Fri, 27 Nov 2020 13:46:03 +0000 swh-indexer (0.6.2-1~swh1) unstable-swh; urgency=medium * New upstream release 0.6.2 - (tagged by Antoine R. Dumont (@ardumont) on 2020-11-27 13:55:53 +0100) * Upstream changes: - v0.6.2 - BaseRow.unique_key: Don't crash when indexer_configuration_id is None. - idx.storage.JournalWriter: pass value_sanitizer to get_journal_writer. -- Software Heritage autobuilder (on jenkins-debian1) Fri, 27 Nov 2020 13:00:28 +0000 swh-indexer (0.6.1-1~swh1) unstable-swh; urgency=medium * New upstream release 0.6.1 - (tagged by Antoine R. Dumont (@ardumont) on 2020-11-27 10:43:14 +0100) * Upstream changes: - v0.6.1 - Fix test within the debian package builds - refactor tests to pytest -- Software Heritage autobuilder (on jenkins-debian1) Fri, 27 Nov 2020 09:49:35 +0000 swh-indexer (0.6.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.6.0 - (tagged by Antoine R. Dumont (@ardumont) on 2020-11-26 17:08:03 +0100) * Upstream changes: - v0.6.0 - indexer.journal_client: Subscribe to OriginVisitStatus topic - swh.indexer.cli.journal_client: ensure the minimal configuration exists - Drop all deprecated uses of `args` in component factories - Drop vcversioner from requirements - Make the indexer storage write to the journal. -- Software Heritage autobuilder (on jenkins-debian1) Thu, 26 Nov 2020 16:39:45 +0000 swh-indexer (0.5.0-2~swh1) unstable-swh; urgency=medium * Move distutils package from python3-swh.indexer to python3-swh.indexer.storage. -- Nicolas Dandrimont Wed, 18 Nov 2020 20:04:23 +0100 swh-indexer (0.5.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.5.0 - (tagged by Valentin Lorentz on 2020-11-06 15:25:04 +0100) * Upstream changes: - v0.5.0 - * Remove metadata deletion endpoints and algorithms - * Remove conflict_update/policy_update option from BaseIndexer.run() - * Remove conflict_update option from _add() endpoints. -- Software Heritage autobuilder (on jenkins-debian1) Fri, 06 Nov 2020 14:28:05 +0000 swh-indexer (0.4.2-1~swh1) unstable-swh; urgency=medium * New upstream release 0.4.2 - (tagged by Antoine R. Dumont (@ardumont) on 2020-10-30 17:22:22 +0100) * Upstream changes: - v0.4.2 - tests.conftest: Fix the indexer scheduler initialization - indexer.cli: Fix missing retries_left parameter - Rename sql files according to new conventions -- Software Heritage autobuilder (on jenkins-debian1) Fri, 30 Oct 2020 16:24:14 +0000 swh-indexer (0.4.1-1~swh1) unstable-swh; urgency=medium * New upstream release 0.4.1 - (tagged by Antoine R. 
Dumont (@ardumont) on 2020-10-16 10:48:51 +0200) * Upstream changes: - v0.4.1 - test_cli: Remove unneeded config args parameter - api.server: Align configuration structure with clients configuration - storage.api.server: Add types to module and refactor tests -- Software Heritage autobuilder (on jenkins-debian1) Fri, 16 Oct 2020 08:59:09 +0000 swh-indexer (0.4.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.4.0 - (tagged by Antoine R. Dumont (@ardumont) on 2020-10-15 18:17:59 +0200) * Upstream changes: - v0.4.0 - swh.indexer.storage: Unify get_indexer_storage function with others -- Software Heritage autobuilder (on jenkins-debian1) Thu, 15 Oct 2020 16:19:01 +0000 swh-indexer (0.3.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.3.0 - (tagged by Valentin Lorentz on 2020-10-08 13:33:02 +0200) * Upstream changes: - v0.3.0 - * Make indexer-storage endpoints use attr-based classes instead of dicts - * Add more typing to indexers and their tests -- Software Heritage autobuilder (on jenkins-debian1) Thu, 08 Oct 2020 11:35:50 +0000 swh-indexer (0.2.4-1~swh1) unstable-swh; urgency=medium * New upstream release 0.2.4 - (tagged by David Douard on 2020-09-25 12:49:04 +0200) * Upstream changes: - v0.2.4 -- Software Heritage autobuilder (on jenkins-debian1) Fri, 25 Sep 2020 10:51:28 +0000 swh-indexer (0.2.3-1~swh1) unstable-swh; urgency=medium * New upstream release 0.2.3 - (tagged by David Douard on 2020-09-11 15:12:01 +0200) * Upstream changes: - v0.2.3 -- Software Heritage autobuilder (on jenkins-debian1) Fri, 11 Sep 2020 13:15:41 +0000 swh-indexer (0.2.2-1~swh1) unstable-swh; urgency=medium * New upstream release 0.2.2 - (tagged by Antoine R. Dumont (@ardumont) on 2020-09-04 13:21:19 +0200) * Upstream changes: - v0.2.2 - metadata: Adapt to latest storage revision_get change - Tell pytest not to recurse in dotdirs. -- Software Heritage autobuilder (on jenkins-debian1) Fri, 04 Sep 2020 11:33:41 +0000 swh-indexer (0.2.1-1~swh1) unstable-swh; urgency=medium * New upstream release 0.2.1 - (tagged by Valentin Lorentz on 2020-08-20 12:59:53 +0200) * Upstream changes: - v0.2.1 - * indexer.rehash: Adapt content_get_metadata call to content_get - * origin_head: Use snapshot_get_all_branches instead of snapshot_get. - * Import SortedList, db_transaction_generator, and db_transaction from swh- core instead of swh-storage. - * tests: remove invalid assertion -- Software Heritage autobuilder (on jenkins-debian1) Thu, 20 Aug 2020 11:03:58 +0000 swh-indexer (0.2.0-1~swh2) unstable-swh; urgency=medium * Bump dependencies -- Antoine R. Dumont (@ardumont) Wed, 06 Aug 2020 13:28:00 +0200 swh-indexer (0.2.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.2.0 - (tagged by Antoine R. Dumont (@ardumont) on 2020-08-06 15:12:44 +0200) * Upstream changes: - v0.2.0 - Make content indexer work on partition of ids -- Software Heritage autobuilder (on jenkins-debian1) Thu, 06 Aug 2020 13:14:35 +0000 swh-indexer (0.1.1-1~swh1) unstable-swh; urgency=medium * New upstream release 0.1.1 - (tagged by Antoine R. 
Dumont (@ardumont) on 2020-07-28 12:42:19 +0200) * Upstream changes: - v0.1.1 - setup.py: Migrate from vcversioner to setuptools-scm - MANIFEST: Include missing conftest.py requirement - metadata: Update swh.storage.origin_get call to latest api change - Drop unsupported "validate" proxy - tests: Drop deprecated storage.origin_add_one use - Drop useless use of pifpaf - Clean up the swh.scheduler and swh.storage pytest plugin imports - tests: Drop obsolete origin visit fields -- Software Heritage autobuilder (on jenkins-debian1) Tue, 28 Jul 2020 10:44:54 +0000 swh-indexer (0.1.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.1.0 - (tagged by Antoine R. Dumont (@ardumont) on 2020-06-23 15:44:15 +0200) * Upstream changes: - v0.1.0 - origin_head: Retrieve snapshot out of the last visit status - Fix tests according to latest internal api changes -- Software Heritage autobuilder (on jenkins-debian1) Tue, 23 Jun 2020 13:46:23 +0000 swh-indexer (0.0.171-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.171 - (tagged by Antoine R. Dumont (@ardumont) on 2020-04-23 16:46:52 +0200) * Upstream changes: - v0.0.171 - cli: Adapt journal client instantiation according to latest change - codemeta: Add compatibility with PyLD >= 2.0.0. - setup: Update the minimum required runtime python3 version - Add a pyproject.toml file to target py37 for black - Enable black - test: make test data properly typed - indexer.cli.journal_client: Simplify the journal client call - Remove type from origin_add calls - Rename --max-messages to --stop-after-objects. - tests: Migrate to latest swh-storage api change -- Software Heritage autobuilder (on jenkins-debian1) Thu, 23 Apr 2020 14:49:17 +0000 swh-indexer (0.0.170-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.170 - (tagged by Antoine R. Dumont (@ardumont) on 2020-03-08 09:57:39 +0100) * Upstream changes: - v0.0.170 - indexer.metadata: Make compatible old task format -- Software Heritage autobuilder (on jenkins-debian1) Sun, 08 Mar 2020 09:03:59 +0000 swh-indexer (0.0.169-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.169 - (tagged by Antoine R. Dumont (@ardumont) on 2020-03-06 15:19:21 +0100) * Upstream changes: - v0.0.169 - storage: Add @timed metrics on remaining indexer storage endpoints - indexer.storage: Use the correct metrics module - idx.storage: Add time and counter metric to idx_configuration_add - indexer.storage: Remove redundant calls to send_metric - indexer: Fix mypy issues -- Software Heritage autobuilder (on jenkins-debian1) Fri, 06 Mar 2020 14:24:50 +0000 swh-indexer (0.0.168-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.168 - (tagged by Antoine R. Dumont (@ardumont) on 2020-03-05 15:48:32 +0100) * Upstream changes: - v0.0.168 - mimetype: Make the parsing more resilient - storage.fossology_license_add: Fix one insert query too many - tests: Migrate some tests to pytest -- Software Heritage autobuilder (on jenkins-debian1) Thu, 05 Mar 2020 14:52:27 +0000 swh-indexer (0.0.167-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.167 - (tagged by Antoine R. 
Dumont (@ardumont) on 2020-03-04 16:33:20 +0100) * Upstream changes: - v0.0.167 - indexer (revision, origin): Fix indexer summary to output a status -- Software Heritage autobuilder (on jenkins-debian1) Wed, 04 Mar 2020 15:37:59 +0000 swh-indexer (0.0.166-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.166 - (tagged by Valentin Lorentz on 2020-03-04 15:46:37 +0100) * Upstream changes: - v0.0.166 - * Fix merging documents with @list elements. -- Software Heritage autobuilder (on jenkins-debian1) Wed, 04 Mar 2020 14:50:54 +0000 swh-indexer (0.0.165-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.165 - (tagged by Antoine R. Dumont (@ardumont) on 2020-03-04 15:29:52 +0100) * Upstream changes: - v0.0.165 - indexers: Fix summary computation for range indexers - tests: Use assertEqual instead of deprecated assertEquals -- Software Heritage autobuilder (on jenkins-debian1) Wed, 04 Mar 2020 14:33:09 +0000 swh-indexer (0.0.164-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.164 - (tagged by Antoine R. Dumont (@ardumont) on 2020-03-04 13:52:15 +0100) * Upstream changes: - v0.0.164 - range-indexers: Fix hard- coded summary key value - indexers: Improve persist_index_computations type - indexer.metadata: Fix wrong update -- Software Heritage autobuilder (on jenkins-debian1) Wed, 04 Mar 2020 13:00:18 +0000 swh-indexer (0.0.163-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.163 - (tagged by Antoine R. Dumont (@ardumont) on 2020-03-04 11:26:56 +0100) * Upstream changes: - v0.0.163 - Make indexers return a summary of their actions - swh.indexer.storage: Add metrics to add/del endpoints - indexer.storage: Make add/del endpoints sum up added objects count - indexer: Remove unused next_step pattern -- Software Heritage autobuilder (on jenkins-debian1) Wed, 04 Mar 2020 10:31:03 +0000 swh-indexer (0.0.162-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.162 - (tagged by Antoine R. Dumont (@ardumont) on 2020-02-27 11:01:29 +0100) * Upstream changes: - v0.0.162 - fossology_license: Improve add query endpoint - pgstorage: Empty temp tables instead of dropping them - indexer.metadata: Fix edge case on unknown origin -- Software Heritage autobuilder (on jenkins-debian1) Thu, 27 Feb 2020 10:09:36 +0000 swh-indexer (0.0.161-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.161 - (tagged by Antoine R. Dumont (@ardumont) on 2020-02-25 12:07:39 +0100) * Upstream changes: - v0.0.161 - sql/128: Add content_mimetype index - storage.db: Improve content range queries to actually finish - Add a new IndexerStorageArgumentException class, for exceptions caused by the client. - Use swh-storage validation proxy. - Fix type errors with hypothesis 5.5 - Add type annotations to indexer classes -- Software Heritage autobuilder (on jenkins-debian1) Tue, 25 Feb 2020 11:20:51 +0000 swh-indexer (0.0.160-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.160 - (tagged by Antoine R. Dumont (@ardumont) on 2020-02-05 18:13:16 +0100) * Upstream changes: - v0.0.160 - Fix missing import -- Software Heritage autobuilder (on jenkins-debian1) Wed, 05 Feb 2020 17:28:18 +0000 swh-indexer (0.0.159-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.159 - (tagged by Antoine R. Dumont (@ardumont) on 2020-02-05 16:01:03 +0100) * Upstream changes: - v0.0.159 - Monkey-patch backend classes instead of 'get_storage' functions. - Fix DeprecationWarning about get_storage args. 
- Move IndexerStorage documentation and endpoint paths to a new IndexerStorageInterface class. - conftest: Use module's `get_` to instantiate backend - docs: Fix sphinx warnings - Fix merge_documents to work with input document with an @id. - Fix support of VCSs whose HEAD branch is an alias. - Fix type of 'author' in gemspec mapping output. - Fix test_origin_metadata mistakenly broken by e50660efca - Fix several typos reported by pre-commit hooks - Add a pre-commit config file - Remove unused property-based test environment - Migrate tox.ini to extras = xxx instead of deps = .[testing] - Merge tox test environments - Drop version constraint on pytest - Include all requirements in MANIFEST.in -- Software Heritage autobuilder (on jenkins-debian1) Wed, 05 Feb 2020 15:09:42 +0000 swh-indexer (0.0.158-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.158 - (tagged by Antoine R. Dumont (@ardumont) on 2019-11-20 10:26:59 +0100) * Upstream changes: - v0.0.158 - Re-enable tests for the in- memory storage. - Truncate result list instead of doing a copy. - journal client: add support for new origin_visit schema. - Fix alter table rename column syntax on 126->127 upgrade script -- Software Heritage autobuilder (on jenkins-debian1) Wed, 20 Nov 2019 09:30:37 +0000 swh-indexer (0.0.157-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.157 - (tagged by Valentin Lorentz on 2019-11-08 16:33:36 +0100) * Upstream changes: - v0.0.157 - * migrate storage tests to pytest - * proper pagination for IndexerStorage.origin_intrinsic_metadata_search_by_producer -- Software Heritage autobuilder (on jenkins-debian1) Fri, 08 Nov 2019 15:36:48 +0000 swh-indexer (0.0.156-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.156 - (tagged by Stefano Zacchiroli on 2019-11-05 17:36:11 +0100) * Upstream changes: - v0.0.156 - * update indexer for storage 0.0.156 - * cli: fix max-message handling in the journal-client command - * tests: fix test_metadata.py for frozen entities in swh.model.model - * tests: update tests for storage>=0.0.155 - * test_metadata typing: use type-specific mappings instead of cast - * storage/db.py: drop unused format arg regconfig from query - * typing: minimal changes to make a no-op mypy run pass -- Software Heritage autobuilder (on jenkins-debian1) Tue, 05 Nov 2019 16:45:10 +0000 swh-indexer (0.0.155-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.155 - (tagged by Valentin Lorentz on 2019-10-15 14:51:28 +0200) * Upstream changes: - v0.0.155 - * Avoid spamming logs with processed %d messages every message - * tox.ini: Fix py3 environment to use packaged tests - * Remove indirection swh.indexer.storage.api.wsgi to start server - * Add a command- line tool to run metadata translation. -- Software Heritage autobuilder (on jenkins-debian1) Tue, 15 Oct 2019 12:55:33 +0000 swh-indexer (0.0.154-1~swh2) unstable-swh; urgency=medium * Force pg_ctl path -- Nicolas Dandrimont Mon, 07 Oct 2019 16:42:08 +0200 swh-indexer (0.0.154-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.154 - (tagged by Nicolas Dandrimont on 2019-10-07 16:34:20 +0200) * Upstream changes: - Release swh.indexer v0.0.154 - Remove old scheduler compat code - Clean up CLI aliases - Port to python-magic instead of file_magic -- Software Heritage autobuilder (on jenkins-debian1) Mon, 07 Oct 2019 14:38:47 +0000 swh-indexer (0.0.153-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.153 - (tagged by Antoine R. 
Dumont (@ardumont) on 2019-09-11 11:46:41 +0200) * Upstream changes: - v0.0.153 - indexer-storage: Send smaller batches to origin_get - Update origin_url/from_revision/metadata_tsvector when conflict_update=True - Remove concept of 'minimal set' of metadata - npm: Fix crash on invalid 'author' field - api/client: use RPCClient instead of deprecated SWHRemoteAPI - api/server: use RPCServerApp instead of deprecated SWHServerAPIApp - tests/utils: Fix various test data model issues failing validation -- Software Heritage autobuilder (on jenkins-debian1) Wed, 11 Sep 2019 09:50:58 +0000 swh-indexer (0.0.152-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.152 - (tagged by Valentin Lorentz on 2019-07-19 11:15:41 +0200) * Upstream changes: - Send smaller batches to revision_get -- Software Heritage autobuilder (on jenkins-debian1) Fri, 19 Jul 2019 09:20:34 +0000 swh-indexer (0.0.151-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.151 - (tagged by Valentin Lorentz on 2019-07-03 17:58:32 +0200) * Upstream changes: - v0.0.151 - Fix key names in the journal client; it crashed in prod. -- Software Heritage autobuilder (on jenkins-debian1) Wed, 03 Jul 2019 16:03:07 +0000 swh-indexer (0.0.150-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.150 - (tagged by Antoine R. Dumont (@ardumont) on 2019-07-03 12:09:43 +0200) * Upstream changes: - v0.0.150 - indexer.cli: Drop unused extra alias `--consumer-id` flag -- Software Heritage autobuilder (on jenkins-debian1) Wed, 03 Jul 2019 10:20:46 +0000 swh-indexer (0.0.149-1~swh2) unstable-swh; urgency=medium * No-change: Bump dependency version -- Antoine R. Dumont (@ardumont) Wed, 03 Jul 2019 10:44:12 +0200 swh-indexer (0.0.149-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.149 - (tagged by Antoine R. Dumont (@ardumont) on 2019-07-02 18:11:12 +0200) * Upstream changes: - v0.0.149 - swh.indexer.cli: Fix get_journal_client api call - sql/upgrades/125: Fix migration script -- Software Heritage autobuilder (on jenkins-debian1) Tue, 02 Jul 2019 16:26:50 +0000 swh-indexer (0.0.148-1~swh3) unstable-swh; urgency=medium * Upstream release 0.0.148: Update version dependency -- Antoine Romain Dumont (@ardumont) Mon, 01 Jul 2019 01:50:29 +0100 swh-indexer (0.0.148-1~swh2) unstable-swh; urgency=medium * Upstream release 0.0.148 -- Antoine Romain Dumont (@ardumont) Mon, 01 Jul 2019 01:50:29 +0100 swh-indexer (0.0.148-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.148 - (tagged by Antoine R. Dumont (@ardumont) on 2019-07-01 12:21:32 +0200) * Upstream changes: - v0.0.148 - Manipulate origin URLs instead of origin ids - journal: create tasks for multiple origins - Tests: Improvements -- Software Heritage autobuilder (on jenkins-debian1) Mon, 01 Jul 2019 10:34:26 +0000 swh-indexer (0.0.147-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.147 - (tagged by Antoine Lambert on 2019-05-23 11:03:02 +0200) * Upstream changes: - version 0.0.147 -- Software Heritage autobuilder (on jenkins-debian1) Thu, 23 May 2019 09:11:05 +0000 swh-indexer (0.0.146-1~swh2) unstable-swh; urgency=medium * Remove hypothesis directory -- Nicolas Dandrimont Thu, 18 Apr 2019 18:29:09 +0200 swh-indexer (0.0.146-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.146 - (tagged by Valentin Lorentz on 2019-04-11 11:08:29 +0200) * Upstream changes: - Better explain what the 'string fields' are. 
-- Software Heritage autobuilder (on jenkins-debian1) Thu, 11 Apr 2019 09:47:24 +0000 swh-indexer (0.0.145-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.145 - (tagged by Valentin Lorentz on 2019-03-15 11:18:25 +0100) * Upstream changes: - Add support for keywords in PKG-INFO. -- Software Heritage autobuilder (on jenkins-debian1) Fri, 15 Mar 2019 11:34:53 +0000 swh-indexer (0.0.144-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.144 - (tagged by Thibault Allançon on 2019-03-07 08:16:49 +0100) * Upstream changes: - Fix heterogeneity of names in metadata tables -- Software Heritage autobuilder (on jenkins-debian1) Thu, 14 Mar 2019 13:30:44 +0000 swh-indexer (0.0.143-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.143 - (tagged by Thibault Allançon on 2019-03-12 10:18:37 +0100) * Upstream changes: - Use hashutil.MultiHash in swh.indexer.tests.test_utils.fill_storage - Summary: Closes T1448 - Reviewers: #reviewers - Subscribers: swh-public-ci - Maniphest Tasks: T1448 - Differential Revision: https://forge.softwareheritage.org/D1235 -- Software Heritage autobuilder (on jenkins-debian1) Wed, 13 Mar 2019 10:24:37 +0000 swh-indexer (0.0.142-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.142 - (tagged by Valentin Lorentz on 2019-03-01 14:19:05 +0100) * Upstream changes: - Skip useless requests. -- Software Heritage autobuilder (on jenkins-debian1) Fri, 01 Mar 2019 13:26:06 +0000 swh-indexer (0.0.141-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.141 - (tagged by Valentin Lorentz on 2019-03-01 10:59:54 +0100) * Upstream changes: - Prevent origin metadata indexer from writing empty records -- Software Heritage autobuilder (on jenkins-debian1) Fri, 01 Mar 2019 10:10:56 +0000 swh-indexer (0.0.140-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.140 - (tagged by Valentin Lorentz on 2019-02-25 10:38:52 +0100) * Upstream changes: - Drop the 'context' and 'type' config of metadata indexers. - They are both ignored already. -- Software Heritage autobuilder (on jenkins-debian1) Mon, 25 Feb 2019 10:40:10 +0000 swh-indexer (0.0.139-1~swh2) unstable-swh; urgency=low * New release fixing debian build -- Antoine Romain Dumont (@ardumont) Fri, 22 Feb 2019 16:27:47 +0100 swh-indexer (0.0.139-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.139 - (tagged by Antoine R. Dumont (@ardumont) on 2019-02-22 15:53:22 +0100) * Upstream changes: - v0.0.139 - Clean up no longer used tasks -- Software Heritage autobuilder (on jenkins-debian1) Fri, 22 Feb 2019 14:59:40 +0000 swh-indexer (0.0.138-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.138 - (tagged by Valentin Lorentz on 2019-02-22 15:30:30 +0100) * Upstream changes: - Make the 'config' argument of OriginMetadaIndexer optional again. -- Software Heritage autobuilder (on jenkins-debian1) Fri, 22 Feb 2019 14:37:35 +0000 swh-indexer (0.0.137-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.137 - (tagged by Antoine R. Dumont (@ardumont) on 2019-02-22 10:59:53 +0100) * Upstream changes: - v0.0.137 - swh.indexer.storage.api.wsgi: Open production wsgi entrypoint - swh.indexer.cli: Move dev app entrypoint in dedicated cli - indexer.storage: Make server load explicit configuration and check - config: use already loaded swh config, if any, when instantiating an Indexer - api: Add support for filtering by tool_id to origin_intrinsic_metadata_search_by_producer. - api: Add storage endpoint to search metadata by mapping. 
- runtime: Remove implicit configuration from the metadata indexers. - debian: Remove debian packaging from master branch - docs: Update missing documentation -- Software Heritage autobuilder (on jenkins-debian1) Fri, 22 Feb 2019 10:11:29 +0000 swh-indexer (0.0.136-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.136 - (tagged by Valentin Lorentz on 2019-02-14 17:09:00 +0100) * Upstream changes: - Don't send 'None' as a revision id to storage.revision_get. - This error wasn't caught before because the in-mem storage - accepts None values, but the pg storage doesn't. -- Software Heritage autobuilder (on jenkins-debian1) Thu, 14 Feb 2019 16:22:41 +0000 swh-indexer (0.0.135-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.135 - (tagged by Valentin Lorentz on 2019-02-14 14:45:24 +0100) * Upstream changes: - Fix deduplication of origins when persisting origin intrinsic metadata. -- Software Heritage autobuilder (on jenkins-debian1) Thu, 14 Feb 2019 14:32:55 +0000 swh-indexer (0.0.134-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.134 - (tagged by Antoine R. Dumont (@ardumont) on 2019-02-13 23:46:44 +0100) * Upstream changes: - v0.0.134 - package: Break dependency of swh.indexer.storage on swh.indexer. - api/server: Do not read configuration at each request - metadata: Fix gemspec test - metadata: Prevent OriginMetadataIndexer from sending duplicate - revisions to revision_metadata_add. - test: Fix bugs found by hypothesis. - test: Use hypothesis to generate adversarial inputs. - Add more type checks in metadata dictionary. - Add checks in the idx_storage that the same content/rev/orig is not - present twice in the new data. -- Software Heritage autobuilder (on jenkins-debian1) Thu, 14 Feb 2019 09:16:15 +0000 swh-indexer (0.0.133-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.133 - (tagged by Antoine R. Dumont (@ardumont) on 2019-02-12 10:28:01 +0100) * Upstream changes: - v0.0.133 - Migrate BaseDB api calls from core to storage - Improve storage api calls using latest storage api - OriginIndexer: Refactoring - tests: Refactoring - metadata search: Use index - indexer metadata: Provide stats per origin - indexer metadata: Update mapping column - indexer metadata: Improve and fix issues -- Software Heritage autobuilder (on jenkins-debian1) Tue, 12 Feb 2019 09:34:43 +0000 swh-indexer (0.0.132-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.132 - (tagged by Antoine R. Dumont (@ardumont) on 2019-01-30 15:03:14 +0100) * Upstream changes: - v0.0.132 - swh/indexer/tasks: Fix range indexer tasks - Maven: Add support for empty XML nodes. - Add support for alternative call format for Gem::Specification.new. -- Software Heritage autobuilder (on jenkins-debian1) Wed, 30 Jan 2019 14:09:48 +0000 swh-indexer (0.0.131-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.131 - (tagged by Antoine R. Dumont (@ardumont) on 2019-01-30 10:56:43 +0100) * Upstream changes: - v0.0.131 - fix pep8 violations - fix misspellings -- Software Heritage autobuilder (on jenkins-debian1) Wed, 30 Jan 2019 10:01:47 +0000 swh-indexer (0.0.129-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.129 - (tagged by Valentin Lorentz on 2019-01-29 14:11:22 +0100) * Upstream changes: - Fix missing config file name change. 
-- Software Heritage autobuilder (on jenkins-debian1) Tue, 29 Jan 2019 13:34:17 +0000 swh-indexer (0.0.128-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.128 - (tagged by Valentin Lorentz on 2019-01-25 15:22:52 +0100) * Upstream changes: - Make metadata indexers store the mappings used to translate metadata. -- Software Heritage autobuilder (on jenkins-debian1) Tue, 29 Jan 2019 12:18:16 +0000 swh-indexer (0.0.127-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.127 - (tagged by Valentin Lorentz on 2019-01-15 15:56:49 +0100) * Upstream changes: - Prevent repository normalization from crashing on malformed input. -- Software Heritage autobuilder (on jenkins-debian1) Tue, 15 Jan 2019 16:20:32 +0000 swh-indexer (0.0.126-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.126 - (tagged by Valentin Lorentz on 2019-01-14 11:42:52 +0100) * Upstream changes: - Don't call OriginHeadIndexer.next_step when there is no revision. -- Software Heritage autobuilder (on jenkins-debian1) Mon, 14 Jan 2019 10:57:34 +0000 swh-indexer (0.0.125-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.125 - (tagged by Antoine R. Dumont (@ardumont) on 2019-01-11 12:01:42 +0100) * Upstream changes: - v0.0.125 - Add journal client that listens for origin visits and schedules - OriginHead - Fix tests to work with the new version of swh.storage -- Software Heritage autobuilder (on jenkins-debian1) Fri, 11 Jan 2019 11:08:51 +0000 swh-indexer (0.0.124-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.124 - (tagged by Antoine R. Dumont (@ardumont) on 2019-01-08 14:09:32 +0100) * Upstream changes: - v0.0.124 - indexer: Fix type check on indexing result -- Software Heritage autobuilder (on jenkins-debian1) Thu, 10 Jan 2019 17:12:07 +0000 swh-indexer (0.0.118-1~swh1) unstable-swh; urgency=medium * v0.0.118 * metadata-indexer: Fix setup initialization * tests: Refactoring -- Antoine R. Dumont (@ardumont) Fri, 30 Nov 2018 14:50:52 +0100 swh-indexer (0.0.67-1~swh1) unstable-swh; urgency=medium * v0.0.67 * mimetype: Migrate to indexed data as text -- Antoine R. Dumont (@ardumont) Wed, 28 Nov 2018 11:35:37 +0100 swh-indexer (0.0.66-1~swh1) unstable-swh; urgency=medium * v0.0.66 * range-indexer: Stream indexing range computations -- Antoine R. Dumont (@ardumont) Tue, 27 Nov 2018 11:48:24 +0100 swh-indexer (0.0.65-1~swh1) unstable-swh; urgency=medium * v0.0.65 * Fix revision metadata indexer -- Antoine R. Dumont (@ardumont) Mon, 26 Nov 2018 19:30:48 +0100 swh-indexer (0.0.64-1~swh1) unstable-swh; urgency=medium * v0.0.64 * indexer: Fix mixed identifier encodings issues * Add missing config filename for origin intrinsic metadata indexer. -- Antoine R. Dumont (@ardumont) Mon, 26 Nov 2018 12:20:01 +0100 swh-indexer (0.0.63-1~swh1) unstable-swh; urgency=medium * v0.0.63 * Make the OriginMetadataIndexer fetch rev metadata from the storage * instead of getting them via the scheduler. * Make the 'result_name' key of 'next_step' optional. * Add missing return. * doc: update index to match new swh-doc format -- Antoine R. Dumont (@ardumont) Fri, 23 Nov 2018 17:56:10 +0100 swh-indexer (0.0.62-1~swh1) unstable-swh; urgency=medium * v0.0.62 * metadata indexer: Add empty tool configuration * Add fulltext search on origin intrinsic metadata -- Antoine R. Dumont (@ardumont) Fri, 23 Nov 2018 14:25:55 +0100 swh-indexer (0.0.61-1~swh1) unstable-swh; urgency=medium * v0.0.61 * indexer: Fix origin indexer's default arguments -- Antoine R. 
Dumont (@ardumont) Wed, 21 Nov 2018 16:01:50 +0100 swh-indexer (0.0.60-1~swh1) unstable-swh; urgency=medium * v0.0.60 * origin_head: Make next step optional * tests: Increase coverage -- Antoine R. Dumont (@ardumont) Wed, 21 Nov 2018 12:33:13 +0100 swh-indexer (0.0.59-1~swh1) unstable-swh; urgency=medium * v0.0.59 * fossology license: Fix issue on license computation * Improve docstrings * Fix pep8 violations * Increase coverage on content indexers -- Antoine R. Dumont (@ardumont) Tue, 20 Nov 2018 14:27:20 +0100 swh-indexer (0.0.58-1~swh1) unstable-swh; urgency=medium * v0.0.58 * Add missing default configuration for fossology license indexer * tests: Remove dead code -- Antoine R. Dumont (@ardumont) Tue, 20 Nov 2018 12:06:56 +0100 swh-indexer (0.0.57-1~swh1) unstable-swh; urgency=medium * v0.0.57 * storage: Open new endpoint on fossology license range retrieval * indexer: Open new fossology license range indexer -- Antoine R. Dumont (@ardumont) Tue, 20 Nov 2018 11:44:57 +0100 swh-indexer (0.0.56-1~swh1) unstable-swh; urgency=medium * v0.0.56 * storage.api: Open new endpoints (mimetype range, fossology range) * content indexers: Open mimetype and fossology range indexers * Remove orchestrator modules * tests: Improve coverage -- Antoine R. Dumont (@ardumont) Mon, 19 Nov 2018 11:56:06 +0100 swh-indexer (0.0.55-1~swh1) unstable-swh; urgency=medium * v0.0.55 * swh.indexer: Let task reschedule itself through the scheduler * Use swh.scheduler instead of celery leaking all around * swh.indexer.orchestrator: Fix orchestrator initialization step * swh.indexer.tasks: Fix type error when no result or list result -- Antoine R. Dumont (@ardumont) Mon, 29 Oct 2018 10:41:54 +0100 swh-indexer (0.0.54-1~swh1) unstable-swh; urgency=medium * v0.0.54 * swh.indexer.tasks: Fix task to use the scheduler's -- Antoine R. Dumont (@ardumont) Thu, 25 Oct 2018 20:13:51 +0200 swh-indexer (0.0.53-1~swh1) unstable-swh; urgency=medium * v0.0.53 * swh.indexer.rehash: Migrate to latest swh.model.hashutil.MultiHash * indexer: Add the origin intrinsic metadata indexer * indexer: Add OriginIndexer and OriginHeadIndexer. * indexer.storage: Add the origin intrinsic metadata storage database * indexer.storage: Autogenerate the Indexer Storage HTTP API. * setup: prepare for pypi upload * tests: Add a tox file * tests: migrate to pytest * tests: Add tests around celery stack * docs: Improve documentation and reuse README in generated documentation -- Antoine R. Dumont (@ardumont) Thu, 25 Oct 2018 19:03:56 +0200 swh-indexer (0.0.52-1~swh1) unstable-swh; urgency=medium * v0.0.52 * swh.indexer.storage: Refactor fossology license get (first external * contribution, cf. /CONTRIBUTORS) * swh.indexer.storage: Fix typo in invariable name metadata * swh.indexer.storage: No longer use temp table when reading data * swh.indexer.storage: Clean up unused import * swh.indexer.storage: Remove dead entry points origin_metadata* * swh.indexer.storage: Update docstrings information and format -- Antoine R. Dumont (@ardumont) Wed, 13 Jun 2018 11:20:40 +0200 swh-indexer (0.0.51-1~swh1) unstable-swh; urgency=medium * Release swh.indexer v0.0.51 * Update for new db_transaction{,_generator} -- Nicolas Dandrimont Tue, 05 Jun 2018 14:10:39 +0200 swh-indexer (0.0.50-1~swh1) unstable-swh; urgency=medium * v0.0.50 * swh.indexer.api.client: Permit to specify the query timeout option -- Antoine R. 
Dumont (@ardumont) Thu, 24 May 2018 12:19:06 +0200 swh-indexer (0.0.49-1~swh1) unstable-swh; urgency=medium * v0.0.49 * test_storage: Instantiate the tools during tests' setUp phase * test_storage: Deallocate storage during teardown step * test_storage: Make storage test fixture connect to postgres itself * storage.api.server: Only instantiate storage backend once per import * Use thread-aware psycopg2 connection pooling for database access -- Antoine R. Dumont (@ardumont) Mon, 14 May 2018 11:09:30 +0200 swh-indexer (0.0.48-1~swh1) unstable-swh; urgency=medium * Release swh.indexer v0.0.48 * Update for new swh.storage -- Nicolas Dandrimont Sat, 12 May 2018 18:30:10 +0200 swh-indexer (0.0.47-1~swh1) unstable-swh; urgency=medium * v0.0.47 * d/control: Fix runtime typo in packaging dependency -- Antoine R. Dumont (@ardumont) Thu, 07 Dec 2017 16:54:49 +0100 swh-indexer (0.0.46-1~swh1) unstable-swh; urgency=medium * v0.0.46 * Split swh-indexer packages in 2 python3-swh.indexer.storage and * python3-swh.indexer -- Antoine R. Dumont (@ardumont) Thu, 07 Dec 2017 16:18:04 +0100 swh-indexer (0.0.45-1~swh1) unstable-swh; urgency=medium * v0.0.45 * Fix usual error raised when deploying -- Antoine R. Dumont (@ardumont) Thu, 07 Dec 2017 15:01:01 +0100 swh-indexer (0.0.44-1~swh1) unstable-swh; urgency=medium * v0.0.44 * swh.indexer: Make indexer use their own storage -- Antoine R. Dumont (@ardumont) Thu, 07 Dec 2017 13:20:44 +0100 swh-indexer (0.0.43-1~swh1) unstable-swh; urgency=medium * v0.0.43 * swh.indexer.mimetype: Work around problem in detection -- Antoine R. Dumont (@ardumont) Wed, 29 Nov 2017 10:26:11 +0100 swh-indexer (0.0.42-1~swh1) unstable-swh; urgency=medium * v0.0.42 * swh.indexer: Make indexers register tools in prepare method -- Antoine R. Dumont (@ardumont) Fri, 24 Nov 2017 11:26:03 +0100 swh-indexer (0.0.41-1~swh1) unstable-swh; urgency=medium * v0.0.41 * mimetype: Use magic library api instead of parsing `file` cli output -- Antoine R. Dumont (@ardumont) Mon, 20 Nov 2017 13:05:29 +0100 swh-indexer (0.0.39-1~swh1) unstable-swh; urgency=medium * v0.0.39 * swh.indexer.producer: Fix argument to match the abstract definition -- Antoine R. Dumont (@ardumont) Thu, 19 Oct 2017 10:03:44 +0200 swh-indexer (0.0.38-1~swh1) unstable-swh; urgency=medium * v0.0.38 * swh.indexer.indexer: Fix argument to match the abstract definition -- Antoine R. Dumont (@ardumont) Wed, 18 Oct 2017 19:57:47 +0200 swh-indexer (0.0.37-1~swh1) unstable-swh; urgency=medium * v0.0.37 * swh.indexer.indexer: Fix argument to match the abstract definition -- Antoine R. Dumont (@ardumont) Wed, 18 Oct 2017 18:59:42 +0200 swh-indexer (0.0.36-1~swh1) unstable-swh; urgency=medium * v0.0.36 * packaging: Cleanup * codemeta: Adding codemeta.json file to document metadata * swh.indexer.mimetype: Fix edge case regarding empty raw content * docs: sanitize docstrings for sphinx documentation generation * swh.indexer.metadata: Add RevisionMetadataIndexer * swh.indexer.metadata: Add ContentMetadataIndexer * swh.indexer: Refactor base class to improve inheritance * swh.indexer.metadata: First draft of the metadata content indexer * for npm (package.json) * swh.indexer.tests: Added tests for language indexer -- Antoine R. 
Dumont (@ardumont) Wed, 18 Oct 2017 16:24:24 +0200 swh-indexer (0.0.35-1~swh1) unstable-swh; urgency=medium * Release swh.indexer 0.0.35 * Update tasks to new swh.scheduler API -- Nicolas Dandrimont Mon, 12 Jun 2017 18:02:04 +0200 swh-indexer (0.0.34-1~swh1) unstable-swh; urgency=medium * v0.0.34 * Fix unbound local error on edge case -- Antoine R. Dumont (@ardumont) Wed, 07 Jun 2017 11:23:29 +0200 swh-indexer (0.0.33-1~swh1) unstable-swh; urgency=medium * v0.0.33 * language indexer: Improve edge case policy -- Antoine R. Dumont (@ardumont) Wed, 07 Jun 2017 11:02:47 +0200 swh-indexer (0.0.32-1~swh1) unstable-swh; urgency=medium * v0.0.32 * Update fossology license to use the latest swh-storage * Improve language indexer to deal with potential error on bad * chunking -- Antoine R. Dumont (@ardumont) Tue, 06 Jun 2017 18:13:40 +0200 swh-indexer (0.0.31-1~swh1) unstable-swh; urgency=medium * v0.0.31 * Reduce log verbosity on language indexer -- Antoine R. Dumont (@ardumont) Fri, 02 Jun 2017 19:08:52 +0200 swh-indexer (0.0.30-1~swh1) unstable-swh; urgency=medium * v0.0.30 * Fix wrong default configuration -- Antoine R. Dumont (@ardumont) Fri, 02 Jun 2017 18:01:27 +0200 swh-indexer (0.0.29-1~swh1) unstable-swh; urgency=medium * v0.0.29 * Update indexer to resolve indexer configuration identifier * Adapt language indexer to use partial raw content -- Antoine R. Dumont (@ardumont) Fri, 02 Jun 2017 16:21:27 +0200 swh-indexer (0.0.28-1~swh1) unstable-swh; urgency=medium * v0.0.28 * Add error resilience to fossology indexer -- Antoine R. Dumont (@ardumont) Mon, 22 May 2017 12:57:55 +0200 swh-indexer (0.0.27-1~swh1) unstable-swh; urgency=medium * v0.0.27 * swh.indexer.language: Incremental encoding detection -- Antoine R. Dumont (@ardumont) Wed, 17 May 2017 18:04:27 +0200 swh-indexer (0.0.26-1~swh1) unstable-swh; urgency=medium * v0.0.26 * swh.indexer.orchestrator: Add batch size option per indexer * Log caught exception in a unified manner * Add rescheduling option (not by default) on rehash + indexers -- Antoine R. Dumont (@ardumont) Wed, 17 May 2017 14:08:07 +0200 swh-indexer (0.0.25-1~swh1) unstable-swh; urgency=medium * v0.0.25 * Add reschedule on error parameter for indexers -- Antoine R. Dumont (@ardumont) Fri, 12 May 2017 12:13:15 +0200 swh-indexer (0.0.24-1~swh1) unstable-swh; urgency=medium * v0.0.24 * Make rehash indexer more resilient to errors by rescheduling contents * in error (be it reading or updating problems) -- Antoine R. Dumont (@ardumont) Thu, 04 May 2017 14:22:43 +0200 swh-indexer (0.0.23-1~swh1) unstable-swh; urgency=medium * v0.0.23 * Improve producer to optionally make it synchronous -- Antoine R. Dumont (@ardumont) Wed, 03 May 2017 15:29:44 +0200 swh-indexer (0.0.22-1~swh1) unstable-swh; urgency=medium * v0.0.22 * Improve mimetype indexer implementation * Make the chaining option in the mimetype indexer -- Antoine R. Dumont (@ardumont) Tue, 02 May 2017 16:31:14 +0200 swh-indexer (0.0.21-1~swh1) unstable-swh; urgency=medium * v0.0.21 * swh.indexer.rehash: Actually make the worker log -- Antoine R. Dumont (@ardumont) Tue, 02 May 2017 14:28:55 +0200 swh-indexer (0.0.20-1~swh1) unstable-swh; urgency=medium * v0.0.20 * swh.indexer.rehash: * Improve reading from objstorage only when needed * Fix empty file use case (which was skipped) * Add logging -- Antoine R. Dumont (@ardumont) Fri, 28 Apr 2017 09:39:09 +0200 swh-indexer (0.0.19-1~swh1) unstable-swh; urgency=medium * v0.0.19 * Fix rehash indexer's default configuration file -- Antoine R. 
Dumont (@ardumont) Thu, 27 Apr 2017 19:17:20 +0200 swh-indexer (0.0.18-1~swh1) unstable-swh; urgency=medium * v0.0.18 * Add new rehash indexer -- Antoine R. Dumont (@ardumont) Wed, 26 Apr 2017 15:23:02 +0200 swh-indexer (0.0.17-1~swh1) unstable-swh; urgency=medium * v0.0.17 * Add information on indexer tools (T610) -- Antoine R. Dumont (@ardumont) Fri, 02 Dec 2016 18:32:54 +0100 swh-indexer (0.0.16-1~swh1) unstable-swh; urgency=medium * v0.0.16 * bug fixes -- Antoine R. Dumont (@ardumont) Tue, 15 Nov 2016 19:31:52 +0100 swh-indexer (0.0.15-1~swh1) unstable-swh; urgency=medium * v0.0.15 * Improve message producer -- Antoine R. Dumont (@ardumont) Tue, 15 Nov 2016 18:16:42 +0100 swh-indexer (0.0.14-1~swh1) unstable-swh; urgency=medium * v0.0.14 * Update package dependency on fossology-nomossa -- Antoine R. Dumont (@ardumont) Tue, 15 Nov 2016 14:13:41 +0100 swh-indexer (0.0.13-1~swh1) unstable-swh; urgency=medium * v0.0.13 * Add new license indexer * ctags indexer: align behavior with other indexers regarding the * conflict update policy -- Antoine R. Dumont (@ardumont) Mon, 14 Nov 2016 14:13:34 +0100 swh-indexer (0.0.12-1~swh1) unstable-swh; urgency=medium * v0.0.12 * Add runtime dependency on universal-ctags -- Antoine R. Dumont (@ardumont) Fri, 04 Nov 2016 13:59:59 +0100 swh-indexer (0.0.11-1~swh1) unstable-swh; urgency=medium * v0.0.11 * Remove dependency on exuberant-ctags -- Antoine R. Dumont (@ardumont) Thu, 03 Nov 2016 16:13:26 +0100 swh-indexer (0.0.10-1~swh1) unstable-swh; urgency=medium * v0.0.10 * Add ctags indexer -- Antoine R. Dumont (@ardumont) Thu, 20 Oct 2016 16:12:42 +0200 swh-indexer (0.0.9-1~swh1) unstable-swh; urgency=medium * v0.0.9 * d/control: Bump dependency to latest python3-swh.storage api * mimetype: Use the charset to filter out data * orchestrator: Separate 2 distincts orchestrators (one for all * contents, one for text contents) * mimetype: once index computed, send text contents to text orchestrator -- Antoine R. Dumont (@ardumont) Thu, 13 Oct 2016 15:28:17 +0200 swh-indexer (0.0.8-1~swh1) unstable-swh; urgency=medium * v0.0.8 * Separate configuration file per indexer (no need for language) * Rename module file_properties to mimetype consistently with other * layers -- Antoine R. Dumont (@ardumont) Sat, 08 Oct 2016 11:46:29 +0200 swh-indexer (0.0.7-1~swh1) unstable-swh; urgency=medium * v0.0.7 * Adapt indexer language and mimetype to store result in storage. * Clean up obsolete code -- Antoine R. Dumont (@ardumont) Sat, 08 Oct 2016 10:26:08 +0200 swh-indexer (0.0.6-1~swh1) unstable-swh; urgency=medium * v0.0.6 * Fix multiple issues on production -- Antoine R. Dumont (@ardumont) Fri, 30 Sep 2016 17:00:11 +0200 swh-indexer (0.0.5-1~swh1) unstable-swh; urgency=medium * v0.0.5 * Fix debian/control dependency issue -- Antoine R. Dumont (@ardumont) Fri, 30 Sep 2016 16:06:20 +0200 swh-indexer (0.0.4-1~swh1) unstable-swh; urgency=medium * v0.0.4 * Upgrade dependencies issues -- Antoine R. Dumont (@ardumont) Fri, 30 Sep 2016 16:01:52 +0200 swh-indexer (0.0.3-1~swh1) unstable-swh; urgency=medium * v0.0.3 * Add encoding detection * Use encoding to improve language detection * bypass language detection for binary files * bypass ctags for binary files or decoding failure file -- Antoine R. Dumont (@ardumont) Fri, 30 Sep 2016 12:30:11 +0200 swh-indexer (0.0.2-1~swh1) unstable-swh; urgency=medium * v0.0.2 * Provide one possible sha1's name for the multiple tools to ease * information extrapolation * Fix debian package dependency issue -- Antoine R. 
Dumont (@ardumont) Thu, 29 Sep 2016 21:45:44 +0200 swh-indexer (0.0.1-1~swh1) unstable-swh; urgency=medium * Initial release * v0.0.1 * First implementation on poc -- Antoine R. Dumont (@ardumont) Wed, 28 Sep 2016 23:40:13 +0200 diff --git a/swh.indexer.egg-info/PKG-INFO b/swh.indexer.egg-info/PKG-INFO index ee4435e..2c3b3bd 100644 --- a/swh.indexer.egg-info/PKG-INFO +++ b/swh.indexer.egg-info/PKG-INFO @@ -1,71 +1,71 @@ Metadata-Version: 2.1 Name: swh.indexer -Version: 2.5.0 +Version: 2.6.0 Summary: Software Heritage Content Indexer Home-page: https://forge.softwareheritage.org/diffusion/78/ Author: Software Heritage developers Author-email: swh-devel@inria.fr Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-indexer Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-indexer/ Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing License-File: LICENSE License-File: AUTHORS swh-indexer ============ Tools to compute multiple indexes on SWH's raw contents: - content: - mimetype - ctags - language - fossology-license - metadata - revision: - metadata An indexer is in charge of: - looking up objects - extracting information from those objects - storing that information in the swh-indexer db There are multiple indexers working on different object types: - content indexer: works with content sha1 hashes - revision indexer: works with revision sha1 hashes - origin indexer: works with origin identifiers Indexation procedure: - receive a batch of ids - retrieve the associated data depending on the object type - compute some index for that object - store the result in swh's storage Current content indexers: - mimetype (queue swh_indexer_content_mimetype): detect the encoding and mimetype - language (queue swh_indexer_content_language): detect the programming language - ctags (queue swh_indexer_content_ctags): compute tags information - fossology-license (queue swh_indexer_fossology_license): compute the license - metadata: translate a file into a translated_metadata dict Current revision indexers: - metadata: detects files containing metadata and retrieves translated_metadata from the content_metadata table in storage, or runs the content indexer to translate files.
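The indexation procedure summarized in the README above (receive a batch of ids, retrieve the associated data, compute some index, store the result) can be illustrated with a minimal sketch. This is not the actual swh.indexer implementation: `OBJSTORAGE` and `index_batch` are hypothetical stand-ins introduced only for illustration; the one real dependency used is python-magic (`magic.from_buffer`), which the changelog notes the mimetype indexer was ported to.

```python
# Hedged sketch of one pass of a mimetype-like content indexer.
# OBJSTORAGE and index_batch are hypothetical stand-ins, not swh.indexer APIs.
from typing import Dict, Iterable, List

import magic  # python-magic

# Stand-in for the object storage lookup (content id -> raw bytes).
OBJSTORAGE: Dict[bytes, bytes] = {
    b"id-1": b"#!/usr/bin/env python3\nprint('hello')\n",
}


def index_batch(ids: Iterable[bytes]) -> List[Dict[str, object]]:
    results = []
    for content_id in ids:                 # 1. receive a batch of ids
        data = OBJSTORAGE[content_id]      # 2. retrieve the associated data
        results.append(                    # 3. compute some index for that object
            {
                "id": content_id,
                "mimetype": magic.from_buffer(data, mime=True),
            }
        )
    return results                         # 4. storing the result is left to the caller


if __name__ == "__main__":
    print(index_batch([b"id-1"]))
```

In the real indexers, the lookup goes through the object storage and the computed rows are written to the indexer storage rather than returned to the caller.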
diff --git a/swh/indexer/cli.py b/swh/indexer/cli.py index 2c7318f..939b4b1 100644 --- a/swh/indexer/cli.py +++ b/swh/indexer/cli.py @@ -1,407 +1,408 @@ # Copyright (C) 2019-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from typing import Callable, Dict, Iterator, List, Optional # WARNING: do not import unnecessary things here to keep cli startup time under # control import click from swh.core.cli import CONTEXT_SETTINGS, AliasedGroup from swh.core.cli import swh as swh_cli_group @swh_cli_group.group( name="indexer", context_settings=CONTEXT_SETTINGS, cls=AliasedGroup ) @click.option( "--config-file", "-C", default=None, type=click.Path( exists=True, dir_okay=False, ), help="Configuration file.", ) @click.pass_context def indexer_cli_group(ctx, config_file): """Software Heritage Indexer tools. The Indexer is used to mine the content of the archive and extract derived information from archive source code artifacts. """ from swh.core import config ctx.ensure_object(dict) conf = config.read(config_file) ctx.obj["config"] = conf def _get_api(getter, config, config_key, url): if url: config[config_key] = {"cls": "remote", "url": url} elif config_key not in config: raise click.ClickException("Missing configuration for {}".format(config_key)) return getter(**config[config_key]) @indexer_cli_group.group("mapping") def mapping(): """Manage Software Heritage Indexer mappings.""" pass @mapping.command("list") def mapping_list(): """Prints the list of known mappings.""" from swh.indexer import metadata_dictionary mapping_names = [mapping.name for mapping in metadata_dictionary.MAPPINGS.values()] mapping_names.sort() for mapping_name in mapping_names: click.echo(mapping_name) @mapping.command("list-terms") @click.option( "--exclude-mapping", multiple=True, help="Exclude the given mapping from the output" ) @click.option( "--concise", is_flag=True, default=False, help="Don't print the list of mappings supporting each term.", ) def mapping_list_terms(concise, exclude_mapping): """Prints the list of known CodeMeta terms, and which mappings support them.""" from swh.indexer import metadata_dictionary properties = metadata_dictionary.list_terms() for (property_name, supported_mappings) in sorted(properties.items()): supported_mappings = {m.name for m in supported_mappings} supported_mappings -= set(exclude_mapping) if supported_mappings: if concise: click.echo(property_name) else: click.echo("{}:".format(property_name)) click.echo("\t" + ", ".join(sorted(supported_mappings))) @mapping.command("translate") @click.argument("mapping-name") @click.argument("file", type=click.File("rb")) def mapping_translate(mapping_name, file): """Translates file from mapping-name to codemeta format.""" import json from swh.indexer import metadata_dictionary mapping_cls = [ cls for cls in metadata_dictionary.MAPPINGS.values() if cls.name == mapping_name ] if not mapping_cls: raise click.ClickException("Unknown mapping {}".format(mapping_name)) assert len(mapping_cls) == 1 mapping_cls = mapping_cls[0] mapping = mapping_cls() codemeta_doc = mapping.translate(file.read()) click.echo(json.dumps(codemeta_doc, indent=4)) @indexer_cli_group.group("schedule") @click.option("--scheduler-url", "-s", default=None, help="URL of the scheduler API") @click.option( "--indexer-storage-url", "-i", default=None, help="URL of the indexer storage API" ) 
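# (The scheduler, storage and indexer storage handles used by this command group
#  are resolved through _get_api() above: when one of the --*-url options is given,
#  a {"cls": "remote", "url": ...} entry is built for it on the fly; otherwise the
#  corresponding key must already be present in the configuration file, or a
#  ClickException is raised.)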
@click.option( "--storage-url", "-g", default=None, help="URL of the (graph) storage API" ) @click.option( "--dry-run/--no-dry-run", is_flag=True, default=False, help="List only what would be scheduled.", ) @click.pass_context def schedule(ctx, scheduler_url, storage_url, indexer_storage_url, dry_run): """Manipulate Software Heritage Indexer tasks. Via SWH Scheduler's API.""" from swh.indexer.storage import get_indexer_storage from swh.scheduler import get_scheduler from swh.storage import get_storage ctx.obj["indexer_storage"] = _get_api( get_indexer_storage, ctx.obj["config"], "indexer_storage", indexer_storage_url ) ctx.obj["storage"] = _get_api( get_storage, ctx.obj["config"], "storage", storage_url ) ctx.obj["scheduler"] = _get_api( get_scheduler, ctx.obj["config"], "scheduler", scheduler_url ) if dry_run: ctx.obj["scheduler"] = None def list_origins_by_producer(idx_storage, mappings, tool_ids) -> Iterator[str]: next_page_token = "" limit = 10000 while next_page_token is not None: result = idx_storage.origin_intrinsic_metadata_search_by_producer( page_token=next_page_token, limit=limit, ids_only=True, mappings=mappings or None, tool_ids=tool_ids or None, ) next_page_token = result.next_page_token yield from result.results @schedule.command("reindex_origin_metadata") @click.option( "--batch-size", "-b", "origin_batch_size", default=10, show_default=True, type=int, help="Number of origins per task", ) @click.option( "--tool-id", "-t", "tool_ids", type=int, multiple=True, help="Restrict search of old metadata to this/these tool ids.", ) @click.option( "--mapping", "-m", "mappings", multiple=True, help="Mapping(s) that should be re-scheduled (eg. 'npm', 'gemspec', 'maven')", ) @click.option( "--task-type", default="index-origin-metadata", show_default=True, help="Name of the task type to schedule.", ) @click.pass_context def schedule_origin_metadata_reindex( ctx, origin_batch_size, tool_ids, mappings, task_type ): """Schedules indexing tasks for origins that were already indexed.""" from swh.scheduler.cli_utils import schedule_origin_batches idx_storage = ctx.obj["indexer_storage"] scheduler = ctx.obj["scheduler"] origins = list_origins_by_producer(idx_storage, mappings, tool_ids) kwargs = {"retries_left": 1} schedule_origin_batches(scheduler, task_type, origins, origin_batch_size, kwargs) @indexer_cli_group.command("journal-client") @click.argument( "indexer", type=click.Choice( [ "origin_intrinsic_metadata", "extrinsic_metadata", "content_mimetype", "content_fossology_license", "*", ] ), required=False # TODO: remove required=False after we stop using it ) @click.option("--scheduler-url", "-s", default=None, help="URL of the scheduler API") @click.option( "--origin-metadata-task-type", default="index-origin-metadata", help="Name of the task running the origin metadata indexer.", ) @click.option( "--broker", "brokers", type=str, multiple=True, help="Kafka broker to connect to." ) @click.option( "--prefix", type=str, default=None, help="Prefix of Kafka topic names to read from." ) @click.option("--group-id", type=str, help="Consumer/group id for reading from Kafka.") @click.option( "--stop-after-objects", "-m", default=None, type=int, help="Maximum number of objects to replay. Default is to run forever.", ) @click.option( "--batch-size", "-b", default=None, type=int, help="Batch size. 
Default is 200.", ) @click.pass_context def journal_client( ctx, indexer: Optional[str], scheduler_url: str, origin_metadata_task_type: str, brokers: List[str], prefix: str, group_id: str, stop_after_objects: Optional[int], batch_size: Optional[int], ): """ Listens for new objects from the SWH Journal, and either: * runs the indexer with the name passed as argument, if any * schedules tasks to run relevant indexers (currently, only origin_intrinsic_metadata) on these new objects otherwise. Passing '*' as indexer name runs all indexers. """ import functools import warnings from swh.indexer.indexer import BaseIndexer, ObjectsDict from swh.indexer.journal_client import process_journal_objects from swh.journal.client import get_journal_client from swh.scheduler import get_scheduler cfg = ctx.obj["config"] journal_cfg = cfg.get("journal", {}) scheduler = _get_api(get_scheduler, cfg, "scheduler", scheduler_url) - brokers = brokers or journal_cfg.get("brokers") - if not brokers: + if brokers: + journal_cfg["brokers"] = brokers + if not journal_cfg.get("brokers"): raise ValueError("The brokers configuration is mandatory.") - prefix = prefix or journal_cfg.get("prefix") - group_id = group_id or journal_cfg.get("group_id") + if prefix: + journal_cfg["prefix"] = prefix + if group_id: + journal_cfg["group_id"] = group_id origin_metadata_task_type = origin_metadata_task_type or journal_cfg.get( "origin_metadata_task_type" ) - stop_after_objects = stop_after_objects or journal_cfg.get("stop_after_objects") - batch_size = batch_size or journal_cfg.get("batch_size", 200) + if stop_after_objects: + journal_cfg["stop_after_objects"] = stop_after_objects + if batch_size: + journal_cfg["batch_size"] = batch_size object_types = set() worker_fns: List[Callable[[ObjectsDict], Dict]] = [] if indexer is None: warnings.warn( "'swh indexer journal-client' with no argument creates scheduler tasks " "to index, rather than index directly.", DeprecationWarning, ) object_types.add("origin_visit_status") worker_fns.append( functools.partial( process_journal_objects, scheduler=scheduler, task_names={ "origin_metadata": origin_metadata_task_type, }, ) ) idx: Optional[BaseIndexer] = None if indexer in ("origin_intrinsic_metadata", "*"): from swh.indexer.metadata import OriginMetadataIndexer object_types.add("origin_visit_status") idx = OriginMetadataIndexer() idx.catch_exceptions = False # don't commit offsets if indexation failed worker_fns.append(idx.process_journal_objects) if indexer in ("extrinsic_metadata", "*"): from swh.indexer.metadata import ExtrinsicMetadataIndexer object_types.add("raw_extrinsic_metadata") idx = ExtrinsicMetadataIndexer() idx.catch_exceptions = False # don't commit offsets if indexation failed worker_fns.append(idx.process_journal_objects) if indexer in ("content_mimetype", "*"): from swh.indexer.mimetype import MimetypeIndexer object_types.add("content") idx = MimetypeIndexer() idx.catch_exceptions = False # don't commit offsets if indexation failed worker_fns.append(idx.process_journal_objects) if indexer in ("content_fossology_license", "*"): from swh.indexer.fossology_license import FossologyLicenseIndexer object_types.add("content") idx = FossologyLicenseIndexer() idx.catch_exceptions = False # don't commit offsets if indexation failed worker_fns.append(idx.process_journal_objects) if not worker_fns: raise click.ClickException(f"Unknown indexer: {indexer}") client = get_journal_client( cls="kafka", - brokers=brokers, - prefix=prefix, - group_id=group_id, object_types=list(object_types), - 
stop_after_objects=stop_after_objects, - batch_size=batch_size, + **journal_cfg, ) def worker_fn(objects: ObjectsDict): for fn in worker_fns: fn(objects) try: client.process(worker_fn) except KeyboardInterrupt: ctx.exit(0) else: print("Done.") finally: client.close() @indexer_cli_group.command("rpc-serve") @click.argument("config-path", required=True) @click.option("--host", default="0.0.0.0", help="Host to run the server") @click.option("--port", default=5007, type=click.INT, help="Binding port of the server") @click.option( "--debug/--nodebug", default=True, help="Indicates if the server should run in debug mode", ) def rpc_server(config_path, host, port, debug): """Starts a Software Heritage Indexer RPC HTTP server.""" from swh.indexer.storage.api.server import app, load_and_check_config api_cfg = load_and_check_config(config_path, type="any") app.config.update(api_cfg) app.run(host, port=int(port), debug=bool(debug)) def main(): return indexer_cli_group(auto_envvar_prefix="SWH_INDEXER") if __name__ == "__main__": main() diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py index 566ab98..d9b3eb3 100644 --- a/swh/indexer/metadata.py +++ b/swh/indexer/metadata.py @@ -1,545 +1,561 @@ # Copyright (C) 2017-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from copy import deepcopy +import itertools +import logging +import time from typing import ( Any, Callable, Dict, Iterable, Iterator, List, Optional, Tuple, TypeVar, cast, ) from urllib.parse import urlparse import sentry_sdk from swh.core.config import merge_configs from swh.core.utils import grouper from swh.indexer.codemeta import merge_documents from swh.indexer.indexer import ( BaseIndexer, ContentIndexer, DirectoryIndexer, ObjectsDict, OriginIndexer, ) from swh.indexer.metadata_detector import detect_metadata from swh.indexer.metadata_dictionary import EXTRINSIC_MAPPINGS, INTRINSIC_MAPPINGS from swh.indexer.metadata_dictionary.base import DirectoryLsEntry from swh.indexer.origin_head import get_head_swhid from swh.indexer.storage import INDEXER_CFG_KEY, Sha1 from swh.indexer.storage.model import ( ContentMetadataRow, DirectoryIntrinsicMetadataRow, OriginExtrinsicMetadataRow, OriginIntrinsicMetadataRow, ) from swh.model import hashutil from swh.model.model import Directory, MetadataAuthorityType from swh.model.model import ObjectType as ModelObjectType from swh.model.model import Origin, RawExtrinsicMetadata, Sha1Git from swh.model.swhids import CoreSWHID, ExtendedObjectType, ObjectType REVISION_GET_BATCH_SIZE = 10 RELEASE_GET_BATCH_SIZE = 10 ORIGIN_GET_BATCH_SIZE = 10 T1 = TypeVar("T1") T2 = TypeVar("T2") +logger = logging.getLogger(__name__) + def call_with_batches( f: Callable[[List[T1]], Iterable[T2]], args: List[T1], batch_size: int, ) -> Iterator[T2]: """Calls a function with batches of args, and concatenates the results.""" groups = grouper(args, batch_size) for group in groups: yield from f(list(group)) class ExtrinsicMetadataIndexer( BaseIndexer[Sha1Git, RawExtrinsicMetadata, OriginExtrinsicMetadataRow] ): def process_journal_objects(self, objects: ObjectsDict) -> Dict: summary: Dict[str, Any] = {"status": "uneventful"} try: - results = [] + results = {} for item in objects.get("raw_extrinsic_metadata", []): remd = RawExtrinsicMetadata.from_dict(item) - sentry_sdk.set_tag("swh-indexer-remd-swhid", remd.swhid()) - 
results.extend(self.index(remd.id, data=remd)) + sentry_sdk.set_tag("swh-indexer-remd-swhid", str(remd.swhid())) + results[remd.target] = self.index(remd.id, data=remd) except Exception: if not self.catch_exceptions: raise summary["status"] = "failed" return summary - summary_persist = self.persist_index_computations(results) - self.results = results + self.results = list(itertools.chain.from_iterable(results.values())) + summary_persist = self.persist_index_computations(self.results) if summary_persist: for value in summary_persist.values(): if value > 0: summary["status"] = "eventful" summary.update(summary_persist) return summary def index( self, id: Sha1Git, data: Optional[RawExtrinsicMetadata], **kwargs, ) -> List[OriginExtrinsicMetadataRow]: if data is None: raise NotImplementedError( "ExtrinsicMetadataIndexer.index() without RawExtrinsicMetadata data" ) if data.target.object_type != ExtendedObjectType.ORIGIN: # other types are not supported yet return [] if data.authority.type != MetadataAuthorityType.FORGE: # metadata provided by a third-party; don't trust it # (technically this could be handled below, but we check it here # to return early; sparing a translation and origin lookup) # TODO: add ways to define trusted authorities return [] metadata_items = [] mappings: List[str] = [] for mapping_cls in EXTRINSIC_MAPPINGS.values(): if data.format in mapping_cls.extrinsic_metadata_formats(): mapping = mapping_cls() metadata_item = mapping.translate(data.metadata) if metadata_item is not None: metadata_items.append(metadata_item) mappings.append(mapping.name) if not metadata_items: # Don't have any mapping to parse it, ignore return [] # TODO: batch requests to origin_get_by_sha1() - origins = self.storage.origin_get_by_sha1([data.target.object_id]) - try: - (origin,) = origins - if origin is None: - raise ValueError() - except ValueError: + for _ in range(6): + origins = self.storage.origin_get_by_sha1([data.target.object_id]) + try: + (origin,) = origins + if origin is not None: + break + except ValueError: + pass + # The origin does not exist. This may be due to some replication lag + # between the loader's DB/journal and the DB we are consuming from. + # Wait a bit and try again + logger.debug("Origin %s not found, sleeping for 10s.", data.target) + time.sleep(10) + else: + # Does not exist, or replication lag > 60s. 
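# (Six attempts with a 10 second pause after each failed lookup give the loop
#  above a budget of roughly one minute of replication lag before the origin
#  is declared unknown below.)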
raise ValueError(f"Unknown origin {data.target}") from None if urlparse(data.authority.url).netloc != urlparse(origin["url"]).netloc: # metadata provided by a third-party; don't trust it # TODO: add ways to define trusted authorities return [] metadata = merge_documents(metadata_items) return [ OriginExtrinsicMetadataRow( id=origin["url"], indexer_configuration_id=self.tool["id"], from_remd_id=data.id, mappings=mappings, metadata=metadata, ) ] def persist_index_computations( self, results: List[OriginExtrinsicMetadataRow] ) -> Dict[str, int]: """Persist the results in storage.""" return self.idx_storage.origin_extrinsic_metadata_add(results) class ContentMetadataIndexer(ContentIndexer[ContentMetadataRow]): """Content-level indexer This indexer is in charge of: - filtering out content already indexed in content_metadata - reading content from objstorage with the content's id sha1 - computing metadata by given context - using the metadata_dictionary as the 'swh-metadata-translator' tool - store result in content_metadata table """ def filter(self, ids): """Filter out known sha1s and return only missing ones.""" yield from self.idx_storage.content_metadata_missing( ( { "id": sha1, "indexer_configuration_id": self.tool["id"], } for sha1 in ids ) ) def index( self, id: Sha1, data: Optional[bytes] = None, log_suffix="unknown directory", **kwargs, ) -> List[ContentMetadataRow]: """Index sha1s' content and store result. Args: id: content's identifier data: raw content in bytes Returns: dict: dictionary representing a content_metadata. If the translation wasn't successful the metadata keys will be returned as None """ assert isinstance(id, bytes) assert data is not None metadata = None try: mapping_name = self.tool["tool_configuration"]["context"] log_suffix += ", content_id=%s" % hashutil.hash_to_hex(id) metadata = INTRINSIC_MAPPINGS[mapping_name](log_suffix).translate(data) except Exception: self.log.exception( "Problem during metadata translation " "for content %s" % hashutil.hash_to_hex(id) ) sentry_sdk.capture_exception() if metadata is None: return [] return [ ContentMetadataRow( id=id, indexer_configuration_id=self.tool["id"], metadata=metadata, ) ] def persist_index_computations( self, results: List[ContentMetadataRow] ) -> Dict[str, int]: """Persist the results in storage.""" return self.idx_storage.content_metadata_add(results) DEFAULT_CONFIG: Dict[str, Any] = { "tools": { "name": "swh-metadata-detector", "version": "0.0.2", "configuration": {}, }, } class DirectoryMetadataIndexer(DirectoryIndexer[DirectoryIntrinsicMetadataRow]): """Directory-level indexer This indexer is in charge of: - filtering directories already indexed in directory_intrinsic_metadata table with defined computation tool - retrieve all entry_files in directory - use metadata_detector for file_names containing metadata - compute metadata translation if necessary and possible (depends on tool) - send sha1s to content indexing if possible - store the results for directory """ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.config = merge_configs(DEFAULT_CONFIG, self.config) def filter(self, sha1_gits): """Filter out known sha1s and return only missing ones.""" yield from self.idx_storage.directory_intrinsic_metadata_missing( ( { "id": sha1_git, "indexer_configuration_id": self.tool["id"], } for sha1_git in sha1_gits ) ) def index( self, id: Sha1Git, data: Optional[Directory] = None, **kwargs ) -> List[DirectoryIntrinsicMetadataRow]: """Index directory by processing it and organizing result. 
use metadata_detector to iterate on filenames, passes them to the content indexers, then merges (if more than one) Args: id: sha1_git of the directory data: should always be None Returns: dict: dictionary representing a directory_intrinsic_metadata, with keys: - id: directory's identifier (sha1_git) - indexer_configuration_id (bytes): tool used - metadata: dict of retrieved metadata """ dir_: List[DirectoryLsEntry] assert data is None, "Unexpected directory object" dir_ = cast( List[DirectoryLsEntry], list(self.storage.directory_ls(id, recursive=False)), ) try: if [entry["type"] for entry in dir_] == ["dir"]: # If the root is just a single directory, recurse into it # eg. PyPI packages, GNU tarballs subdir = dir_[0]["target"] dir_ = cast( List[DirectoryLsEntry], list(self.storage.directory_ls(subdir, recursive=False)), ) files = [entry for entry in dir_ if entry["type"] == "file"] (mappings, metadata) = self.translate_directory_intrinsic_metadata( files, log_suffix="directory=%s" % hashutil.hash_to_hex(id), ) except Exception as e: self.log.exception("Problem when indexing dir: %r", e) sentry_sdk.capture_exception() return [] return [ DirectoryIntrinsicMetadataRow( id=id, indexer_configuration_id=self.tool["id"], mappings=mappings, metadata=metadata, ) ] def persist_index_computations( self, results: List[DirectoryIntrinsicMetadataRow] ) -> Dict[str, int]: """Persist the results in storage.""" # TODO: add functions in storage to keep data in # directory_intrinsic_metadata return self.idx_storage.directory_intrinsic_metadata_add(results) def translate_directory_intrinsic_metadata( self, files: List[DirectoryLsEntry], log_suffix: str ) -> Tuple[List[Any], Any]: """ Determine plan of action to translate metadata in the given root directory Args: files: list of file entries, as returned by :meth:`swh.storage.interface.StorageInterface.directory_ls` Returns: (List[str], dict): list of mappings used and dict with translated metadata according to the CodeMeta vocabulary """ metadata = [] tool = { "name": "swh-metadata-translator", "version": "0.0.2", "configuration": {}, } # TODO: iterate on each context, on each file # -> get raw_contents # -> translate each content config = {k: self.config[k] for k in [INDEXER_CFG_KEY, "objstorage", "storage"]} config["tools"] = [tool] all_detected_files = detect_metadata(files) used_mappings = [ INTRINSIC_MAPPINGS[context].name for context in all_detected_files ] for (mapping_name, detected_files) in all_detected_files.items(): cfg = deepcopy(config) cfg["tools"][0]["configuration"]["context"] = mapping_name c_metadata_indexer = ContentMetadataIndexer(config=cfg) # sha1s that are in content_metadata table sha1s_in_storage = [] metadata_generator = self.idx_storage.content_metadata_get(detected_files) for c in metadata_generator: # extracting metadata sha1 = c.id sha1s_in_storage.append(sha1) local_metadata = c.metadata # local metadata is aggregated if local_metadata: metadata.append(local_metadata) sha1s_filtered = [ item for item in detected_files if item not in sha1s_in_storage ] if sha1s_filtered: # content indexing try: c_metadata_indexer.run( sha1s_filtered, log_suffix=log_suffix, ) # on the fly possibility: for result in c_metadata_indexer.results: local_metadata = result.metadata metadata.append(local_metadata) except Exception: self.log.exception("Exception while indexing metadata on contents") sentry_sdk.capture_exception() metadata = merge_documents(metadata) return (used_mappings, metadata) class OriginMetadataIndexer( 
OriginIndexer[Tuple[OriginIntrinsicMetadataRow, DirectoryIntrinsicMetadataRow]] ): USE_TOOLS = False def __init__(self, config=None, **kwargs) -> None: super().__init__(config=config, **kwargs) self.directory_metadata_indexer = DirectoryMetadataIndexer(config=config) def index_list( self, origins: List[Origin], *, check_origin_known: bool = True, **kwargs, ) -> List[Tuple[OriginIntrinsicMetadataRow, DirectoryIntrinsicMetadataRow]]: head_rev_ids = [] head_rel_ids = [] origin_heads: Dict[Origin, CoreSWHID] = {} # Filter out origins not in the storage if check_origin_known: known_origins = list( call_with_batches( self.storage.origin_get, [origin.url for origin in origins], ORIGIN_GET_BATCH_SIZE, ) ) else: known_origins = list(origins) for origin in known_origins: if origin is None: continue head_swhid = get_head_swhid(self.storage, origin.url) if head_swhid: origin_heads[origin] = head_swhid if head_swhid.object_type == ObjectType.REVISION: head_rev_ids.append(head_swhid.object_id) elif head_swhid.object_type == ObjectType.RELEASE: head_rel_ids.append(head_swhid.object_id) else: assert False, head_swhid head_revs = dict( zip( head_rev_ids, call_with_batches( self.storage.revision_get, head_rev_ids, REVISION_GET_BATCH_SIZE ), ) ) head_rels = dict( zip( head_rel_ids, call_with_batches( self.storage.release_get, head_rel_ids, RELEASE_GET_BATCH_SIZE ), ) ) results = [] for (origin, head_swhid) in origin_heads.items(): sentry_sdk.set_tag("swh-indexer-origin-url", origin.url) sentry_sdk.set_tag("swh-indexer-origin-head-swhid", str(head_swhid)) if head_swhid.object_type == ObjectType.REVISION: rev = head_revs[head_swhid.object_id] if not rev: self.log.warning( "Missing head object %s of origin %r", head_swhid, origin.url ) continue directory_id = rev.directory elif head_swhid.object_type == ObjectType.RELEASE: rel = head_rels[head_swhid.object_id] if not rel: self.log.warning( "Missing head object %s of origin %r", head_swhid, origin.url ) continue if rel.target_type != ModelObjectType.DIRECTORY: # TODO self.log.warning( "Head release %s of %r has unexpected target type %s", head_swhid, origin.url, rel.target_type, ) continue assert rel.target, rel directory_id = rel.target else: assert False, head_swhid for dir_metadata in self.directory_metadata_indexer.index(directory_id): # There is at most one dir_metadata orig_metadata = OriginIntrinsicMetadataRow( from_directory=dir_metadata.id, id=origin.url, metadata=dir_metadata.metadata, mappings=dir_metadata.mappings, indexer_configuration_id=dir_metadata.indexer_configuration_id, ) results.append((orig_metadata, dir_metadata)) return results def persist_index_computations( self, results: List[Tuple[OriginIntrinsicMetadataRow, DirectoryIntrinsicMetadataRow]], ) -> Dict[str, int]: # Deduplicate directories - dir_metadata: List[DirectoryIntrinsicMetadataRow] = [] - orig_metadata: List[OriginIntrinsicMetadataRow] = [] + dir_metadata: Dict[bytes, DirectoryIntrinsicMetadataRow] = {} + orig_metadata: Dict[str, OriginIntrinsicMetadataRow] = {} summary: Dict = {} for (orig_item, dir_item) in results: assert dir_item.metadata == orig_item.metadata if dir_item.metadata and not (dir_item.metadata.keys() <= {"@context"}): # Only store non-empty metadata sets - if dir_item not in dir_metadata: - dir_metadata.append(dir_item) - if orig_item not in orig_metadata: - orig_metadata.append(orig_item) + if dir_item.id not in dir_metadata: + dir_metadata[dir_item.id] = dir_item + if orig_item.id not in orig_metadata: + orig_metadata[orig_item.id] = orig_item if 
dir_metadata: summary_dir = self.idx_storage.directory_intrinsic_metadata_add( - dir_metadata + list(dir_metadata.values()) ) summary.update(summary_dir) if orig_metadata: - summary_ori = self.idx_storage.origin_intrinsic_metadata_add(orig_metadata) + summary_ori = self.idx_storage.origin_intrinsic_metadata_add( + list(orig_metadata.values()) + ) summary.update(summary_ori) return summary diff --git a/swh/indexer/metadata_dictionary/base.py b/swh/indexer/metadata_dictionary/base.py index 418c2ec..f6253d7 100644 --- a/swh/indexer/metadata_dictionary/base.py +++ b/swh/indexer/metadata_dictionary/base.py @@ -1,348 +1,371 @@ # Copyright (C) 2017-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json import logging from typing import Any, Callable, Dict, List, Optional, Tuple, TypeVar +import urllib.parse import uuid import xml.parsers.expat from pyld import jsonld import rdflib from typing_extensions import TypedDict import xmltodict import yaml from swh.indexer.codemeta import _document_loader, compact from swh.indexer.namespaces import RDF, SCHEMA from swh.indexer.storage.interface import Sha1 class DirectoryLsEntry(TypedDict): target: Sha1 sha1: Sha1 name: bytes type: str TTranslateCallable = TypeVar( "TTranslateCallable", bound=Callable[[Any, rdflib.Graph, rdflib.term.BNode, Any], None], ) def produce_terms(*uris: str) -> Callable[[TTranslateCallable], TTranslateCallable]: """Returns a decorator that marks the decorated function as adding the given terms to the ``translated_metadata`` dict""" def decorator(f: TTranslateCallable) -> TTranslateCallable: if not hasattr(f, "produced_terms"): f.produced_terms = [] # type: ignore f.produced_terms.extend(uris) # type: ignore return f return decorator class BaseMapping: """Base class for :class:`BaseExtrinsicMapping` and :class:`BaseIntrinsicMapping`, not to be inherited directly.""" def __init__(self, log_suffix=""): self.log_suffix = log_suffix self.log = logging.getLogger( "%s.%s" % (self.__class__.__module__, self.__class__.__name__) ) @property def name(self): """A name of this mapping, used as an identifier in the indexer storage.""" raise NotImplementedError(f"{self.__class__.__name__}.name") def translate(self, raw_content: bytes) -> Optional[Dict]: """ Translates content by parsing content from a bytestring containing mapping-specific data and translating with the appropriate mapping to JSON-LD using the Codemeta and ForgeFed vocabularies. Args: raw_content: raw content to translate Returns: translated metadata in JSON friendly form needed for the content if parseable, :const:`None` otherwise. 
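The concrete mappings defined below parse ``raw_content`` with their format-specific parser (JSON, XML or YAML in this module) and then delegate to the shared dictionary translation.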
""" raise NotImplementedError(f"{self.__class__.__name__}.translate") def normalize_translation(self, metadata: Dict[str, Any]) -> Dict[str, Any]: raise NotImplementedError(f"{self.__class__.__name__}.normalize_translation") class BaseExtrinsicMapping(BaseMapping): """Base class for extrinsic_metadata mappings to inherit from To implement a new mapping: - inherit this class - override translate function """ @classmethod def extrinsic_metadata_formats(cls) -> Tuple[str, ...]: """ Returns the list of extrinsic metadata formats which can be translated by this mapping """ raise NotImplementedError(f"{cls.__name__}.extrinsic_metadata_formats") def normalize_translation(self, metadata: Dict[str, Any]) -> Dict[str, Any]: return compact(metadata, forgefed=True) class BaseIntrinsicMapping(BaseMapping): """Base class for intrinsic-metadata mappings to inherit from To implement a new mapping: - inherit this class - override translate function """ @classmethod def detect_metadata_files(cls, file_entries: List[DirectoryLsEntry]) -> List[Sha1]: """ Returns the sha1 hashes of files which can be translated by this mapping """ raise NotImplementedError(f"{cls.__name__}.detect_metadata_files") def normalize_translation(self, metadata: Dict[str, Any]) -> Dict[str, Any]: return compact(metadata, forgefed=False) class SingleFileIntrinsicMapping(BaseIntrinsicMapping): """Base class for all intrinsic metadata mappings that use a single file as input.""" @property def filename(self): """The .json file to extract metadata from.""" raise NotImplementedError(f"{self.__class__.__name__}.filename") @classmethod def detect_metadata_files(cls, file_entries: List[DirectoryLsEntry]) -> List[Sha1]: for entry in file_entries: if entry["name"].lower() == cls.filename: return [entry["sha1"]] return [] class DictMapping(BaseMapping): """Base class for mappings that take as input a file that is mostly a key-value store (eg. a shallow JSON dict).""" string_fields: List[str] = [] """List of fields that are simple strings, and don't need any normalization.""" uri_fields: List[str] = [] """List of fields that are simple URIs, and don't need any normalization.""" @property def mapping(self): """A translation dict to map dict keys into a canonical name.""" raise NotImplementedError(f"{self.__class__.__name__}.mapping") @staticmethod def _normalize_method_name(name: str) -> str: return name.replace("-", "_") @classmethod def supported_terms(cls): # one-to-one mapping from the original key to a CodeMeta term simple_terms = { str(term) for (key, term) in cls.mapping.items() if key in cls.string_fields + cls.uri_fields or hasattr(cls, "normalize_" + cls._normalize_method_name(key)) } # more complex mapping from the original key to JSON-LD complex_terms = { str(term) for meth_name in dir(cls) if meth_name.startswith("translate_") for term in getattr(getattr(cls, meth_name), "produced_terms", []) } return simple_terms | complex_terms def _translate_dict(self, content_dict: Dict) -> Dict[str, Any]: """ Translates content by parsing content from a dict object and translating with the appropriate mapping Args: content_dict (dict): content dict to translate Returns: dict: translated metadata in json-friendly form needed for the indexer """ graph = rdflib.Graph() # The main object being described (the SoftwareSourceCode) does not necessarily # may or may not have an id. # Either way, we temporarily use this URI to identify it. 
Unfortunately, # we cannot use a blank node as we need to use it for JSON-LD framing later, # and blank nodes cannot be used for framing in JSON-LD >= 1.1 root_id = ( "https://www.softwareheritage.org/schema/2022/indexer/tmp-node/" + str(uuid.uuid4()) ) root = rdflib.URIRef(root_id) graph.add((root, RDF.type, SCHEMA.SoftwareSourceCode)) for k, v in content_dict.items(): # First, check if there is a specific translation # method for this key translation_method = getattr( self, "translate_" + self._normalize_method_name(k), None ) if translation_method: translation_method(graph, root, v) elif k in self.mapping: # if there is no method, but the key is known from the # crosswalk table codemeta_key = self.mapping[k] # if there is a normalization method, use it on the value, # and add its results to the triples normalization_method = getattr( self, "normalize_" + self._normalize_method_name(k), None ) if normalization_method: v = normalization_method(v) if v is None: pass elif isinstance(v, list): for item in reversed(v): graph.add((root, codemeta_key, item)) else: graph.add((root, codemeta_key, v)) elif k in self.string_fields and isinstance(v, str): graph.add((root, codemeta_key, rdflib.Literal(v))) elif k in self.string_fields and isinstance(v, list): for item in v: graph.add((root, codemeta_key, rdflib.Literal(item))) elif k in self.uri_fields and isinstance(v, str): - graph.add((root, codemeta_key, rdflib.URIRef(v))) + # Workaround for https://github.com/digitalbazaar/pyld/issues/91 : drop + # URLs that are blatantly invalid early, so PyLD does not crash. + parsed_url = urllib.parse.urlparse(v) + if parsed_url.netloc: + graph.add((root, codemeta_key, rdflib.URIRef(v))) elif k in self.uri_fields and isinstance(v, list): for item in v: if isinstance(item, str): - graph.add((root, codemeta_key, rdflib.URIRef(item))) + # ditto + parsed_url = urllib.parse.urlparse(item) + if parsed_url.netloc: + graph.add((root, codemeta_key, rdflib.URIRef(item))) else: continue self.extra_translation(graph, root, content_dict) + self.sanitize(graph) + # Convert from rdflib's internal graph representation to JSON s = graph.serialize(format="application/ld+json") # Load from JSON to a list of Python objects jsonld_graph = json.loads(s) # Use JSON-LD framing to turn the graph into a rooted tree # frame = {"@type": str(SCHEMA.SoftwareSourceCode)} translated_metadata = jsonld.frame( jsonld_graph, {"@id": root_id}, options={ "documentLoader": _document_loader, "processingMode": "json-ld-1.1", }, ) # Remove the temporary id we added at the beginning if isinstance(translated_metadata["@id"], list): translated_metadata["@id"].remove(root_id) else: del translated_metadata["@id"] return self.normalize_translation(translated_metadata) + def sanitize(self, graph: rdflib.Graph) -> None: + # Remove triples that make PyLD crash + for (subject, predicate, _) in graph.triples((None, None, rdflib.URIRef(""))): + graph.remove((subject, predicate, rdflib.URIRef(""))) + + # Should not happen, but we's better check as this may lead to incorrect data + invalid = False + for triple in graph.triples((rdflib.URIRef(""), None, None)): + invalid = True + logging.error("Empty triple subject URI: %r", triple) + if invalid: + raise ValueError("Empty triple subject(s)") + def extra_translation( self, graph: rdflib.Graph, root: rdflib.term.Node, d: Dict[str, Any] - ): + ) -> None: """Called at the end of the translation process, and may add arbitrary triples to ``graph`` based on the input dictionary (passed as ``d``). 
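For instance, GitHubMapping overrides it to retype the root node as a ForgeFed Repository, and MavenMapping overrides it to derive the code repository URL from the pom's repositories section.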
""" pass class JsonMapping(DictMapping): """Base class for all mappings that use JSON data as input.""" def translate(self, raw_content: bytes) -> Optional[Dict]: try: raw_content_string: str = raw_content.decode() except UnicodeDecodeError: self.log.warning("Error unidecoding from %s", self.log_suffix) return None try: content_dict = json.loads(raw_content_string) except json.JSONDecodeError: self.log.warning("Error unjsoning from %s", self.log_suffix) return None if isinstance(content_dict, dict): return self._translate_dict(content_dict) return None class XmlMapping(DictMapping): """Base class for all mappings that use XML data as input.""" def translate(self, raw_content: bytes) -> Optional[Dict]: try: d = xmltodict.parse(raw_content) except xml.parsers.expat.ExpatError: self.log.warning("Error parsing XML from %s", self.log_suffix) return None except UnicodeDecodeError: self.log.warning("Error unidecoding XML from %s", self.log_suffix) return None except (LookupError, ValueError): # unknown encoding or multi-byte encoding self.log.warning("Error detecting XML encoding from %s", self.log_suffix) return None if not isinstance(d, dict): self.log.warning("Skipping ill-formed XML content: %s", raw_content) return None return self._translate_dict(d) class SafeLoader(yaml.SafeLoader): yaml_implicit_resolvers = { k: [r for r in v if r[0] != "tag:yaml.org,2002:timestamp"] for k, v in yaml.SafeLoader.yaml_implicit_resolvers.items() } class YamlMapping(DictMapping, SingleFileIntrinsicMapping): """Base class for all mappings that use Yaml data as input.""" def translate(self, raw_content: bytes) -> Optional[Dict[str, str]]: raw_content_string: str = raw_content.decode() try: content_dict = yaml.load(raw_content_string, Loader=SafeLoader) except yaml.scanner.ScannerError: return None if isinstance(content_dict, dict): return self._translate_dict(content_dict) return None diff --git a/swh/indexer/metadata_dictionary/github.py b/swh/indexer/metadata_dictionary/github.py index fe3b87e..d8d8702 100644 --- a/swh/indexer/metadata_dictionary/github.py +++ b/swh/indexer/metadata_dictionary/github.py @@ -1,113 +1,117 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from typing import Any, Tuple from rdflib import RDF, BNode, Graph, Literal, URIRef from swh.indexer.codemeta import CROSSWALK_TABLE from swh.indexer.namespaces import ACTIVITYSTREAMS, FORGEFED, SCHEMA from .base import BaseExtrinsicMapping, JsonMapping, produce_terms from .utils import prettyprint_graph # noqa SPDX = URIRef("https://spdx.org/licenses/") class GitHubMapping(BaseExtrinsicMapping, JsonMapping): name = "github" - mapping = CROSSWALK_TABLE["GitHub"] + mapping = { + **CROSSWALK_TABLE["GitHub"], + "topics": SCHEMA.keywords, # TODO: submit this to the official crosswalk + } string_fields = [ "archive_url", "created_at", "updated_at", "description", "full_name", "html_url", "issues_url", + "topics", ] @classmethod def extrinsic_metadata_formats(cls) -> Tuple[str, ...]: return ("application/vnd.github.v3+json",) def extra_translation(self, graph, root, content_dict): graph.remove((root, RDF.type, SCHEMA.SoftwareSourceCode)) graph.add((root, RDF.type, FORGEFED.Repository)) @produce_terms(FORGEFED.forks, ACTIVITYSTREAMS.totalItems) def translate_forks_count(self, graph: Graph, root: BNode, v: Any) -> None: """ >>> graph = Graph() >>> root = 
URIRef("http://example.org/test-software") >>> GitHubMapping().translate_forks_count(graph, root, 42) >>> prettyprint_graph(graph, root) { "@id": ..., "https://forgefed.org/ns#forks": { "@type": "https://www.w3.org/ns/activitystreams#OrderedCollection", "https://www.w3.org/ns/activitystreams#totalItems": 42 } } """ if isinstance(v, int): collection = BNode() graph.add((root, FORGEFED.forks, collection)) graph.add((collection, RDF.type, ACTIVITYSTREAMS.OrderedCollection)) graph.add((collection, ACTIVITYSTREAMS.totalItems, Literal(v))) @produce_terms(ACTIVITYSTREAMS.likes, ACTIVITYSTREAMS.totalItems) def translate_stargazers_count(self, graph: Graph, root: BNode, v: Any) -> None: """ >>> graph = Graph() >>> root = URIRef("http://example.org/test-software") >>> GitHubMapping().translate_stargazers_count(graph, root, 42) >>> prettyprint_graph(graph, root) { "@id": ..., "https://www.w3.org/ns/activitystreams#likes": { "@type": "https://www.w3.org/ns/activitystreams#Collection", "https://www.w3.org/ns/activitystreams#totalItems": 42 } } """ if isinstance(v, int): collection = BNode() graph.add((root, ACTIVITYSTREAMS.likes, collection)) graph.add((collection, RDF.type, ACTIVITYSTREAMS.Collection)) graph.add((collection, ACTIVITYSTREAMS.totalItems, Literal(v))) @produce_terms(ACTIVITYSTREAMS.followers, ACTIVITYSTREAMS.totalItems) def translate_watchers_count(self, graph: Graph, root: BNode, v: Any) -> None: """ >>> graph = Graph() >>> root = URIRef("http://example.org/test-software") >>> GitHubMapping().translate_watchers_count(graph, root, 42) >>> prettyprint_graph(graph, root) { "@id": ..., "https://www.w3.org/ns/activitystreams#followers": { "@type": "https://www.w3.org/ns/activitystreams#Collection", "https://www.w3.org/ns/activitystreams#totalItems": 42 } } """ if isinstance(v, int): collection = BNode() graph.add((root, ACTIVITYSTREAMS.followers, collection)) graph.add((collection, RDF.type, ACTIVITYSTREAMS.Collection)) graph.add((collection, ACTIVITYSTREAMS.totalItems, Literal(v))) def normalize_license(self, d): """ >>> GitHubMapping().normalize_license({'spdx_id': 'MIT'}) rdflib.term.URIRef('https://spdx.org/licenses/MIT') """ if isinstance(d, dict) and isinstance(d.get("spdx_id"), str): return SPDX + d["spdx_id"] diff --git a/swh/indexer/metadata_dictionary/maven.py b/swh/indexer/metadata_dictionary/maven.py index a374a5e..8b3e48d 100644 --- a/swh/indexer/metadata_dictionary/maven.py +++ b/swh/indexer/metadata_dictionary/maven.py @@ -1,159 +1,162 @@ # Copyright (C) 2018-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os from typing import Any, Dict from rdflib import Graph, Literal, URIRef from swh.indexer.codemeta import CROSSWALK_TABLE from swh.indexer.namespaces import SCHEMA from .base import SingleFileIntrinsicMapping, XmlMapping from .utils import prettyprint_graph # noqa class MavenMapping(XmlMapping, SingleFileIntrinsicMapping): """ dedicated class for Maven (pom.xml) mapping and translation """ name = "maven" filename = b"pom.xml" mapping = CROSSWALK_TABLE["Java (Maven)"] string_fields = ["name", "version", "description", "email"] _default_repository = {"url": "https://repo.maven.apache.org/maven2/"} def _translate_dict(self, d: Dict[str, Any]) -> Dict[str, Any]: return super()._translate_dict(d.get("project") or {}) def extra_translation(self, graph: Graph, root, d): 
self.parse_repositories(graph, root, d) def parse_repositories(self, graph: Graph, root, d): """https://maven.apache.org/pom.html#Repositories >>> import rdflib >>> import xmltodict >>> from pprint import pprint >>> d = xmltodict.parse(''' ... ... ... codehausSnapshots ... Codehaus Snapshots ... http://snapshots.maven.codehaus.org/maven2 ... default ... ... ... ''') >>> MavenMapping().parse_repositories(rdflib.Graph(), rdflib.BNode(), d) """ repositories = d.get("repositories") if not repositories: self.parse_repository(graph, root, d, self._default_repository) elif isinstance(repositories, dict): repositories = repositories.get("repository") or [] if not isinstance(repositories, list): repositories = [repositories] for repo in repositories: self.parse_repository(graph, root, d, repo) def parse_repository(self, graph: Graph, root, d, repo): if not isinstance(repo, dict): return if repo.get("layout", "default") != "default": return # TODO ? url = repo.get("url") group_id = d.get("groupId") artifact_id = d.get("artifactId") if ( isinstance(url, str) and isinstance(group_id, str) and isinstance(artifact_id, str) ): repo = os.path.join(url, *group_id.split("."), artifact_id) + if "${" in repo: + # Often use as templating in pom.xml files collected from VCSs + return graph.add((root, SCHEMA.codeRepository, URIRef(repo))) def normalize_groupId(self, id_): """https://maven.apache.org/pom.html#Maven_Coordinates >>> MavenMapping().normalize_groupId('org.example') rdflib.term.Literal('org.example') """ if isinstance(id_, str): return Literal(id_) def translate_licenses(self, graph, root, licenses): """https://maven.apache.org/pom.html#Licenses >>> import xmltodict >>> import json >>> d = xmltodict.parse(''' ... ... ... Apache License, Version 2.0 ... https://www.apache.org/licenses/LICENSE-2.0.txt ... ... ... ''') >>> print(json.dumps(d, indent=4)) { "licenses": { "license": { "name": "Apache License, Version 2.0", "url": "https://www.apache.org/licenses/LICENSE-2.0.txt" } } } >>> graph = Graph() >>> root = URIRef("http://example.org/test-software") >>> MavenMapping().translate_licenses(graph, root, d["licenses"]) >>> prettyprint_graph(graph, root) { "@id": ..., "http://schema.org/license": { "@id": "https://www.apache.org/licenses/LICENSE-2.0.txt" } } or, if there are more than one license: >>> import xmltodict >>> from pprint import pprint >>> d = xmltodict.parse(''' ... ... ... Apache License, Version 2.0 ... https://www.apache.org/licenses/LICENSE-2.0.txt ... ... ... MIT License ... https://opensource.org/licenses/MIT ... ... ... 
''') >>> graph = Graph() >>> root = URIRef("http://example.org/test-software") >>> MavenMapping().translate_licenses(graph, root, d["licenses"]) >>> pprint(set(graph.triples((root, URIRef("http://schema.org/license"), None)))) {(rdflib.term.URIRef('http://example.org/test-software'), rdflib.term.URIRef('http://schema.org/license'), rdflib.term.URIRef('https://opensource.org/licenses/MIT')), (rdflib.term.URIRef('http://example.org/test-software'), rdflib.term.URIRef('http://schema.org/license'), rdflib.term.URIRef('https://www.apache.org/licenses/LICENSE-2.0.txt'))} """ if not isinstance(licenses, dict): return licenses = licenses.get("license") if isinstance(licenses, dict): licenses = [licenses] elif not isinstance(licenses, list): return for license in licenses: if isinstance(license, dict) and isinstance(license.get("url"), str): graph.add((root, SCHEMA.license, URIRef(license["url"]))) diff --git a/swh/indexer/metadata_dictionary/npm.py b/swh/indexer/metadata_dictionary/npm.py index 1540ef6..f2eaa64 100644 --- a/swh/indexer/metadata_dictionary/npm.py +++ b/swh/indexer/metadata_dictionary/npm.py @@ -1,282 +1,292 @@ # Copyright (C) 2018-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import re import urllib.parse from rdflib import RDF, BNode, Graph, Literal, URIRef from swh.indexer.codemeta import CROSSWALK_TABLE from swh.indexer.namespaces import SCHEMA from .base import JsonMapping, SingleFileIntrinsicMapping from .utils import add_list, prettyprint_graph # noqa SPDX = URIRef("https://spdx.org/licenses/") class NpmMapping(JsonMapping, SingleFileIntrinsicMapping): """ dedicated class for NPM (package.json) mapping and translation """ name = "npm" mapping = CROSSWALK_TABLE["NodeJS"] filename = b"package.json" string_fields = ["name", "version", "description", "email"] uri_fields = ["homepage"] _schema_shortcuts = { "github": "git+https://github.com/%s.git", "gist": "git+https://gist.github.com/%s.git", "gitlab": "git+https://gitlab.com/%s.git", # Bitbucket supports both hg and git, and the shortcut does not # tell which one to use. # 'bitbucket': 'https://bitbucket.org/', } def normalize_repository(self, d): """https://docs.npmjs.com/files/package.json#repository >>> NpmMapping().normalize_repository({ ... 'type': 'git', ... 'url': 'https://example.org/foo.git' ... }) rdflib.term.URIRef('git+https://example.org/foo.git') >>> NpmMapping().normalize_repository( ... 'gitlab:foo/bar') rdflib.term.URIRef('git+https://gitlab.com/foo/bar.git') >>> NpmMapping().normalize_repository( ... 'foo/bar') rdflib.term.URIRef('git+https://github.com/foo/bar.git') """ if ( isinstance(d, dict) and isinstance(d.get("type"), str) and isinstance(d.get("url"), str) ): url = "{type}+{url}".format(**d) elif isinstance(d, str): if "://" in d: url = d elif ":" in d: (schema, rest) = d.split(":", 1) if schema in self._schema_shortcuts: url = self._schema_shortcuts[schema] % rest else: return None else: url = self._schema_shortcuts["github"] % d else: return None return URIRef(url) def normalize_bugs(self, d): """https://docs.npmjs.com/files/package.json#bugs >>> NpmMapping().normalize_bugs({ ... 'url': 'https://example.org/bugs/', ... 'email': 'bugs@example.org' ... }) rdflib.term.URIRef('https://example.org/bugs/') >>> NpmMapping().normalize_bugs( ... 
'https://example.org/bugs/') rdflib.term.URIRef('https://example.org/bugs/') """ if isinstance(d, dict) and isinstance(d.get("url"), str): return URIRef(d["url"]) elif isinstance(d, str): return URIRef(d) else: return None _parse_author = re.compile( r"^ *" r"(?P.*?)" r"( +<(?P.*)>)?" r"( +\((?P.*)\))?" r" *$" ) def translate_author(self, graph: Graph, root, d): r"""https://docs.npmjs.com/files/package.json#people-fields-author-contributors' >>> from pprint import pprint >>> root = URIRef("http://example.org/test-software") >>> graph = Graph() >>> NpmMapping().translate_author(graph, root, { ... 'name': 'John Doe', ... 'email': 'john.doe@example.org', ... 'url': 'https://example.org/~john.doe', ... }) >>> prettyprint_graph(graph, root) { "@id": ..., "http://schema.org/author": { "@list": [ { "@type": "http://schema.org/Person", "http://schema.org/email": "john.doe@example.org", "http://schema.org/name": "John Doe", "http://schema.org/url": { "@id": "https://example.org/~john.doe" } } ] } } >>> graph = Graph() >>> NpmMapping().translate_author(graph, root, ... 'John Doe (https://example.org/~john.doe)' ... ) >>> prettyprint_graph(graph, root) { "@id": ..., "http://schema.org/author": { "@list": [ { "@type": "http://schema.org/Person", "http://schema.org/email": "john.doe@example.org", "http://schema.org/name": "John Doe", "http://schema.org/url": { "@id": "https://example.org/~john.doe" } } ] } } >>> graph = Graph() >>> NpmMapping().translate_author(graph, root, { ... 'name': 'John Doe', ... 'email': 'john.doe@example.org', ... 'url': 'https:\\\\example.invalid/~john.doe', ... }) >>> prettyprint_graph(graph, root) { "@id": ..., "http://schema.org/author": { "@list": [ { "@type": "http://schema.org/Person", "http://schema.org/email": "john.doe@example.org", "http://schema.org/name": "John Doe" } ] } } """ # noqa author = BNode() graph.add((author, RDF.type, SCHEMA.Person)) if isinstance(d, dict): name = d.get("name", None) email = d.get("email", None) url = d.get("url", None) elif isinstance(d, str): match = self._parse_author.match(d) if not match: return None name = match.group("name") email = match.group("email") url = match.group("url") else: return None if name and isinstance(name, str): graph.add((author, SCHEMA.name, Literal(name))) if email and isinstance(email, str): graph.add((author, SCHEMA.email, Literal(email))) if url and isinstance(url, str): # Workaround for https://github.com/digitalbazaar/pyld/issues/91 : drop # URLs that are blatantly invalid early, so PyLD does not crash. parsed_url = urllib.parse.urlparse(url) if parsed_url.netloc: graph.add((author, SCHEMA.url, URIRef(url))) add_list(graph, root, SCHEMA.author, [author]) def normalize_description(self, description): r"""Try to re-decode ``description`` as UTF-16, as this is a somewhat common mistake that causes issues in the database because of null bytes in JSON. >>> NpmMapping().normalize_description("foo bar") rdflib.term.Literal('foo bar') >>> NpmMapping().normalize_description( ... "\ufffd\ufffd#\x00 \x00f\x00o\x00o\x00 \x00b\x00a\x00r\x00\r\x00 \x00" ... ) rdflib.term.Literal('foo bar') >>> NpmMapping().normalize_description( ... "\ufffd\ufffd\x00#\x00 \x00f\x00o\x00o\x00 \x00b\x00a\x00r\x00\r\x00 " ... ) rdflib.term.Literal('foo bar') >>> NpmMapping().normalize_description( ... # invalid UTF-16 and meaningless UTF-8: ... "\ufffd\ufffd\x00#\x00\x00\x00 \x00\x00\x00\x00f\x00\x00\x00\x00" ... ) is None True >>> NpmMapping().normalize_description( ... # ditto (ut looks like little-endian at first) ... 
"\ufffd\ufffd#\x00\x00\x00 \x00\x00\x00\x00f\x00\x00\x00\x00\x00" ... ) is None True >>> NpmMapping().normalize_description(None) is None True """ if not isinstance(description, str): return None # XXX: if this function ever need to support more cases, consider # switching to https://pypi.org/project/ftfy/ instead of adding more hacks if description.startswith("\ufffd\ufffd") and "\x00" in description: # 2 unicode replacement characters followed by '# ' encoded as UTF-16 # is a common mistake, which indicates a README.md was saved as UTF-16, # and some NPM tool opened it as UTF-8 and used the first line as # description. description_bytes = description.encode() # Strip the the two unicode replacement characters assert description_bytes.startswith(b"\xef\xbf\xbd\xef\xbf\xbd") description_bytes = description_bytes[6:] # If the following attempts fail to recover the description, discard it # entirely because the current indexer storage backend (postgresql) cannot # store zero bytes in JSON columns. description = None if not description_bytes.startswith(b"\x00"): # try UTF-16 little-endian (the most common) first try: description = description_bytes.decode("utf-16le") except UnicodeDecodeError: pass if description is None: # if it fails, try UTF-16 big-endian try: description = description_bytes.decode("utf-16be") except UnicodeDecodeError: pass if description: if description.startswith("# "): description = description[2:] return Literal(description.rstrip()) else: return None return Literal(description) def normalize_license(self, s): """https://docs.npmjs.com/files/package.json#license >>> NpmMapping().normalize_license('MIT') rdflib.term.URIRef('https://spdx.org/licenses/MIT') """ if isinstance(s, str): + if s.startswith("SEE LICENSE IN "): + # Very common pattern, because it is an example in the specification. + # It is followed by the filename; and the indexer architecture currently + # does not allow accessing that from metadata mappings. 
+ # (Plus, an hypothetical license mapping would eventually pick it up) + return + if " " in s: + # Either an SPDX expression, or unusable data + # TODO: handle it + return return SPDX + s def normalize_keywords(self, lst): """https://docs.npmjs.com/files/package.json#homepage >>> NpmMapping().normalize_keywords(['foo', 'bar']) [rdflib.term.Literal('foo'), rdflib.term.Literal('bar')] """ if isinstance(lst, list): return [Literal(x) for x in lst if isinstance(x, str)] diff --git a/swh/indexer/tests/metadata_dictionary/test_github.py b/swh/indexer/tests/metadata_dictionary/test_github.py index c0592dc..3085bcc 100644 --- a/swh/indexer/tests/metadata_dictionary/test_github.py +++ b/swh/indexer/tests/metadata_dictionary/test_github.py @@ -1,142 +1,156 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from swh.indexer.metadata_dictionary import MAPPINGS CONTEXT = [ "https://doi.org/10.5063/schema/codemeta-2.0", { "as": "https://www.w3.org/ns/activitystreams#", "forge": "https://forgefed.org/ns#", }, ] def test_compute_metadata_none(): """ testing content empty content is empty should return None """ content = b"" # None if no metadata was found or an error occurred declared_metadata = None result = MAPPINGS["GitHubMapping"]().translate(content) assert declared_metadata == result def test_supported_terms(): terms = MAPPINGS["GitHubMapping"].supported_terms() assert { "http://schema.org/name", "http://schema.org/license", "https://forgefed.org/ns#forks", "https://www.w3.org/ns/activitystreams#totalItems", } <= terms def test_compute_metadata_github(): - """ - testing only computation of metadata with hard_mapping_npm - """ content = b""" { "id": 80521091, "node_id": "MDEwOlJlcG9zaXRvcnk4MDUyMTA5MQ==", "name": "swh-indexer", "full_name": "SoftwareHeritage/swh-indexer", "private": false, "owner": { "login": "SoftwareHeritage", "id": 18555939, "node_id": "MDEyOk9yZ2FuaXphdGlvbjE4NTU1OTM5", "avatar_url": "https://avatars.githubusercontent.com/u/18555939?v=4", "gravatar_id": "", "url": "https://api.github.com/users/SoftwareHeritage", "type": "Organization", "site_admin": false }, "html_url": "https://github.com/SoftwareHeritage/swh-indexer", "description": "GitHub mirror of Metadata indexer", "fork": false, "url": "https://api.github.com/repos/SoftwareHeritage/swh-indexer", "created_at": "2017-01-31T13:05:39Z", "updated_at": "2022-06-22T08:02:20Z", "pushed_at": "2022-06-29T09:01:08Z", "git_url": "git://github.com/SoftwareHeritage/swh-indexer.git", "ssh_url": "git@github.com:SoftwareHeritage/swh-indexer.git", "clone_url": "https://github.com/SoftwareHeritage/swh-indexer.git", "svn_url": "https://github.com/SoftwareHeritage/swh-indexer", "homepage": "https://forge.softwareheritage.org/source/swh-indexer/", "size": 2713, "stargazers_count": 13, "watchers_count": 12, "language": "Python", "has_issues": false, "has_projects": false, "has_downloads": true, "has_wiki": false, "has_pages": false, "forks_count": 1, "mirror_url": null, "archived": false, "disabled": false, "open_issues_count": 0, "license": { "key": "gpl-3.0", "name": "GNU General Public License v3.0", "spdx_id": "GPL-3.0", "url": "https://api.github.com/licenses/gpl-3.0", "node_id": "MDc6TGljZW5zZTk=" }, "allow_forking": true, "is_template": false, "web_commit_signoff_required": false, "topics": [ ], "visibility": "public", "forks": 1, 
"open_issues": 0, "watchers": 13, "default_branch": "master", "temp_clone_token": null, "organization": { "login": "SoftwareHeritage", "id": 18555939, "node_id": "MDEyOk9yZ2FuaXphdGlvbjE4NTU1OTM5", "avatar_url": "https://avatars.githubusercontent.com/u/18555939?v=4", "gravatar_id": "", "type": "Organization", "site_admin": false }, "network_count": 1, "subscribers_count": 6 } """ result = MAPPINGS["GitHubMapping"]().translate(content) assert result == { "@context": CONTEXT, "type": "forge:Repository", "forge:forks": { "as:totalItems": 1, "type": "as:OrderedCollection", }, "as:likes": { "as:totalItems": 13, "type": "as:Collection", }, "as:followers": { "as:totalItems": 12, "type": "as:Collection", }, "license": "https://spdx.org/licenses/GPL-3.0", "name": "SoftwareHeritage/swh-indexer", "description": "GitHub mirror of Metadata indexer", "schema:codeRepository": "https://github.com/SoftwareHeritage/swh-indexer", "schema:dateCreated": "2017-01-31T13:05:39Z", "schema:dateModified": "2022-06-22T08:02:20Z", } + + +def test_github_topics(): + content = b""" +{ + "topics": [ + "foo", + "bar" + ] +} + """ + result = MAPPINGS["GitHubMapping"]().translate(content) + assert set(result.pop("keywords", [])) == {"foo", "bar"}, result + assert result == { + "@context": CONTEXT, + "type": "forge:Repository", + } diff --git a/swh/indexer/tests/metadata_dictionary/test_maven.py b/swh/indexer/tests/metadata_dictionary/test_maven.py index 0267e95..afde286 100644 --- a/swh/indexer/tests/metadata_dictionary/test_maven.py +++ b/swh/indexer/tests/metadata_dictionary/test_maven.py @@ -1,365 +1,406 @@ # Copyright (C) 2017-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import logging from hypothesis import HealthCheck, given, settings from swh.indexer.metadata_dictionary import MAPPINGS from ..utils import xml_document_strategy def test_compute_metadata_maven(): raw_content = b""" Maven Default Project 4.0.0 com.mycompany.app my-app 1.2.3 central Maven Repository Switchboard default http://repo1.maven.org/maven2 false Apache License, Version 2.0 https://www.apache.org/licenses/LICENSE-2.0.txt repo A business-friendly OSS license """ result = MAPPINGS["MavenMapping"]().translate(raw_content) assert result == { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "name": "Maven Default Project", "schema:identifier": "com.mycompany.app", "version": "1.2.3", "license": "https://www.apache.org/licenses/LICENSE-2.0.txt", "codeRepository": ("http://repo1.maven.org/maven2/com/mycompany/app/my-app"), } def test_compute_metadata_maven_empty(): raw_content = b""" """ result = MAPPINGS["MavenMapping"]().translate(raw_content) assert result == { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", } def test_compute_metadata_maven_almost_empty(): raw_content = b""" """ result = MAPPINGS["MavenMapping"]().translate(raw_content) assert result == { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", } def test_compute_metadata_maven_invalid_xml(caplog): expected_warning = ( "swh.indexer.metadata_dictionary.maven.MavenMapping", logging.WARNING, "Error parsing XML from foo", ) caplog.at_level(logging.WARNING, logger="swh.indexer.metadata_dictionary") raw_content = b""" """ caplog.clear() result = 
MAPPINGS["MavenMapping"]("foo").translate(raw_content) assert caplog.record_tuples == [expected_warning], result assert result is None raw_content = b""" """ caplog.clear() result = MAPPINGS["MavenMapping"]("foo").translate(raw_content) assert caplog.record_tuples == [expected_warning], result assert result is None def test_compute_metadata_maven_unknown_encoding(caplog): expected_warning = ( "swh.indexer.metadata_dictionary.maven.MavenMapping", logging.WARNING, "Error detecting XML encoding from foo", ) caplog.at_level(logging.WARNING, logger="swh.indexer.metadata_dictionary") raw_content = b""" """ caplog.clear() result = MAPPINGS["MavenMapping"]("foo").translate(raw_content) assert caplog.record_tuples == [expected_warning], result assert result is None raw_content = b""" """ caplog.clear() result = MAPPINGS["MavenMapping"]("foo").translate(raw_content) assert caplog.record_tuples == [expected_warning], result assert result is None def test_compute_metadata_maven_invalid_encoding(caplog): expected_warning = [ # libexpat1 <= 2.2.10-2+deb11u1 [ ( "swh.indexer.metadata_dictionary.maven.MavenMapping", logging.WARNING, "Error unidecoding XML from foo", ) ], # libexpat1 >= 2.2.10-2+deb11u2 [ ( "swh.indexer.metadata_dictionary.maven.MavenMapping", logging.WARNING, "Error parsing XML from foo", ) ], ] caplog.at_level(logging.WARNING, logger="swh.indexer.metadata_dictionary") raw_content = b""" """ caplog.clear() result = MAPPINGS["MavenMapping"]("foo").translate(raw_content) assert caplog.record_tuples in expected_warning, result assert result is None def test_compute_metadata_maven_minimal(): raw_content = b""" Maven Default Project 4.0.0 com.mycompany.app my-app 1.2.3 """ result = MAPPINGS["MavenMapping"]().translate(raw_content) assert result == { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "name": "Maven Default Project", "schema:identifier": "com.mycompany.app", "version": "1.2.3", "codeRepository": ( "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app" ), } def test_compute_metadata_maven_empty_nodes(): raw_content = b""" Maven Default Project 4.0.0 com.mycompany.app my-app 1.2.3 """ result = MAPPINGS["MavenMapping"]().translate(raw_content) assert result == { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "name": "Maven Default Project", "schema:identifier": "com.mycompany.app", "version": "1.2.3", "codeRepository": ( "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app" ), } raw_content = b""" Maven Default Project 4.0.0 com.mycompany.app my-app """ result = MAPPINGS["MavenMapping"]().translate(raw_content) assert result == { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "name": "Maven Default Project", "schema:identifier": "com.mycompany.app", "codeRepository": ( "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app" ), } raw_content = b""" 4.0.0 com.mycompany.app my-app 1.2.3 """ result = MAPPINGS["MavenMapping"]().translate(raw_content) assert result == { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "schema:identifier": "com.mycompany.app", "version": "1.2.3", "codeRepository": ( "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app" ), } raw_content = b""" Maven Default Project 4.0.0 com.mycompany.app my-app 1.2.3 """ result = MAPPINGS["MavenMapping"]().translate(raw_content) assert result == { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": 
"SoftwareSourceCode", "name": "Maven Default Project", "schema:identifier": "com.mycompany.app", "version": "1.2.3", "codeRepository": ( "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app" ), } raw_content = b""" 1.2.3 """ result = MAPPINGS["MavenMapping"]().translate(raw_content) assert result == { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "version": "1.2.3", } def test_compute_metadata_maven_invalid_licenses(): raw_content = b""" Maven Default Project 4.0.0 com.mycompany.app my-app 1.2.3 foo """ result = MAPPINGS["MavenMapping"]().translate(raw_content) assert result == { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "name": "Maven Default Project", "schema:identifier": "com.mycompany.app", "version": "1.2.3", "codeRepository": ( "https://repo.maven.apache.org/maven2/com/mycompany/app/my-app" ), } def test_compute_metadata_maven_multiple(): """Tests when there are multiple code repos and licenses.""" raw_content = b""" Maven Default Project 4.0.0 com.mycompany.app my-app 1.2.3 central Maven Repository Switchboard default http://repo1.maven.org/maven2 false example Example Maven Repo default http://example.org/maven2 Apache License, Version 2.0 https://www.apache.org/licenses/LICENSE-2.0.txt repo A business-friendly OSS license MIT license https://opensource.org/licenses/MIT """ result = MAPPINGS["MavenMapping"]().translate(raw_content) assert set(result.pop("license")) == { "https://www.apache.org/licenses/LICENSE-2.0.txt", "https://opensource.org/licenses/MIT", }, result assert set(result.pop("codeRepository")) == { "http://repo1.maven.org/maven2/com/mycompany/app/my-app", "http://example.org/maven2/com/mycompany/app/my-app", }, result assert result == { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "name": "Maven Default Project", "schema:identifier": "com.mycompany.app", "version": "1.2.3", } +def test_compute_metadata_maven_invalid_repository(): + raw_content = b""" + + Maven Default Project + 4.0.0 + com.mycompany.app + my-app + 1.2.3 + + + tcc-transaction-internal-releases + internal repository for released artifacts + ${repo.internal.releases.url} + + false + + + true + + + + + + Apache License, Version 2.0 + https://www.apache.org/licenses/LICENSE-2.0.txt + repo + A business-friendly OSS license + + + """ + result = MAPPINGS["MavenMapping"]().translate(raw_content) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + "name": "Maven Default Project", + "schema:identifier": "com.mycompany.app", + "version": "1.2.3", + "license": "https://www.apache.org/licenses/LICENSE-2.0.txt", + } + + @settings(suppress_health_check=[HealthCheck.too_slow]) @given( xml_document_strategy( keys=list(MAPPINGS["MavenMapping"].mapping), # type: ignore root="project", xmlns="http://maven.apache.org/POM/4.0.0", ) ) def test_maven_adversarial(doc): MAPPINGS["MavenMapping"]().translate(doc) diff --git a/swh/indexer/tests/metadata_dictionary/test_npm.py b/swh/indexer/tests/metadata_dictionary/test_npm.py index b0ead25..cdaf6b7 100644 --- a/swh/indexer/tests/metadata_dictionary/test_npm.py +++ b/swh/indexer/tests/metadata_dictionary/test_npm.py @@ -1,335 +1,420 @@ # Copyright (C) 2017-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more 
information import json from hypothesis import HealthCheck, given, settings import pytest from swh.indexer.metadata_detector import detect_metadata from swh.indexer.metadata_dictionary import MAPPINGS from swh.indexer.storage.model import ContentMetadataRow from ..test_metadata import TRANSLATOR_TOOL, ContentMetadataTestIndexer from ..utils import ( BASE_TEST_CONFIG, MAPPING_DESCRIPTION_CONTENT_SHA1, json_document_strategy, ) def test_compute_metadata_none(): """ testing content empty content is empty should return None """ content = b"" # None if no metadata was found or an error occurred declared_metadata = None result = MAPPINGS["NpmMapping"]().translate(content) assert declared_metadata == result def test_compute_metadata_npm(): """ testing only computation of metadata with hard_mapping_npm """ content = b""" { "name": "test_metadata", "version": "0.0.2", "description": "Simple package.json test for indexer", "repository": { "type": "git", "url": "https://github.com/moranegg/metadata_test" }, "author": { "email": "moranegg@example.com", "name": "Morane G" } } """ declared_metadata = { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "name": "test_metadata", "version": "0.0.2", "description": "Simple package.json test for indexer", "codeRepository": "git+https://github.com/moranegg/metadata_test", "author": [ { "type": "Person", "name": "Morane G", "email": "moranegg@example.com", } ], } result = MAPPINGS["NpmMapping"]().translate(content) assert declared_metadata == result def test_compute_metadata_invalid_description_npm(): """ testing only computation of metadata with hard_mapping_npm """ content = b""" { "name": "test_metadata", "version": "0.0.2", "description": 1234 } """ declared_metadata = { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "name": "test_metadata", "version": "0.0.2", } result = MAPPINGS["NpmMapping"]().translate(content) assert declared_metadata == result def test_index_content_metadata_npm(storage, obj_storage): """ testing NPM with package.json - one sha1 uses a file that can't be translated to metadata and should return None in the translated metadata """ sha1s = [ MAPPING_DESCRIPTION_CONTENT_SHA1["json:test-metadata-package.json"], MAPPING_DESCRIPTION_CONTENT_SHA1["json:npm-package.json"], MAPPING_DESCRIPTION_CONTENT_SHA1["python:code"], ] # this metadata indexer computes only metadata for package.json # in npm context with a hard mapping config = BASE_TEST_CONFIG.copy() config["tools"] = [TRANSLATOR_TOOL] metadata_indexer = ContentMetadataTestIndexer(config=config) metadata_indexer.run(sha1s, log_suffix="unknown content") results = list(metadata_indexer.idx_storage.content_metadata_get(sha1s)) expected_results = [ ContentMetadataRow( id=sha1s[0], tool=TRANSLATOR_TOOL, metadata={ "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "codeRepository": "git+https://github.com/moranegg/metadata_test", "description": "Simple package.json test for indexer", "name": "test_metadata", "version": "0.0.1", }, ), ContentMetadataRow( id=sha1s[1], tool=TRANSLATOR_TOOL, metadata={ "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "SoftwareSourceCode", "issueTracker": "https://github.com/npm/npm/issues", "author": [ { "type": "Person", "name": "Isaac Z. 
Schlueter", "email": "i@izs.me", "url": "http://blog.izs.me", } ], "codeRepository": "git+https://github.com/npm/npm", "description": "a package manager for JavaScript", "license": "https://spdx.org/licenses/Artistic-2.0", "version": "5.0.3", "name": "npm", "url": "https://docs.npmjs.com/", }, ), ] for result in results: del result.tool["id"] result.metadata.pop("keywords", None) # The assertion below returns False sometimes because of nested lists assert expected_results == results def test_npm_null_list_item_normalization(): package_json = b"""{ "name": "foo", "keywords": [ "foo", null ], "homepage": [ "http://example.org/", null ] }""" result = MAPPINGS["NpmMapping"]().translate(package_json) assert result == { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "foo", "type": "SoftwareSourceCode", "url": "http://example.org/", "keywords": "foo", } def test_npm_bugs_normalization(): # valid dictionary package_json = b"""{ "name": "foo", "bugs": { "url": "https://github.com/owner/project/issues", "email": "foo@example.com" } }""" result = MAPPINGS["NpmMapping"]().translate(package_json) assert result == { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "foo", "issueTracker": "https://github.com/owner/project/issues", "type": "SoftwareSourceCode", } # "invalid" dictionary package_json = b"""{ "name": "foo", "bugs": { "email": "foo@example.com" } }""" result = MAPPINGS["NpmMapping"]().translate(package_json) assert result == { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "foo", "type": "SoftwareSourceCode", } # string package_json = b"""{ "name": "foo", "bugs": "https://github.com/owner/project/issues" }""" result = MAPPINGS["NpmMapping"]().translate(package_json) assert result == { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "foo", "issueTracker": "https://github.com/owner/project/issues", "type": "SoftwareSourceCode", } def test_npm_repository_normalization(): # normal package_json = b"""{ "name": "foo", "repository": { "type" : "git", "url" : "https://github.com/npm/cli.git" } }""" result = MAPPINGS["NpmMapping"]().translate(package_json) assert result == { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "foo", "codeRepository": "git+https://github.com/npm/cli.git", "type": "SoftwareSourceCode", } # missing url package_json = b"""{ "name": "foo", "repository": { "type" : "git" } }""" result = MAPPINGS["NpmMapping"]().translate(package_json) assert result == { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "foo", "type": "SoftwareSourceCode", } # github shortcut package_json = b"""{ "name": "foo", "repository": "github:npm/cli" }""" result = MAPPINGS["NpmMapping"]().translate(package_json) expected_result = { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "foo", "codeRepository": "git+https://github.com/npm/cli.git", "type": "SoftwareSourceCode", } assert result == expected_result # github shortshortcut package_json = b"""{ "name": "foo", "repository": "npm/cli" }""" result = MAPPINGS["NpmMapping"]().translate(package_json) assert result == expected_result # gitlab shortcut package_json = b"""{ "name": "foo", "repository": "gitlab:user/repo" }""" result = MAPPINGS["NpmMapping"]().translate(package_json) assert result == { "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "name": "foo", "codeRepository": "git+https://gitlab.com/user/repo.git", "type": "SoftwareSourceCode", } +def test_npm_invalid_uris(): + package_json = rb"""{ + "version": 
"1.0.0", + "homepage": "", + "author": { + "name": "foo", + "url": "http://example.org" + } +}""" + result = MAPPINGS["NpmMapping"]().translate(package_json) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + "author": [{"name": "foo", "type": "Person", "url": "http://example.org"}], + "version": "1.0.0", + } + + package_json = rb"""{ + "version": "1.0.0", + "homepage": "http://example.org", + "author": { + "name": "foo", + "url": "" + } +}""" + result = MAPPINGS["NpmMapping"]().translate(package_json) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + "author": [{"name": "foo", "type": "Person"}], + "url": "http://example.org", + "version": "1.0.0", + } + + package_json = rb"""{ + "version": "1.0.0", + "homepage": "", + "author": { + "name": "foo", + "url": "" + } +}""" + result = MAPPINGS["NpmMapping"]().translate(package_json) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + "author": [{"name": "foo", "type": "Person"}], + "version": "1.0.0", + } + + package_json = rb"""{ + "version": "1.0.0", + "homepage": "http:example.org", + "author": { + "name": "foo", + "url": "http:example.com" + } +}""" + result = MAPPINGS["NpmMapping"]().translate(package_json) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + "author": [{"name": "foo", "type": "Person"}], + "version": "1.0.0", + } + + +def test_npm_invalid_licenses(): + package_json = rb"""{ + "version": "1.0.0", + "license": "SEE LICENSE IN LICENSE.md", + "author": { + "name": "foo", + "url": "http://example.org" + } +}""" + result = MAPPINGS["NpmMapping"]().translate(package_json) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + "author": [{"name": "foo", "type": "Person", "url": "http://example.org"}], + "version": "1.0.0", + } + + @settings(suppress_health_check=[HealthCheck.too_slow]) @given(json_document_strategy(keys=list(MAPPINGS["NpmMapping"].mapping))) # type: ignore def test_npm_adversarial(doc): raw = json.dumps(doc).encode() MAPPINGS["NpmMapping"]().translate(raw) @pytest.mark.parametrize( "filename", [b"package.json", b"Package.json", b"PACKAGE.json", b"PACKAGE.JSON"] ) def test_detect_metadata_package_json(filename): df = [ { "sha1_git": b"abc", "name": b"index.js", "target": b"abc", "length": 897, "status": "visible", "type": "file", "perms": 33188, "dir_id": b"dir_a", "sha1": b"bcd", }, { "sha1_git": b"aab", "name": filename, "target": b"aab", "length": 712, "status": "visible", "type": "file", "perms": 33188, "dir_id": b"dir_a", "sha1": b"cde", }, ] results = detect_metadata(df) expected_results = {"NpmMapping": [b"cde"]} assert expected_results == results diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py index 20c49c0..3ba7ad8 100644 --- a/swh/indexer/tests/test_metadata.py +++ b/swh/indexer/tests/test_metadata.py @@ -1,283 +1,312 @@ # Copyright (C) 2017-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime from unittest.mock import call import attr from swh.indexer.metadata import ( ContentMetadataIndexer, DirectoryMetadataIndexer, ExtrinsicMetadataIndexer, ) from 
swh.indexer.storage.model import ( ContentMetadataRow, DirectoryIntrinsicMetadataRow, OriginExtrinsicMetadataRow, ) from swh.indexer.tests.utils import DIRECTORY2 from swh.model.model import ( Directory, DirectoryEntry, MetadataAuthority, MetadataAuthorityType, MetadataFetcher, RawExtrinsicMetadata, ) from swh.model.swhids import ExtendedObjectType, ExtendedSWHID from .utils import ( BASE_TEST_CONFIG, MAPPING_DESCRIPTION_CONTENT_SHA1, MAPPING_DESCRIPTION_CONTENT_SHA1GIT, YARN_PARSER_METADATA, fill_obj_storage, fill_storage, ) TRANSLATOR_TOOL = { "name": "swh-metadata-translator", "version": "0.0.2", "configuration": {"type": "local", "context": "NpmMapping"}, } class ContentMetadataTestIndexer(ContentMetadataIndexer): """Specific Metadata whose configuration is enough to satisfy the indexing tests. """ def parse_config_file(self, *args, **kwargs): assert False, "should not be called; the dir indexer configures it." DIRECTORY_METADATA_CONFIG = { **BASE_TEST_CONFIG, "tools": TRANSLATOR_TOOL, } REMD = RawExtrinsicMetadata( target=ExtendedSWHID( object_type=ExtendedObjectType.ORIGIN, object_id=b"\x01" * 20, ), discovery_date=datetime.datetime.now(tz=datetime.timezone.utc), authority=MetadataAuthority( type=MetadataAuthorityType.FORGE, url="https://example.org/", ), fetcher=MetadataFetcher( name="example-fetcher", version="1.0.0", ), format="application/vnd.github.v3+json", metadata=b'{"full_name": "test software"}', ) class TestMetadata: """ Tests metadata_mock_tool tool for Metadata detection """ def test_directory_metadata_indexer(self): metadata_indexer = DirectoryMetadataIndexer(config=DIRECTORY_METADATA_CONFIG) fill_obj_storage(metadata_indexer.objstorage) fill_storage(metadata_indexer.storage) tool = metadata_indexer.idx_storage.indexer_configuration_get( {f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()} ) assert tool is not None dir_ = DIRECTORY2 assert ( dir_.entries[0].target == MAPPING_DESCRIPTION_CONTENT_SHA1GIT["json:yarn-parser-package.json"] ) metadata_indexer.idx_storage.content_metadata_add( [ ContentMetadataRow( id=MAPPING_DESCRIPTION_CONTENT_SHA1[ "json:yarn-parser-package.json" ], indexer_configuration_id=tool["id"], metadata=YARN_PARSER_METADATA, ) ] ) metadata_indexer.run([dir_.id]) results = list( metadata_indexer.idx_storage.directory_intrinsic_metadata_get([dir_.id]) ) expected_results = [ DirectoryIntrinsicMetadataRow( id=dir_.id, tool=TRANSLATOR_TOOL, metadata=YARN_PARSER_METADATA, mappings=["npm"], ) ] for result in results: del result.tool["id"] assert results == expected_results def test_directory_metadata_indexer_single_root_dir(self): metadata_indexer = DirectoryMetadataIndexer(config=DIRECTORY_METADATA_CONFIG) fill_obj_storage(metadata_indexer.objstorage) fill_storage(metadata_indexer.storage) # Add a parent directory, that is the only directory at the root # of the directory dir_ = DIRECTORY2 assert ( dir_.entries[0].target == MAPPING_DESCRIPTION_CONTENT_SHA1GIT["json:yarn-parser-package.json"] ) new_dir = Directory( entries=( DirectoryEntry( name=b"foobar-1.0.0", type="dir", target=dir_.id, perms=16384, ), ), ) assert new_dir.id is not None metadata_indexer.storage.directory_add([new_dir]) tool = metadata_indexer.idx_storage.indexer_configuration_get( {f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()} ) assert tool is not None metadata_indexer.idx_storage.content_metadata_add( [ ContentMetadataRow( id=MAPPING_DESCRIPTION_CONTENT_SHA1[ "json:yarn-parser-package.json" ], indexer_configuration_id=tool["id"], metadata=YARN_PARSER_METADATA, ) ] ) 
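        # Indexing the wrapper directory should descend into its single root
        # subdirectory (DIRECTORY2) and record the yarn-parser metadata against
        # new_dir.id, as checked in expected_results below.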
metadata_indexer.run([new_dir.id]) results = list( metadata_indexer.idx_storage.directory_intrinsic_metadata_get([new_dir.id]) ) expected_results = [ DirectoryIntrinsicMetadataRow( id=new_dir.id, tool=TRANSLATOR_TOOL, metadata=YARN_PARSER_METADATA, mappings=["npm"], ) ] for result in results: del result.tool["id"] assert results == expected_results def test_extrinsic_metadata_indexer_unknown_format(self, mocker): """Should be ignored when unknown format""" metadata_indexer = ExtrinsicMetadataIndexer(config=DIRECTORY_METADATA_CONFIG) metadata_indexer.storage = mocker.patch.object(metadata_indexer, "storage") remd = attr.evolve(REMD, format="unknown format") results = metadata_indexer.index(remd.id, data=remd) assert metadata_indexer.storage.method_calls == [] assert results == [] def test_extrinsic_metadata_indexer_github(self, mocker): """Nominal case, calling the mapping and storing the result""" origin = "https://example.org/jdoe/myrepo" metadata_indexer = ExtrinsicMetadataIndexer(config=DIRECTORY_METADATA_CONFIG) metadata_indexer.catch_exceptions = False metadata_indexer.storage = mocker.patch.object(metadata_indexer, "storage") metadata_indexer.storage.origin_get_by_sha1.return_value = [{"url": origin}] tool = metadata_indexer.idx_storage.indexer_configuration_get( {f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()} ) assert tool is not None assert metadata_indexer.process_journal_objects( {"raw_extrinsic_metadata": [REMD.to_dict()]} ) == {"status": "eventful", "origin_extrinsic_metadata:add": 1} assert metadata_indexer.storage.method_calls == [ call.origin_get_by_sha1([b"\x01" * 20]) ] results = list( metadata_indexer.idx_storage.origin_extrinsic_metadata_get([origin]) ) assert results == [ OriginExtrinsicMetadataRow( id="https://example.org/jdoe/myrepo", tool={"id": tool["id"], **TRANSLATOR_TOOL}, metadata={ "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "type": "https://forgefed.org/ns#Repository", "name": "test software", }, from_remd_id=REMD.id, mappings=["github"], ) ] def test_extrinsic_metadata_indexer_nonforge_authority(self, mocker): """Early abort on non-forge authorities""" metadata_indexer = ExtrinsicMetadataIndexer(config=DIRECTORY_METADATA_CONFIG) metadata_indexer.storage = mocker.patch.object(metadata_indexer, "storage") remd = attr.evolve( REMD, authority=attr.evolve(REMD.authority, type=MetadataAuthorityType.REGISTRY), ) results = metadata_indexer.index(remd.id, data=remd) assert metadata_indexer.storage.method_calls == [] assert results == [] def test_extrinsic_metadata_indexer_thirdparty_authority(self, mocker): """Should be ignored when authority URL does not match the origin""" origin = "https://different-domain.example.org/jdoe/myrepo" metadata_indexer = ExtrinsicMetadataIndexer(config=DIRECTORY_METADATA_CONFIG) metadata_indexer.catch_exceptions = False metadata_indexer.storage = mocker.patch.object(metadata_indexer, "storage") metadata_indexer.storage.origin_get_by_sha1.return_value = [{"url": origin}] tool = metadata_indexer.idx_storage.indexer_configuration_get( {f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()} ) assert tool is not None results = metadata_indexer.index(REMD.id, data=REMD) assert metadata_indexer.storage.method_calls == [ call.origin_get_by_sha1([b"\x01" * 20]) ] assert results == [] + + def test_extrinsic_metadata_indexer_duplicate_origin(self, mocker): + """Nominal case, calling the mapping and storing the result""" + origin = "https://example.org/jdoe/myrepo" + + metadata_indexer = 
ExtrinsicMetadataIndexer(config=DIRECTORY_METADATA_CONFIG) + metadata_indexer.catch_exceptions = False + metadata_indexer.storage = mocker.patch.object(metadata_indexer, "storage") + metadata_indexer.storage.origin_get_by_sha1.return_value = [{"url": origin}] + + tool = metadata_indexer.idx_storage.indexer_configuration_get( + {f"tool_{k}": v for (k, v) in TRANSLATOR_TOOL.items()} + ) + assert tool is not None + + assert metadata_indexer.process_journal_objects( + { + "raw_extrinsic_metadata": [ + REMD.to_dict(), + {**REMD.to_dict(), "id": b"\x00" * 20}, + ] + } + ) == {"status": "eventful", "origin_extrinsic_metadata:add": 1} + + results = list( + metadata_indexer.idx_storage.origin_extrinsic_metadata_get([origin]) + ) + assert len(results) == 1, results + assert results[0].from_remd_id == b"\x00" * 20 diff --git a/swh/indexer/tests/test_origin_metadata.py b/swh/indexer/tests/test_origin_metadata.py index 567f479..4b7057e 100644 --- a/swh/indexer/tests/test_origin_metadata.py +++ b/swh/indexer/tests/test_origin_metadata.py @@ -1,356 +1,409 @@ -# Copyright (C) 2018-2020 The Software Heritage developers +# Copyright (C) 2018-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import copy from unittest.mock import patch +import attr import pytest from swh.indexer.metadata import OriginMetadataIndexer from swh.indexer.storage.interface import IndexerStorageInterface from swh.indexer.storage.model import ( DirectoryIntrinsicMetadataRow, OriginIntrinsicMetadataRow, ) from swh.model.model import Origin from swh.storage.interface import StorageInterface from .test_metadata import TRANSLATOR_TOOL from .utils import DIRECTORY2, YARN_PARSER_METADATA @pytest.fixture def swh_indexer_config(swh_indexer_config): """Override the default configuration to override the tools entry""" cfg = copy.deepcopy(swh_indexer_config) cfg["tools"] = TRANSLATOR_TOOL return cfg def test_origin_metadata_indexer_release( swh_indexer_config, idx_storage: IndexerStorageInterface, storage: StorageInterface, obj_storage, ) -> None: indexer = OriginMetadataIndexer(config=swh_indexer_config) origin = "https://npm.example.org/yarn-parser" indexer.run([origin]) tool = swh_indexer_config["tools"] dir_id = DIRECTORY2.id dir_metadata = DirectoryIntrinsicMetadataRow( id=dir_id, tool=tool, metadata=YARN_PARSER_METADATA, mappings=["npm"], ) origin_metadata = OriginIntrinsicMetadataRow( id=origin, tool=tool, from_directory=dir_id, metadata=YARN_PARSER_METADATA, mappings=["npm"], ) dir_results = list(idx_storage.directory_intrinsic_metadata_get([dir_id])) for dir_result in dir_results: assert dir_result.tool del dir_result.tool["id"] assert dir_results == [dir_metadata] orig_results = list(idx_storage.origin_intrinsic_metadata_get([origin])) for orig_result in orig_results: assert orig_result.tool del orig_result.tool["id"] assert orig_results == [origin_metadata] def test_origin_metadata_indexer_revision( swh_indexer_config, idx_storage: IndexerStorageInterface, storage: StorageInterface, obj_storage, ) -> None: indexer = OriginMetadataIndexer(config=swh_indexer_config) origin = "https://github.com/librariesio/yarn-parser" indexer.run([origin]) tool = swh_indexer_config["tools"] dir_id = DIRECTORY2.id dir_metadata = DirectoryIntrinsicMetadataRow( id=dir_id, tool=tool, metadata=YARN_PARSER_METADATA, mappings=["npm"], ) origin_metadata = OriginIntrinsicMetadataRow( 
id=origin, tool=tool, from_directory=dir_id, metadata=YARN_PARSER_METADATA, mappings=["npm"], ) dir_results = list(idx_storage.directory_intrinsic_metadata_get([dir_id])) for dir_result in dir_results: assert dir_result.tool del dir_result.tool["id"] assert dir_results == [dir_metadata] orig_results = list(idx_storage.origin_intrinsic_metadata_get([origin])) for orig_result in orig_results: assert orig_result.tool del orig_result.tool["id"] assert orig_results == [origin_metadata] def test_origin_metadata_indexer_duplicate_origin( swh_indexer_config, idx_storage: IndexerStorageInterface, storage: StorageInterface, obj_storage, ) -> None: indexer = OriginMetadataIndexer(config=swh_indexer_config) indexer.storage = storage indexer.idx_storage = idx_storage indexer.run(["https://github.com/librariesio/yarn-parser"]) indexer.run(["https://github.com/librariesio/yarn-parser"] * 2) origin = "https://github.com/librariesio/yarn-parser" dir_id = DIRECTORY2.id dir_results = list(indexer.idx_storage.directory_intrinsic_metadata_get([dir_id])) assert len(dir_results) == 1 orig_results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin])) assert len(orig_results) == 1 def test_origin_metadata_indexer_missing_head( swh_indexer_config, idx_storage: IndexerStorageInterface, storage: StorageInterface, obj_storage, ) -> None: storage.origin_add([Origin(url="https://example.com")]) indexer = OriginMetadataIndexer(config=swh_indexer_config) indexer.run(["https://example.com"]) origin = "https://example.com" results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin])) assert results == [] def test_origin_metadata_indexer_partial_missing_head( swh_indexer_config, idx_storage: IndexerStorageInterface, storage: StorageInterface, obj_storage, ) -> None: origin1 = "https://example.com" origin2 = "https://github.com/librariesio/yarn-parser" storage.origin_add([Origin(url=origin1)]) indexer = OriginMetadataIndexer(config=swh_indexer_config) indexer.run([origin1, origin2]) dir_id = DIRECTORY2.id dir_results = list(indexer.idx_storage.directory_intrinsic_metadata_get([dir_id])) assert dir_results == [ DirectoryIntrinsicMetadataRow( id=dir_id, metadata=YARN_PARSER_METADATA, mappings=["npm"], tool=dir_results[0].tool, ) ] orig_results = list( indexer.idx_storage.origin_intrinsic_metadata_get([origin1, origin2]) ) for orig_result in orig_results: assert orig_results == [ OriginIntrinsicMetadataRow( id=origin2, from_directory=dir_id, metadata=YARN_PARSER_METADATA, mappings=["npm"], tool=orig_results[0].tool, ) ] def test_origin_metadata_indexer_duplicate_directory( swh_indexer_config, idx_storage: IndexerStorageInterface, storage: StorageInterface, obj_storage, ) -> None: indexer = OriginMetadataIndexer(config=swh_indexer_config) indexer.storage = storage indexer.idx_storage = idx_storage indexer.catch_exceptions = False origin1 = "https://github.com/librariesio/yarn-parser" origin2 = "https://github.com/librariesio/yarn-parser.git" indexer.run([origin1, origin2]) dir_id = DIRECTORY2.id dir_results = list(indexer.idx_storage.directory_intrinsic_metadata_get([dir_id])) assert len(dir_results) == 1 orig_results = list( indexer.idx_storage.origin_intrinsic_metadata_get([origin1, origin2]) ) assert len(orig_results) == 2 +def test_origin_metadata_indexer_duplicate_directory_different_result( + swh_indexer_config, + idx_storage: IndexerStorageInterface, + storage: StorageInterface, + obj_storage, + mocker, +) -> None: + """Same as above, but indexing the same directory twice resulted in different + 
data (because list order differs). + """ + indexer = OriginMetadataIndexer(config=swh_indexer_config) + indexer.storage = storage + indexer.idx_storage = idx_storage + indexer.catch_exceptions = False + origin1 = "https://github.com/librariesio/yarn-parser" + origin2 = "https://github.com/librariesio/yarn-parser.git" + + directory_index = indexer.directory_metadata_indexer.index + + nb_calls = 0 + + def side_effect(dir_id): + nonlocal nb_calls + if nb_calls == 0: + keywords = ["foo", "bar"] + elif nb_calls == 1: + keywords = ["bar", "foo"] + else: + assert False, nb_calls + nb_calls += 1 + return [ + attr.evolve(row, metadata={**row.metadata, "keywords": keywords}) + for row in directory_index(dir_id) + ] + + mocker.patch.object( + indexer.directory_metadata_indexer, "index", side_effect=side_effect + ) + + indexer.run([origin1, origin2]) + + dir_id = DIRECTORY2.id + + dir_results = list(indexer.idx_storage.directory_intrinsic_metadata_get([dir_id])) + assert len(dir_results) == 1 + + orig_results = list( + indexer.idx_storage.origin_intrinsic_metadata_get([origin1, origin2]) + ) + assert len(orig_results) == 2 + + def test_origin_metadata_indexer_no_metadata_file( swh_indexer_config, idx_storage: IndexerStorageInterface, storage: StorageInterface, obj_storage, ) -> None: indexer = OriginMetadataIndexer(config=swh_indexer_config) origin = "https://github.com/librariesio/yarn-parser" with patch("swh.indexer.metadata_dictionary.npm.NpmMapping.filename", b"foo.json"): indexer.run([origin]) dir_id = DIRECTORY2.id dir_results = list(indexer.idx_storage.directory_intrinsic_metadata_get([dir_id])) assert dir_results == [] orig_results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin])) assert orig_results == [] def test_origin_metadata_indexer_no_metadata( swh_indexer_config, idx_storage: IndexerStorageInterface, storage: StorageInterface, obj_storage, ) -> None: indexer = OriginMetadataIndexer(config=swh_indexer_config) origin = "https://github.com/librariesio/yarn-parser" with patch( "swh.indexer.metadata.DirectoryMetadataIndexer" ".translate_directory_intrinsic_metadata", return_value=(["npm"], {"@context": "foo"}), ): indexer.run([origin]) dir_id = DIRECTORY2.id dir_results = list(indexer.idx_storage.directory_intrinsic_metadata_get([dir_id])) assert dir_results == [] orig_results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin])) assert orig_results == [] @pytest.mark.parametrize("catch_exceptions", [True, False]) def test_origin_metadata_indexer_directory_error( swh_indexer_config, idx_storage: IndexerStorageInterface, storage: StorageInterface, obj_storage, sentry_events, catch_exceptions, ) -> None: indexer = OriginMetadataIndexer(config=swh_indexer_config) origin = "https://github.com/librariesio/yarn-parser" indexer.catch_exceptions = catch_exceptions with patch( "swh.indexer.metadata.DirectoryMetadataIndexer" ".translate_directory_intrinsic_metadata", return_value=None, ): indexer.run([origin]) assert len(sentry_events) == 1 sentry_event = sentry_events.pop() assert sentry_event.get("tags") == { "swh-indexer-origin-head-swhid": ( "swh:1:rev:a78410ce2f78f5078fd4ee7edb8c82c02a4a712c" ), "swh-indexer-origin-url": origin, } assert "'TypeError'" in str(sentry_event) dir_id = DIRECTORY2.id dir_results = list(indexer.idx_storage.directory_intrinsic_metadata_get([dir_id])) assert dir_results == [] orig_results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin])) assert orig_results == [] @pytest.mark.parametrize("catch_exceptions", [True, False]) 
def test_origin_metadata_indexer_content_exception( swh_indexer_config, idx_storage: IndexerStorageInterface, storage: StorageInterface, obj_storage, sentry_events, catch_exceptions, ) -> None: indexer = OriginMetadataIndexer(config=swh_indexer_config) origin = "https://github.com/librariesio/yarn-parser" indexer.catch_exceptions = catch_exceptions class TestException(Exception): pass with patch( "swh.indexer.metadata.ContentMetadataRow", side_effect=TestException(), ): indexer.run([origin]) assert len(sentry_events) == 1 sentry_event = sentry_events.pop() assert sentry_event.get("tags") == { "swh-indexer-content-sha1": "df9d3bcc0158faa446bd1af225f8e2e4afa576d7", "swh-indexer-origin-head-swhid": ( "swh:1:rev:a78410ce2f78f5078fd4ee7edb8c82c02a4a712c" ), "swh-indexer-origin-url": origin, } assert ".TestException'" in str(sentry_event), sentry_event dir_id = DIRECTORY2.id dir_results = list(indexer.idx_storage.directory_intrinsic_metadata_get([dir_id])) assert dir_results == [] orig_results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin])) assert orig_results == [] def test_origin_metadata_indexer_unknown_origin( swh_indexer_config, idx_storage: IndexerStorageInterface, storage: StorageInterface, obj_storage, ) -> None: indexer = OriginMetadataIndexer(config=swh_indexer_config) result = indexer.index_list([Origin("https://unknown.org/foo")]) assert not result
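The mapping tests above drive the MAPPINGS[...]().translate() entry point directly. As a minimal, illustrative sketch (not part of this diff; it assumes only the MAPPINGS registry and the behaviour asserted in test_npm.py above), this is how the 2.6.0 handling of blatantly invalid npm licenses and URLs looks from calling code:

from swh.indexer.metadata_dictionary import MAPPINGS

# A package.json using the "SEE LICENSE IN <file>" placeholder and an empty
# homepage; both values are ignored by the npm mapping, as asserted in
# test_npm_invalid_licenses and test_npm_invalid_uris.
package_json = b"""{
    "name": "foo",
    "version": "1.0.0",
    "license": "SEE LICENSE IN LICENSE.md",
    "homepage": ""
}"""

result = MAPPINGS["NpmMapping"]().translate(package_json)
assert "license" not in result       # placeholder license dropped
assert "url" not in result           # "homepage" maps to "url" only when valid
assert result["name"] == "foo"
assert result["version"] == "1.0.0"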