diff --git a/PKG-INFO b/PKG-INFO
index da1d5fb..838ac17 100644
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,69 +1,69 @@
 Metadata-Version: 2.1
 Name: swh.indexer
-Version: 0.0.58
+Version: 0.0.59
 Summary: Software Heritage Content Indexer
 Home-page: https://forge.softwareheritage.org/diffusion/78/
 Author: Software Heritage developers
 Author-email: swh-devel@inria.fr
 License: UNKNOWN
 Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest
 Project-URL: Funding, https://www.softwareheritage.org/donate
 Project-URL: Source, https://forge.softwareheritage.org/source/swh-indexer
 Description: swh-indexer
         ============
         
         Tools to compute multiple indexes on SWH's raw contents:
         - content:
           - mimetype
           - ctags
           - language
           - fossology-license
           - metadata
         - revision:
           - metadata
         
         An indexer is in charge of:
         - looking up objects
         - extracting information from those objects
         - store those information in the swh-indexer db
         
         There are multiple indexers working on different object types:
           - content indexer: works with content sha1 hashes
           - revision indexer: works with revision sha1 hashes
           - origin indexer: works with origin identifiers
         
         Indexation procedure:
         - receive batch of ids
         - retrieve the associated data depending on object type
         - compute for that object some index
         - store the result to swh's storage
         
         Current content indexers:
         
         - mimetype (queue swh_indexer_content_mimetype): detect the encoding
           and mimetype
         
         - language (queue swh_indexer_content_language): detect the
           programming language
         
         - ctags (queue swh_indexer_content_ctags): compute tags information
         
         - fossology-license (queue swh_indexer_fossology_license): compute the
           license
         
         - metadata: translate file into translated_metadata dict
         
         Current revision indexers:
         
         - metadata: detects files containing metadata and retrieves translated_metadata
           in content_metadata table in storage or run content indexer to translate
           files.
         
 Platform: UNKNOWN
 Classifier: Programming Language :: Python :: 3
 Classifier: Intended Audience :: Developers
 Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
 Classifier: Operating System :: OS Independent
 Classifier: Development Status :: 5 - Production/Stable
 Description-Content-Type: text/markdown
 Provides-Extra: testing
diff --git a/debian/changelog b/debian/changelog
index ef9a547..f43431e 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,469 +1,473 @@
-swh-indexer (0.0.58-1~swh1~bpo9+1) stretch-swh; urgency=medium
+swh-indexer (0.0.59-1~swh1) unstable-swh; urgency=medium
 
-  * Rebuild for stretch-backports.
+  * v0.0.59
+  * fossology license: Fix issue on license computation
+  * Improve docstrings
+  * Fix pep8 violations
+  * Increase coverage on content indexers
 
- -- Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com>  Tue, 20 Nov 2018 12:06:56 +0100
+ -- Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com>  Tue, 20 Nov 2018 14:27:20 +0100
 
 swh-indexer (0.0.58-1~swh1) unstable-swh; urgency=medium
 
   * v0.0.58
   * Add missing default configuration for fossology license indexer
   * tests: Remove dead code
 
  -- Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com>  Tue, 20 Nov 2018 12:06:56 +0100
 
 swh-indexer (0.0.57-1~swh1) unstable-swh; urgency=medium
 
   * v0.0.57
   * storage: Open new endpoint on fossology license range retrieval
   * indexer: Open new fossology license range indexer
 
  -- Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com>  Tue, 20 Nov 2018 11:44:57 +0100
 
 swh-indexer (0.0.56-1~swh1) unstable-swh; urgency=medium
 
   * v0.0.56
   * storage.api: Open new endpoints (mimetype range, fossology range)
   * content indexers: Open mimetype and fossology range indexers
   * Remove orchestrator modules
   * tests: Improve coverage
 
  -- Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com>  Mon, 19 Nov 2018 11:56:06 +0100
 
 swh-indexer (0.0.55-1~swh1) unstable-swh; urgency=medium
 
   * v0.0.55
   * swh.indexer: Let task reschedule itself through the scheduler
   * Use swh.scheduler instead of celery leaking all around
   * swh.indexer.orchestrator: Fix orchestrator initialization step
   * swh.indexer.tasks: Fix type error when no result or list result
 
  -- Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com>  Mon, 29 Oct 2018 10:41:54 +0100
 
 swh-indexer (0.0.54-1~swh1) unstable-swh; urgency=medium
 
   * v0.0.54
   * swh.indexer.tasks: Fix task to use the scheduler's
 
  -- Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com>  Thu, 25 Oct 2018 20:13:51 +0200
 
 swh-indexer (0.0.53-1~swh1) unstable-swh; urgency=medium
 
   * v0.0.53
   * swh.indexer.rehash: Migrate to latest swh.model.hashutil.MultiHash
   * indexer: Add the origin intrinsic metadata indexer
   * indexer: Add OriginIndexer and OriginHeadIndexer.
   * indexer.storage: Add the origin intrinsic metadata storage database
   * indexer.storage: Autogenerate the Indexer Storage HTTP API.
   * setup: prepare for pypi upload
   * tests: Add a tox file
   * tests: migrate to pytest
   * tests: Add tests around celery stack
   * docs: Improve documentation and reuse README in generated
     documentation
 
  -- Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com>  Thu, 25 Oct 2018 19:03:56 +0200
 
 swh-indexer (0.0.52-1~swh1) unstable-swh; urgency=medium
 
   * v0.0.52
   * swh.indexer.storage: Refactor fossology license get (first external
   * contribution, cf. /CONTRIBUTORS)
   * swh.indexer.storage: Fix typo in invariable name metadata
   * swh.indexer.storage: No longer use temp table when reading data
   * swh.indexer.storage: Clean up unused import
   * swh.indexer.storage: Remove dead entry points origin_metadata*
   * swh.indexer.storage: Update docstrings information and format
 
  -- Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com>  Wed, 13 Jun 2018 11:20:40 +0200
 
 swh-indexer (0.0.51-1~swh1) unstable-swh; urgency=medium
 
   * Release swh.indexer v0.0.51
   * Update for new db_transaction{,_generator}
 
  -- Nicolas Dandrimont <nicolas@dandrimont.eu>  Tue, 05 Jun 2018 14:10:39 +0200
 
 swh-indexer (0.0.50-1~swh1) unstable-swh; urgency=medium
 
   * v0.0.50
   * swh.indexer.api.client: Permit to specify the query timeout option
 
  -- Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com>  Thu, 24 May 2018 12:19:06 +0200
 
 swh-indexer (0.0.49-1~swh1) unstable-swh; urgency=medium
 
   * v0.0.49
   * test_storage: Instantiate the tools during tests' setUp phase
   * test_storage: Deallocate storage during teardown step
   * test_storage: Make storage test fixture connect to postgres itself
   * storage.api.server: Only instantiate storage backend once per import
   * Use thread-aware psycopg2 connection pooling for database access
 
  -- Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com>  Mon, 14 May 2018 11:09:30 +0200
 
 swh-indexer (0.0.48-1~swh1) unstable-swh; urgency=medium
 
   * Release swh.indexer v0.0.48
   * Update for new swh.storage
 
  -- Nicolas Dandrimont <nicolas@dandrimont.eu>  Sat, 12 May 2018 18:30:10 +0200
 
 swh-indexer (0.0.47-1~swh1) unstable-swh; urgency=medium
 
   * v0.0.47
   * d/control: Fix runtime typo in packaging dependency
 
  -- Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com>  Thu, 07 Dec 2017 16:54:49 +0100
 
 swh-indexer (0.0.46-1~swh1) unstable-swh; urgency=medium
 
   * v0.0.46
   * Split swh-indexer packages in 2 python3-swh.indexer.storage and
   * python3-swh.indexer
 
  -- Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com>  Thu, 07 Dec 2017 16:18:04 +0100
 
 swh-indexer (0.0.45-1~swh1) unstable-swh; urgency=medium
 
   * v0.0.45
   * Fix usual error raised when deploying
 
  -- Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com>  Thu, 07 Dec 2017 15:01:01 +0100
 
 swh-indexer (0.0.44-1~swh1) unstable-swh; urgency=medium
 
   * v0.0.44
   * swh.indexer: Make indexer use their own storage
 
  -- Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com>  Thu, 07 Dec 2017 13:20:44 +0100
 
 swh-indexer (0.0.43-1~swh1) unstable-swh; urgency=medium
 
   * v0.0.43
   * swh.indexer.mimetype: Work around problem in detection
 
  -- Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com>  Wed, 29 Nov 2017 10:26:11 +0100
 
 swh-indexer (0.0.42-1~swh1) unstable-swh; urgency=medium
 
   * v0.0.42
   * swh.indexer: Make indexers register tools in prepare method
 
  -- Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com>  Fri, 24 Nov 2017 11:26:03 +0100
 
 swh-indexer (0.0.41-1~swh1) unstable-swh; urgency=medium
 
   * v0.0.41
   * mimetype: Use magic library api instead of parsing `file` cli output
 
  -- Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com>  Mon, 20 Nov 2017 13:05:29 +0100
 
 swh-indexer (0.0.39-1~swh1) unstable-swh; urgency=medium
 
   * v0.0.39
   * swh.indexer.producer: Fix argument to match the abstract definition
 
  -- Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com>  Thu, 19 Oct 2017 10:03:44 +0200
 
 swh-indexer (0.0.38-1~swh1) unstable-swh; urgency=medium
 
   * v0.0.38
   * swh.indexer.indexer: Fix argument to match the abstract definition
 
  -- Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com>  Wed, 18 Oct 2017 19:57:47 +0200
 
 swh-indexer (0.0.37-1~swh1) unstable-swh; urgency=medium
 
   * v0.0.37
   * swh.indexer.indexer: Fix argument to match the abstract definition
 
  -- Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com>  Wed, 18 Oct 2017 18:59:42 +0200
 
 swh-indexer (0.0.36-1~swh1) unstable-swh; urgency=medium
 
   * v0.0.36
   * packaging: Cleanup
   * codemeta: Adding codemeta.json file to document metadata
   * swh.indexer.mimetype: Fix edge case regarding empty raw content
   * docs: sanitize docstrings for sphinx documentation generation
   * swh.indexer.metadata: Add RevisionMetadataIndexer
   * swh.indexer.metadata: Add ContentMetadataIndexer
   * swh.indexer: Refactor base class to improve inheritance
   * swh.indexer.metadata: First draft of the metadata content indexer
   * for npm (package.json)
   * swh.indexer.tests: Added tests for language indexer
 
  -- Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com>  Wed, 18 Oct 2017 16:24:24 +0200
 
 swh-indexer (0.0.35-1~swh1) unstable-swh; urgency=medium
 
   * Release swh.indexer 0.0.35
   * Update tasks to new swh.scheduler API
 
  -- Nicolas Dandrimont <nicolas@dandrimont.eu>  Mon, 12 Jun 2017 18:02:04 +0200
 
 swh-indexer (0.0.34-1~swh1) unstable-swh; urgency=medium
 
   * v0.0.34
   * Fix unbound local error on edge case
 
  -- Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com>  Wed, 07 Jun 2017 11:23:29 +0200
 
 swh-indexer (0.0.33-1~swh1) unstable-swh; urgency=medium
 
   * v0.0.33
   * language indexer: Improve edge case policy
 
  -- Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com>  Wed, 07 Jun 2017 11:02:47 +0200
 
 swh-indexer (0.0.32-1~swh1) unstable-swh; urgency=medium
 
   * v0.0.32
   * Update fossology license to use the latest swh-storage
   * Improve language indexer to deal with potential error on bad
   * chunking
 
  -- Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com>  Tue, 06 Jun 2017 18:13:40 +0200
 
 swh-indexer (0.0.31-1~swh1) unstable-swh; urgency=medium
 
   * v0.0.31
   * Reduce log verbosity on language indexer
 
  -- Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com>  Fri, 02 Jun 2017 19:08:52 +0200
 
 swh-indexer (0.0.30-1~swh1) unstable-swh; urgency=medium
 
   * v0.0.30
   * Fix wrong default configuration
 
  -- Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com>  Fri, 02 Jun 2017 18:01:27 +0200
 
 swh-indexer (0.0.29-1~swh1) unstable-swh; urgency=medium
 
   * v0.0.29
   * Update indexer to resolve indexer configuration identifier
   * Adapt language indexer to use partial raw content
 
  -- Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com>  Fri, 02 Jun 2017 16:21:27 +0200
 
 swh-indexer (0.0.28-1~swh1) unstable-swh; urgency=medium
 
   * v0.0.28
   * Add error resilience to fossology indexer
 
  -- Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com>  Mon, 22 May 2017 12:57:55 +0200
 
 swh-indexer (0.0.27-1~swh1) unstable-swh; urgency=medium
 
   * v0.0.27
   * swh.indexer.language: Incremental encoding detection
 
  -- Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com>  Wed, 17 May 2017 18:04:27 +0200
 
 swh-indexer (0.0.26-1~swh1) unstable-swh; urgency=medium
 
   * v0.0.26
   * swh.indexer.orchestrator: Add batch size option per indexer
   * Log caught exception in a unified manner
   * Add rescheduling option (not by default) on rehash + indexers
 
  -- Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com>  Wed, 17 May 2017 14:08:07 +0200
 
 swh-indexer (0.0.25-1~swh1) unstable-swh; urgency=medium
 
   * v0.0.25
   * Add reschedule on error parameter for indexers
 
  -- Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com>  Fri, 12 May 2017 12:13:15 +0200
 
 swh-indexer (0.0.24-1~swh1) unstable-swh; urgency=medium
 
   * v0.0.24
   * Make rehash indexer more resilient to errors by rescheduling
     contents
   * in error (be it reading or updating problems)
 
  -- Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com>  Thu, 04 May 2017 14:22:43 +0200
 
 swh-indexer (0.0.23-1~swh1) unstable-swh; urgency=medium
 
   * v0.0.23
   * Improve producer to optionally make it synchroneous
 
  -- Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com>  Wed, 03 May 2017 15:29:44 +0200
 
 swh-indexer (0.0.22-1~swh1) unstable-swh; urgency=medium
 
   * v0.0.22
   * Improve mimetype indexer implementation
   * Make the chaining option in the mimetype indexer
 
  -- Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com>  Tue, 02 May 2017 16:31:14 +0200
 
 swh-indexer (0.0.21-1~swh1) unstable-swh; urgency=medium
 
   * v0.0.21
   * swh.indexer.rehash: Actually make the worker log
 
  -- Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com>  Tue, 02 May 2017 14:28:55 +0200
 
 swh-indexer (0.0.20-1~swh1) unstable-swh; urgency=medium
 
   * v0.0.20
   * swh.indexer.rehash:
   * Improve reading from objstorage only when needed
   * Fix empty file use case (which was skipped)
   * Add logging
 
  -- Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com>  Fri, 28 Apr 2017 09:39:09 +0200
 
 swh-indexer (0.0.19-1~swh1) unstable-swh; urgency=medium
 
   * v0.0.19
   * Fix rehash indexer's default configuration file
 
  -- Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com>  Thu, 27 Apr 2017 19:17:20 +0200
 
 swh-indexer (0.0.18-1~swh1) unstable-swh; urgency=medium
 
   * v0.0.18
   * Add new rehash indexer
 
  -- Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com>  Wed, 26 Apr 2017 15:23:02 +0200
 
 swh-indexer (0.0.17-1~swh1) unstable-swh; urgency=medium
 
   * v0.0.17
   * Add information on indexer tools (T610)
 
  -- Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com>  Fri, 02 Dec 2016 18:32:54 +0100
 
 swh-indexer (0.0.16-1~swh1) unstable-swh; urgency=medium
 
   * v0.0.16
   * bug fixes
 
  -- Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com>  Tue, 15 Nov 2016 19:31:52 +0100
 
 swh-indexer (0.0.15-1~swh1) unstable-swh; urgency=medium
 
   * v0.0.15
   * Improve message producer
 
  -- Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com>  Tue, 15 Nov 2016 18:16:42 +0100
 
 swh-indexer (0.0.14-1~swh1) unstable-swh; urgency=medium
 
   * v0.0.14
   * Update package dependency on fossology-nomossa
 
  -- Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com>  Tue, 15 Nov 2016 14:13:41 +0100
 
 swh-indexer (0.0.13-1~swh1) unstable-swh; urgency=medium
 
   * v0.0.13
   * Add new license indexer
   * ctags indexer: align behavior with other indexers regarding the
   * conflict update policy
 
  -- Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com>  Mon, 14 Nov 2016 14:13:34 +0100
 
 swh-indexer (0.0.12-1~swh1) unstable-swh; urgency=medium
 
   * v0.0.12
   * Add runtime dependency on universal-ctags
 
  -- Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com>  Fri, 04 Nov 2016 13:59:59 +0100
 
 swh-indexer (0.0.11-1~swh1) unstable-swh; urgency=medium
 
   * v0.0.11
   * Remove dependency on exuberant-ctags
 
  -- Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com>  Thu, 03 Nov 2016 16:13:26 +0100
 
 swh-indexer (0.0.10-1~swh1) unstable-swh; urgency=medium
 
   * v0.0.10
   * Add ctags indexer
 
  -- Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com>  Thu, 20 Oct 2016 16:12:42 +0200
 
 swh-indexer (0.0.9-1~swh1) unstable-swh; urgency=medium
 
   * v0.0.9
   * d/control: Bump dependency to latest python3-swh.storage api
   * mimetype: Use the charset to filter out data
   * orchestrator: Separate 2 distincts orchestrators (one for all
   * contents, one for text contents)
   * mimetype: once index computed, send text contents to text
     orchestrator
 
  -- Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com>  Thu, 13 Oct 2016 15:28:17 +0200
 
 swh-indexer (0.0.8-1~swh1) unstable-swh; urgency=medium
 
   * v0.0.8
   * Separate configuration file per indexer (no need for language)
   * Rename module file_properties to mimetype consistently with other
   * layers
 
  -- Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com>  Sat, 08 Oct 2016 11:46:29 +0200
 
 swh-indexer (0.0.7-1~swh1) unstable-swh; urgency=medium
 
   * v0.0.7
   * Adapt indexer language and mimetype to store result in storage.
   * Clean up obsolete code
 
  -- Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com>  Sat, 08 Oct 2016 10:26:08 +0200
 
 swh-indexer (0.0.6-1~swh1) unstable-swh; urgency=medium
 
   * v0.0.6
   * Fix multiple issues on production
 
  -- Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com>  Fri, 30 Sep 2016 17:00:11 +0200
 
 swh-indexer (0.0.5-1~swh1) unstable-swh; urgency=medium
 
   * v0.0.5
   * Fix debian/control dependency issue
 
  -- Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com>  Fri, 30 Sep 2016 16:06:20 +0200
 
 swh-indexer (0.0.4-1~swh1) unstable-swh; urgency=medium
 
   * v0.0.4
   * Upgrade dependencies issues
 
  -- Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com>  Fri, 30 Sep 2016 16:01:52 +0200
 
 swh-indexer (0.0.3-1~swh1) unstable-swh; urgency=medium
 
   * v0.0.3
   * Add encoding detection
   * Use encoding to improve language detection
   * bypass language detection for binary files
   * bypass ctags for binary files or decoding failure file
 
  -- Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com>  Fri, 30 Sep 2016 12:30:11 +0200
 
 swh-indexer (0.0.2-1~swh1) unstable-swh; urgency=medium
 
   * v0.0.2
   * Provide one possible sha1's name for the multiple tools to ease
   * information extrapolation
   * Fix debian package dependency issue
 
  -- Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com>  Thu, 29 Sep 2016 21:45:44 +0200
 
 swh-indexer (0.0.1-1~swh1) unstable-swh; urgency=medium
 
   * Initial release
   * v0.0.1
   * First implementation on poc
 
  -- Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com>  Wed, 28 Sep 2016 23:40:13 +0200
diff --git a/swh.indexer.egg-info/PKG-INFO b/swh.indexer.egg-info/PKG-INFO
index da1d5fb..838ac17 100644
--- a/swh.indexer.egg-info/PKG-INFO
+++ b/swh.indexer.egg-info/PKG-INFO
@@ -1,69 +1,69 @@
 Metadata-Version: 2.1
 Name: swh.indexer
-Version: 0.0.58
+Version: 0.0.59
 Summary: Software Heritage Content Indexer
 Home-page: https://forge.softwareheritage.org/diffusion/78/
 Author: Software Heritage developers
 Author-email: swh-devel@inria.fr
 License: UNKNOWN
 Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest
 Project-URL: Funding, https://www.softwareheritage.org/donate
 Project-URL: Source, https://forge.softwareheritage.org/source/swh-indexer
 Description: swh-indexer
         ============
         
         Tools to compute multiple indexes on SWH's raw contents:
         - content:
           - mimetype
           - ctags
           - language
           - fossology-license
           - metadata
         - revision:
           - metadata
         
         An indexer is in charge of:
         - looking up objects
         - extracting information from those objects
         - store those information in the swh-indexer db
         
         There are multiple indexers working on different object types:
           - content indexer: works with content sha1 hashes
           - revision indexer: works with revision sha1 hashes
           - origin indexer: works with origin identifiers
         
         Indexation procedure:
         - receive batch of ids
         - retrieve the associated data depending on object type
         - compute for that object some index
         - store the result to swh's storage
         
         Current content indexers:
         
         - mimetype (queue swh_indexer_content_mimetype): detect the encoding
           and mimetype
         
         - language (queue swh_indexer_content_language): detect the
           programming language
         
         - ctags (queue swh_indexer_content_ctags): compute tags information
         
         - fossology-license (queue swh_indexer_fossology_license): compute the
           license
         
         - metadata: translate file into translated_metadata dict
         
         Current revision indexers:
         
         - metadata: detects files containing metadata and retrieves translated_metadata
           in content_metadata table in storage or run content indexer to translate
           files.
         
 Platform: UNKNOWN
 Classifier: Programming Language :: Python :: 3
 Classifier: Intended Audience :: Developers
 Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
 Classifier: Operating System :: OS Independent
 Classifier: Development Status :: 5 - Production/Stable
 Description-Content-Type: text/markdown
 Provides-Extra: testing
diff --git a/swh/indexer/ctags.py b/swh/indexer/ctags.py
index 0679692..492e7c0 100644
--- a/swh/indexer/ctags.py
+++ b/swh/indexer/ctags.py
@@ -1,156 +1,155 @@
 # Copyright (C) 2015-2017  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
-import click
 import subprocess
 import json
 
 from swh.model import hashutil
 
 from .language import compute_language
 from .indexer import ContentIndexer, DiskIndexer
 
 
 # Options used to compute tags
 __FLAGS = [
     '--fields=+lnz',  # +l: language
                       # +n: line number of tag definition
                       # +z: include the symbol's kind (function, variable, ...)
     '--sort=no',      # sort output on tag name
     '--links=no',     # do not follow symlinks
     '--output-format=json',  # outputs in json
 ]
 
 
 def run_ctags(path, lang=None, ctags_command='ctags'):
     """Run ctags on file path with optional language.
 
     Args:
         path: path to the file
         lang: language for that path (optional)
 
     Returns:
         ctags' output
 
     """
     optional = []
     if lang:
         optional = ['--language-force=%s' % lang]
 
     cmd = [ctags_command] + __FLAGS + optional + [path]
     output = subprocess.check_output(cmd, universal_newlines=True)
 
     for symbol in output.split('\n'):
         if not symbol:
             continue
         js_symbol = json.loads(symbol)
         yield {
             'name': js_symbol['name'],
             'kind': js_symbol['kind'],
             'line': js_symbol['line'],
             'lang': js_symbol['language'],
         }
 
 
 class CtagsIndexer(ContentIndexer, DiskIndexer):
     CONFIG_BASE_FILENAME = 'indexer/ctags'
 
     ADDITIONAL_CONFIG = {
         'workdir': ('str', '/tmp/swh/indexer.ctags'),
         'tools': ('dict', {
             'name': 'universal-ctags',
             'version': '~git7859817b',
             'configuration': {
                 'command_line': '''ctags --fields=+lnz --sort=no --links=no '''
                                 '''--output-format=json <filepath>'''
             },
         }),
         'languages': ('dict', {
             'ada': 'Ada',
             'adl': None,
             'agda': None,
             # ...
         })
     }
 
     def prepare(self):
         super().prepare()
         self.working_directory = self.config['workdir']
         self.language_map = self.config['languages']
         self.tool = self.tools[0]
 
     def filter(self, ids):
         """Filter out known sha1s and return only missing ones.
 
         """
         yield from self.idx_storage.content_ctags_missing((
             {
                 'id': sha1,
                 'indexer_configuration_id': self.tool['id'],
             } for sha1 in ids
         ))
 
     def compute_ctags(self, path, lang):
         """Compute ctags on file at path with language lang.
 
         """
         return run_ctags(path, lang=lang)
 
     def index(self, id, data):
         """Index sha1s' content and store result.
 
         Args:
             id (bytes): content's identifier
             data (bytes): raw content in bytes
 
         Returns:
             A dict, representing a content_mimetype, with keys:
               - id (bytes): content's identifier (sha1)
               - ctags ([dict]): ctags list of symbols
 
         """
         lang = compute_language(data, log=self.log)['lang']
 
         if not lang:
             return None
 
         ctags_lang = self.language_map.get(lang)
 
         if not ctags_lang:
             return None
 
         ctags = {
             'id': id,
         }
 
         filename = hashutil.hash_to_hex(id)
         content_path = self.write_to_temp(
             filename=filename,
             data=data)
 
         result = run_ctags(content_path, lang=ctags_lang)
         ctags.update({
             'ctags': list(result),
             'indexer_configuration_id': self.tool['id'],
         })
 
         self.cleanup(content_path)
 
         return ctags
 
     def persist_index_computations(self, results, policy_update):
         """Persist the results in storage.
 
         Args:
             results ([dict]): list of content_mimetype, dict with the
             following keys:
               - id (bytes): content's identifier (sha1)
               - ctags ([dict]): ctags list of symbols
             policy_update ([str]): either 'update-dups' or 'ignore-dups' to
             respectively update duplicates or ignore them
 
         """
         self.idx_storage.content_ctags_add(
             results, conflict_update=(policy_update == 'update-dups'))
diff --git a/swh/indexer/fossology_license.py b/swh/indexer/fossology_license.py
index 37522b9..2e984e4 100644
--- a/swh/indexer/fossology_license.py
+++ b/swh/indexer/fossology_license.py
@@ -1,172 +1,186 @@
 # Copyright (C) 2016-2018  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
-import click
 import subprocess
 
 from swh.model import hashutil
 
 from .indexer import ContentIndexer, ContentRangeIndexer, DiskIndexer
 
 
+def compute_license(path, log=None):
+    """Determine license from file at path.
+
+    Args:
+        path: filepath to determine the license
+
+    Returns:
+        A dict with the following keys:
+        - licenses ([str]): licenses detected in the file at path
+        - path: the input filepath, returned as-is
+
+    """
+    try:
+        properties = subprocess.check_output(['nomossa', path],
+                                             universal_newlines=True)
+        if properties:
+            res = properties.rstrip().split(' contains license(s) ')
+            licenses = res[1].split(',')
+        else:
+            licenses = []
+
+        return {
+            'licenses': licenses,
+            'path': path,
+        }
+    except subprocess.CalledProcessError:
+        if log:
+            from os import path as __path
+            log.exception('Problem during license detection for sha1 %s' %
+                          __path.basename(path))
+        return {
+            'licenses': [],
+            'path': path,
+        }
+
+
 class MixinFossologyLicenseIndexer:
     """Mixin fossology license indexer.
 
     See :class:`ContentFossologyLicenseIndexer` and
     :class:`FossologyLicenseRangeIndexer`
 
     """
     ADDITIONAL_CONFIG = {
         'workdir': ('str', '/tmp/swh/indexer.fossology.license'),
         'tools': ('dict', {
             'name': 'nomos',
             'version': '3.1.0rc2-31-ga2cbb8c',
             'configuration': {
                 'command_line': 'nomossa <filepath>',
             },
         }),
         'write_batch_size': ('int', 1000),
     }
 
     CONFIG_BASE_FILENAME = 'indexer/fossology_license'
 
     def prepare(self):
         super().prepare()
         self.working_directory = self.config['workdir']
         self.tool = self.tools[0]
 
     def compute_license(self, path, log=None):
         """Determine license from file at path.
 
         Args:
             path: filepath to determine the license
 
         Returns:
             A dict with the following keys:
             - licenses ([str]): associated detected licenses to path
             - path (bytes): content filepath
-            - tool (str): tool used to compute the output
 
         """
-        try:
-            properties = subprocess.check_output(['nomossa', path],
-                                                 universal_newlines=True)
-            if properties:
-                res = properties.rstrip().split(' contains license(s) ')
-                licenses = res[1].split(',')
-
-                return {
-                    'licenses': licenses,
-                    'path': path,
-                }
-        except subprocess.CalledProcessError:
-            if log:
-                from os import path as __path
-                log.exception('Problem during license detection for sha1 %s' %
-                              __path.basename(path))
-            return {
-                'licenses': [],
-                'path': path,
-            }
+        return compute_license(path, log=log)
 
     def index(self, id, data):
         """Index sha1s' content and store result.
 
         Args:
             id (bytes): content's identifier
-            raw_content (bytes): raw content in bytes
+            data (bytes): raw content associated with the content id
 
         Returns:
             A dict, representing a content_license, with keys:
               - id (bytes): content's identifier (sha1)
               - license (bytes): license in bytes
               - path (bytes): path
+              - indexer_configuration_id (int): id of the tool used
 
         """
-        if isinstance(id, str):
-            id = hashutil.hash_to_hex(id)
         content_path = self.write_to_temp(
-            filename=id,
+            filename=hashutil.hash_to_hex(id),  # use the id as pathname
             data=data)
 
         try:
             properties = self.compute_license(path=content_path, log=self.log)
             properties.update({
                 'id': id,
                 'indexer_configuration_id': self.tool['id'],
             })
         finally:
             self.cleanup(content_path)
 
         return properties
 
     def persist_index_computations(self, results, policy_update):
         """Persist the results in storage.
 
         Args:
             results ([dict]): list of content_license, dict with the
             following keys:
               - id (bytes): content's identifier (sha1)
               - license (bytes): license in bytes
               - path (bytes): path
             policy_update ([str]): either 'update-dups' or 'ignore-dups' to
             respectively update duplicates or ignore them
 
         """
         self.idx_storage.content_fossology_license_add(
             results, conflict_update=(policy_update == 'update-dups'))
 
 
 class ContentFossologyLicenseIndexer(
         MixinFossologyLicenseIndexer, DiskIndexer, ContentIndexer):
     """Indexer in charge of:
     - filtering out content already indexed
     - reading content from objstorage per the content's id (sha1)
     - computing {license, encoding} from that content
     - store result in storage
 
     """
     def filter(self, ids):
         """Filter out known sha1s and return only missing ones.
 
         """
         yield from self.idx_storage.content_fossology_license_missing((
             {
                 'id': sha1,
                 'indexer_configuration_id': self.tool['id'],
             } for sha1 in ids
         ))
 
 
 class FossologyLicenseRangeIndexer(
         MixinFossologyLicenseIndexer, DiskIndexer, ContentRangeIndexer):
     """FossologyLicense Range Indexer working on range of content identifiers.
 
     It:
     - filters out the non textual content
     - (optionally) filters out content already indexed (cf :callable:`range`)
     - reads content from objstorage per the content's id (sha1)
     - computes {mimetype, encoding} from that content
     - stores result in storage
 
     """
     def indexed_contents_in_range(self, start, end):
         """Retrieve indexed content id within range [start, end].
 
         Args
             **start** (bytes): Starting bound from range identifier
             **end** (bytes): End range identifier
 
         Yields:
             Content identifier (bytes) present in the range [start, end]
 
         """
         while start:
             result = self.idx_storage.content_fossology_license_get_range(
                 start, end, self.tool['id'])
             contents = result['ids']
             for _id in contents:
                 yield _id
             start = result['next']
diff --git a/swh/indexer/indexer.py b/swh/indexer/indexer.py
index 1136d8d..5d8dd8d 100644
--- a/swh/indexer/indexer.py
+++ b/swh/indexer/indexer.py
@@ -1,587 +1,586 @@
 # Copyright (C) 2016-2018  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import abc
 import os
 import logging
 import shutil
 import tempfile
 import datetime
 from copy import deepcopy
 
 from swh.scheduler import get_scheduler
 from swh.storage import get_storage
 from swh.core.config import SWHConfig
 from swh.objstorage import get_objstorage
 from swh.objstorage.exc import ObjNotFoundError
 from swh.indexer.storage import get_indexer_storage, INDEXER_CFG_KEY
 from swh.model import hashutil
 from swh.core import utils
 
 
 class DiskIndexer:
     """Mixin intended to be used with other SomethingIndexer classes.
 
        Indexers inheriting from this class are a category of indexers
        which need the disk for their computations.
 
        Note:
            This expects the `self.working_directory` attribute to be
            defined at runtime.
 
     """
     def write_to_temp(self, filename, data):
         """Write data to `filename` inside a new temporary directory.
 
         Args:
-            sha1 (str): the sha1 name
             filename (str): one of sha1's many filenames
             data (bytes): the sha1's content to write in temporary
             file
 
         Returns:
             The path to the file created, which sits in a temporary
             directory made under `self.working_directory`.
 
         """
         os.makedirs(self.working_directory, exist_ok=True)
         temp_dir = tempfile.mkdtemp(dir=self.working_directory)
         content_path = os.path.join(temp_dir, filename)
 
         with open(content_path, 'wb') as f:
             f.write(data)
 
         return content_path
 
     def cleanup(self, content_path):
         """Remove the directory holding content_path (and its content).
 
         Args:
             content_path (str): path of a file whose parent dir goes away
 
         """
         temp_dir = os.path.dirname(content_path)
         shutil.rmtree(temp_dir)
 
 
 class BaseIndexer(SWHConfig, metaclass=abc.ABCMeta):
     """Base class for indexers to inherit from.
 
     The main entry point is the :func:`run` function which is in
     charge of triggering the computations on the batch dict/ids
     received.
 
     Indexers can:
 
     - filter out ids whose data has already been indexed.
     - retrieve ids data from storage or objstorage
     - index this data depending on the object and store the result in
       storage.
 
     To implement a new object type indexer, inherit from the
     BaseIndexer and implement indexing:
 
     :func:`run`:
       object_ids are different depending on object. For example: sha1 for
       content, sha1_git for revision, directory, release, and id for origin
 
     To implement a new concrete indexer, inherit from the object level
     classes: :class:`ContentIndexer`, :class:`RevisionIndexer`,
     :class:`OriginIndexer`.
 
     Then you need to implement the following functions:
 
     :func:`filter`:
       filter out data already indexed (in storage).
 
     :func:`index_object`:
       compute index on id with data (retrieved from the storage or the
       objstorage by the id key) and return the resulting index computation.
 
     :func:`persist_index_computations`:
       persist the results of multiple index computations in the storage.
 
     The new indexer implementation can also override the following functions:
 
     :func:`prepare`:
       Configuration preparation for the indexer.  When overriding, this must
       call the `super().prepare()` instruction.
 
     :func:`check`:
       Configuration check for the indexer.  When overriding, this must call the
       `super().check()` instruction.
 
     :func:`register_tools`:
       This should return a dict of the tool(s) to use when indexing or
       filtering.
 
     """
     CONFIG = 'indexer/base'
 
     DEFAULT_CONFIG = {
         INDEXER_CFG_KEY: ('dict', {
             'cls': 'remote',
             'args': {
                 'url': 'http://localhost:5007/'
             }
         }),
         'storage': ('dict', {
             'cls': 'remote',
             'args': {
                 'url': 'http://localhost:5002/',
             }
         }),
         'objstorage': ('dict', {
             'cls': 'remote',
             'args': {
                 'url': 'http://localhost:5003/',
             }
         })
     }
 
     ADDITIONAL_CONFIG = {}
 
     def __init__(self):
         """Prepare and check that the indexer is ready to run.
 
         """
         super().__init__()
         self.prepare()
         self.check()
 
     def prepare(self):
         """Prepare the indexer's needed runtime configuration.
            Without this step, the indexer cannot possibly run.
 
         """
         self.config = self.parse_config_file(
             additional_configs=[self.ADDITIONAL_CONFIG])
         if self.config['storage']:
             self.storage = get_storage(**self.config['storage'])
         objstorage = self.config['objstorage']
         self.objstorage = get_objstorage(objstorage['cls'], objstorage['args'])
         idx_storage = self.config[INDEXER_CFG_KEY]
         self.idx_storage = get_indexer_storage(**idx_storage)
 
         _log = logging.getLogger('requests.packages.urllib3.connectionpool')
         _log.setLevel(logging.WARN)
         self.log = logging.getLogger('swh.indexer')
         self.tools = list(self.register_tools(self.config['tools']))
 
     def check(self):
         """Ensure some tool is registered; do nothing when that holds.
 
         Raises ValueError when `self.tools` is falsy.
         """
         if self.tools:
             return
         raise ValueError('Tools %s is unknown, cannot continue' % self.tools)
 
     def _prepare_tool(self, tool):
         """Prepare the tool dict to be compliant with the storage api.
 
         """
         return {'tool_%s' % key: value for key, value in tool.items()}
 
     def register_tools(self, tools):
         """Register tool(s) in the indexer storage.
 
            Sensible default which can be overridden if not sufficient.
            (For now, all indexers use only one tool)
 
         Args:
             tools (dict/[dict]): Either a dict or a list of dict.  When
                 None, self.config['tools'] is used instead.
 
         Returns:
             What the indexer storage's indexer_configuration_add returns
             for the prepared tool(s) (expected: dicts with an id key).
 
         Raises:
             ValueError if not a list nor a dict.
 
         """
         if tools is None:
             tools = self.config['tools']
         if isinstance(tools, dict):
             tools = [tools]
         elif not isinstance(tools, list):
             raise ValueError('Configuration tool(s) must be a dict or list!')
 
         # Materialize the list: a lazy map can only be consumed once
         prepared = [self._prepare_tool(tool) for tool in tools]
         return self.idx_storage.indexer_configuration_add(prepared)
 
     @abc.abstractmethod
     def index(self, id, data):
         """Index computation for the id and associated raw data.
 
         Args:
             id (bytes): identifier
             data (bytes): id's data from storage or objstorage depending on
                              object type
 
         Returns:
             a dict that makes sense for the persist_index_computations
         function.
 
         """
         pass
 
     @abc.abstractmethod
     def persist_index_computations(self, results, policy_update):
         """Persist the computation resulting from the index.
 
         Args:
 
             results ([result]): List of results. One result is the
                                 result of the index function.
             policy_update ([str]): either 'update-dups' or 'ignore-dups' to
                                    respectively update duplicates or ignore
                                    them
 
         Returns:
             None
 
         """
         pass
 
     def next_step(self, results, task):
         """Do something else with computations results (e.g. send to another
         queue, ...).
 
         (This is not an abstractmethod since it is optional).
 
         Args:
             results ([result]): List of results (dict) as returned
                                 by index function.
             task (dict): a dict in the form expected by
                         `scheduler.backend.SchedulerBackend.create_tasks`
                         without `next_run`, plus a `result_name` key.
 
         Returns:
             None
 
         """
         if not task:
             return
         scheduler = getattr(self, 'scheduler', None)
         if not scheduler:
             scheduler = get_scheduler(**self.config['scheduler'])
         task = deepcopy(task)  # the caller's task dict is left untouched
         result_name = task.pop('result_name')
         task['next_run'] = datetime.datetime.now()
         task['arguments']['kwargs'][result_name] = results
         scheduler.create_tasks([task])
 
     @abc.abstractmethod
     def run(self, ids, policy_update,
             next_step=None, **kwargs):
         """Given a list of ids:
 
         - retrieves the data from the storage
         - executes the indexing computations
         - stores the results (according to policy_update)
 
         Args:
             ids ([bytes]): id's identifier list
             policy_update (str): either 'update-dups' or 'ignore-dups' to
             respectively update duplicates or ignore them
             next_step (dict): a dict in the form expected by
                         `scheduler.backend.SchedulerBackend.create_tasks`
                         without `next_run`, plus a `result_name` key.
             **kwargs: passed to the `index` method
 
         """
         pass
 
 
 class ContentIndexer(BaseIndexer):
     """An indexer of contents given as an explicit list of ids.
 
     When the input is a range of ids, use the
     :class:`ContentRangeIndexer` instead.
 
     Note: :class:`ContentIndexer` cannot be instantiated as is.  It
     must be subclassed, the subclass overriding the methods listed
     in the :class:`BaseIndexer` class.
 
     """
     @abc.abstractmethod
     def filter(self, ids):
         """Keep the ids this particular indexer did not index yet.
 
         Args:
             ids ([bytes]): list of ids
 
         Yields:
             iterator of missing ids
 
         """
         pass
 
     def run(self, ids, policy_update,
             next_step=None, **kwargs):
         """Index the contents identified by `ids`:
 
         - read each content from the objstorage (skip the unknown ones)
         - compute its index, keeping only the truthy results
         - persist the results (according to policy_update)
         - hand the results over to :func:`next_step`
 
         Any exception is logged; the method then returns None.
 
         Args:
             ids ([bytes]): sha1's identifier list
             policy_update (str): either 'update-dups' or 'ignore-dups'
             next_step (dict): task in the form expected by
                         :func:`next_step`, if any
             **kwargs: passed to the `index` method
 
         """
         computed = []
         try:
             for sha1 in ids:
                 try:
                     raw_content = self.objstorage.get(sha1)
                 except ObjNotFoundError:
                     hex_sha1 = hashutil.hash_to_hex(sha1)
                     self.log.warning(
                         'Content %s not found in objstorage' % hex_sha1)
                 else:
                     indexed = self.index(sha1, raw_content, **kwargs)
                     if indexed:
                         computed.append(indexed)
 
             self.persist_index_computations(computed, policy_update)
             self.results = computed
             return self.next_step(computed, task=next_step)
         except Exception:
             self.log.exception('Problem when reading contents metadata.')
 
 
 class ContentRangeIndexer(BaseIndexer):
     """A content range indexer.
 
     This expects as input a range of ids to index.
 
     To work on a list of ids, use the :class:`ContentIndexer` instead.
 
     Note: :class:`ContentRangeIndexer` is not an instantiable
     object. To use it, one should inherit from this class and override
     the methods mentioned in the :class:`BaseIndexer` class.
 
     """
     @abc.abstractmethod
     def indexed_contents_in_range(self, start, end):
         """Retrieve indexed contents within range [start, end].
 
         Args:
             **start** (bytes): Starting bound from range identifier
             **end** (bytes): End range identifier
 
         Yields:
             Content identifier (bytes) present in the range [start, end]
 
         """
         pass
 
     def _list_contents_to_index(self, start, end, indexed):
         """Compute from storage the new contents to index in the range [start,
            end]. The already indexed contents are skipped.
 
         Args:
             **start** (bytes): Starting bound from range identifier
             **end** (bytes): End range identifier
             **indexed** (Set[bytes]): Set of content already indexed.
 
         Yields:
             Identifier (bytes) of contents to index.
 
         """
         while start:
             result = self.storage.content_get_range(start, end)
             contents = result['contents']
             for c in contents:
                 _id = c['sha1']
                 if _id in indexed:
                     continue
                 yield _id
             start = result['next']  # falsy once the range is exhausted
 
     def _index_contents(self, start, end, indexed, **kwargs):
         """Index the contents from within range [start, end]
 
         Args:
             **start** (bytes): Starting bound from range identifier
             **end** (bytes): End range identifier
             **indexed** (Set[bytes]): Set of content already indexed.
 
         Yields:
             Data indexed (dict) to persist using the indexer storage
 
         """
         for sha1 in self._list_contents_to_index(start, end, indexed):
             try:
                 raw_content = self.objstorage.get(sha1)
             except ObjNotFoundError:
                 self.log.warning('Content %s not found in objstorage' %
                                  hashutil.hash_to_hex(sha1))
                 continue
             res = self.index(sha1, raw_content, **kwargs)
             if res:
                 yield res
 
     def run(self, start, end, skip_existing=True, **kwargs):
         """Given a range of content ids, compute the indexing computations on
            the contents within. Either the indexer is incremental
            (filter out existing computed data) or not (compute
            everything from scratch).
 
         Args:
             **start** (Union[bytes, str]): Starting range identifier
             **end** (Union[bytes, str]): Ending range identifier
             **skip_existing** (bool): Skip existing indexed data
                                      (default) or not
             **kwargs: passed to the `index` method
 
         Returns:
             True if data was indexed, False otherwise (None on error).
 
         """
         with_indexed_data = False
         try:
             if isinstance(start, str):
                 start = hashutil.hash_to_bytes(start)
             if isinstance(end, str):
                 end = hashutil.hash_to_bytes(end)
 
             indexed = set()
             if skip_existing:
                 indexed = set(self.indexed_contents_in_range(start, end))
 
             # kwargs must reach `index`, as documented above
             computations = self._index_contents(start, end, indexed, **kwargs)
             for results in utils.grouper(computations,
                                          n=self.config['write_batch_size']):
                 self.persist_index_computations(
                     results, policy_update='update-dups')
                 with_indexed_data = True
             return with_indexed_data
         except Exception:
             self.log.exception(
                 'Problem when computing metadata.')
 
 
 class OriginIndexer(BaseIndexer):
     """An object type indexer, inherits from the :class:`BaseIndexer` and
     implements Origin indexing using the run method
 
     Note: the :class:`OriginIndexer` is not an instantiable object.
     To use it in another context one should inherit from this class
     and override the methods mentioned in the :class:`BaseIndexer`
     class.
 
     """
     def run(self, ids, policy_update,
             parse_ids=False, next_step=None, **kwargs):
         """Given a list of origin ids:
 
         - retrieve origins from storage
         - execute the indexing computations
         - store the results (according to policy_update)
 
         Args:
             ids ([Union[int, Tuple[str, bytes]]]): list of origin ids or
                                                    (type, url) tuples.
             policy_update (str): either 'update-dups' or 'ignore-dups' to
                                    respectively update duplicates or ignore
                                    them
             parse_ids (bool): If `True`, will try to convert `ids`
                                from a human input to the valid type.
             next_step (dict): a dict in the form expected by
                         `scheduler.backend.SchedulerBackend.create_tasks`
                         without `next_run`, plus a `result_name` key.
             **kwargs: passed to the `index` method
 
         """
         if parse_ids:
             ids = [
                     o.split('+', 1) if ':' in o else int(o)  # type+url or id
                     for o in ids]
 
         results = []
 
         for id_ in ids:
             if isinstance(id_, (tuple, list)):
                 if len(id_) != 2:
                     raise TypeError('Expected a (type, url) tuple.')
                 (type_, url) = id_
                 params = {'type': type_, 'url': url}
             elif isinstance(id_, int):
                 params = {'id': id_}
             else:
                 raise TypeError('Invalid value in "ids": %r' % id_)
             origin = self.storage.origin_get(params)
             if not origin:
                 self.log.warning('Origins %s not found in storage' %
                                  [id_])  # only the one that is missing
                 continue
             try:
                 res = self.index(origin, **kwargs)
                 if res:  # If no results, skip it
                     results.append(res)
             except Exception:
                 self.log.exception(  # (id_,): id_ itself may be a tuple
                         'Problem when processing origin %s' % (id_,))
         self.persist_index_computations(results, policy_update)
         self.results = results
         return self.next_step(results, task=next_step)
 
 
 class RevisionIndexer(BaseIndexer):
     """Object type indexer for revisions: it inherits from the
     :class:`BaseIndexer` and provides the matching run method.
 
     Note: the :class:`RevisionIndexer` cannot be instantiated as is.
     It must be subclassed, the subclass overriding the methods
     listed in the :class:`BaseIndexer` class.
 
     """
     def run(self, ids, policy_update, next_step=None):
         """Index the revisions identified by `ids`: fetch them from the
         storage, index each one (failures are logged and skipped), then
         persist the truthy results and pass them to :func:`next_step`.
 
         Args:
             ids ([bytes or str]): sha1_git's identifier list; str items
                 are encoded to bytes before the storage lookup
             policy_update (str): either 'update-dups' or 'ignore-dups' to
                 respectively update duplicates or ignore them
             next_step (dict): task description handed over, along with
                 the results, to :func:`next_step`
 
         """
         ids = [
             id_ if not isinstance(id_, str) else id_.encode()
             for id_ in ids]
         computed = []
         for rev in self.storage.revision_get(ids):
             if rev:
                 try:
                     indexed = self.index(rev)
                 except Exception:
                     self.log.exception('Problem when processing revision')
                 else:
                     if indexed:
                         computed.append(indexed)
             else:
                 hex_ids = [hashutil.hash_to_hex(id_) for id_ in ids]
                 self.log.warning(
                     'Revisions %s not found in storage' % hex_ids)
         self.persist_index_computations(computed, policy_update)
         self.results = computed
         return self.next_step(computed, task=next_step)
diff --git a/swh/indexer/mimetype.py b/swh/indexer/mimetype.py
index 7dd43af..a17a09c 100644
--- a/swh/indexer/mimetype.py
+++ b/swh/indexer/mimetype.py
@@ -1,156 +1,155 @@
 # Copyright (C) 2016-2018  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
-import click
 import magic
 
 from swh.model import hashutil
 
 from .indexer import ContentIndexer, ContentRangeIndexer
 
 
 def compute_mimetype_encoding(raw_content):
     """Detect the mimetype and the encoding of a raw content, relying on
     the `magic` module.
 
     Args:
         raw_content (bytes): content's raw data
 
     Returns:
         A dict whose 'mimetype' and 'encoding' values are the detected
         ones, utf-8 encoded to bytes.
 
     """
     detected = magic.detect_from_content(raw_content)
     mimetype = detected.mime_type.encode('utf-8')
     encoding = detected.encoding.encode('utf-8')
     return {'mimetype': mimetype, 'encoding': encoding}
 
 
 class MixinMimetypeIndexer:
     """Behavior shared by the mimetype indexers.
 
     See :class:`ContentMimetypeIndexer` and :class:`MimetypeRangeIndexer`
 
     """
     ADDITIONAL_CONFIG = {
         'tools': ('dict', {
             'name': 'file',
             'version': '1:5.30-1+deb9u1',
             'configuration': {
                 "type": "library",
                 "debian-package": "python3-magic"
             },
         }),
         'write_batch_size': ('int', 1000),
     }
 
     CONFIG_BASE_FILENAME = 'indexer/mimetype'
 
     def prepare(self):
         super().prepare()
         self.tool = self.tools[0]  # the first registered tool is the one used
 
     def index(self, id, data):
         """Compute the mimetype and encoding of one content.
 
         Args:
             id (bytes): content's identifier
             data (bytes): raw content in bytes
 
         Returns:
             A dict, representing a content_mimetype, with keys:
 
               - id (bytes): content's identifier (sha1)
               - indexer_configuration_id: id of the tool in use
               - mimetype (bytes): mimetype in bytes
               - encoding (bytes): encoding in bytes
             or None when the detection raised a TypeError (logged).
 
         """
         try:
             result = compute_mimetype_encoding(data)
             result.update(
                 id=id, indexer_configuration_id=self.tool['id'])
         except TypeError:
             hex_id = hashutil.hash_to_hex(id)
             self.log.error('Detecting mimetype error for id %s' % hex_id)
             return None
         else:
             return result
 
     def persist_index_computations(self, results, policy_update):
         """Send the computed mimetypes to the indexer storage.
 
         Args:
             results ([dict]): list of content_mimetype, each one a dict
             holding at least the following keys:
 
               - id (bytes): content's identifier (sha1)
               - mimetype (bytes): mimetype in bytes
               - encoding (bytes): encoding in bytes
 
             policy_update ([str]): 'update-dups' makes the storage update
             conflicting entries; any other value does not
 
         """
         update = policy_update == 'update-dups'
         self.idx_storage.content_mimetype_add(results, conflict_update=update)
 
 
 class ContentMimetypeIndexer(MixinMimetypeIndexer, ContentIndexer):
     """Mimetype indexer working on a list of content identifiers.
 
     It:
     - (optionally) filters out content already indexed (cf. :func:`filter`)
     - reads content from objstorage per the content's id (sha1)
     - computes {mimetype, encoding} from that content
     - stores result in storage
 
     FIXME:
     - 1. Rename redundant ContentMimetypeIndexer to MimetypeIndexer
     - 2. Do we keep it afterwards? ~> i think this can be used with the journal
 
     """
     def filter(self, ids):
         """Yield what the indexer storage reports as missing among `ids`
         for the tool currently in use.
 
         """
         wanted = (
             {'id': sha1, 'indexer_configuration_id': self.tool['id']}
             for sha1 in ids
         )
         yield from self.idx_storage.content_mimetype_missing(wanted)
 
 
 class MimetypeRangeIndexer(MixinMimetypeIndexer, ContentRangeIndexer):
     """Mimetype range indexer working on a range of content identifiers.
 
     It:
     - (optionally) skips content already indexed (cf. :func:`run`)
     - reads content from objstorage per the content's id (sha1)
     - computes {mimetype, encoding} from that content
     - stores result in storage
 
     """
     def indexed_contents_in_range(self, start, end):
         """List the content ids already indexed within [start, end].
 
         The indexer storage is queried page after page for the tool in use.
 
         Args:
             **start** (bytes): Starting bound from range identifier
             **end** (bytes): End range identifier
 
         Yields:
             Content identifier (bytes) present in the range [start, end]
 
         """
         while start:
             page = self.idx_storage.content_mimetype_get_range(
                 start, end, self.tool['id'])
             yield from page['ids']
             start = page['next']
diff --git a/swh/indexer/tests/test_fossology_license.py b/swh/indexer/tests/test_fossology_license.py
index 10ad15f..fdcf515 100644
--- a/swh/indexer/tests/test_fossology_license.py
+++ b/swh/indexer/tests/test_fossology_license.py
@@ -1,172 +1,200 @@
 # Copyright (C) 2017-2018  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import unittest
 import logging
 
+from unittest.mock import patch
+
 from swh.indexer.fossology_license import (
-    ContentFossologyLicenseIndexer, FossologyLicenseRangeIndexer
+    ContentFossologyLicenseIndexer, FossologyLicenseRangeIndexer,
+    compute_license
 )
 
 from swh.indexer.tests.test_utils import (
     MockObjStorage, BasicMockStorage, BasicMockIndexerStorage,
     SHA1_TO_LICENSES, CommonContentIndexerTest, CommonContentIndexerRangeTest,
     CommonIndexerWithErrorsTest, CommonIndexerNoTool, NoDiskIndexer
 )
 
 
+class BasicTest(unittest.TestCase):
+    @patch('swh.indexer.fossology_license.subprocess')
+    def test_compute_license(self, mock_subprocess):
+        """Computing licenses from a raw content should return results
+
+        """
+        for path, intermediary_result, output in [
+                (b'some/path', None,
+                 []),
+                (b'some/path/2', [],
+                 []),
+                (b'other/path', ' contains license(s) GPL,AGPL',
+                 ['GPL', 'AGPL'])]:
+            mock_subprocess.check_output.return_value = intermediary_result
+
+            actual_result = compute_license(path, log=None)
+
+            self.assertEqual(actual_result, {
+                'licenses': output,
+                'path': path,
+            })
+
+
 class InjectLicenseIndexer:
     """Override license computations.
 
     """
     def compute_license(self, path, log=None):
         """path is the content identifier
 
         """
+        if isinstance(id, bytes):
+            path = path.decode('utf-8')
         return {
             'licenses': SHA1_TO_LICENSES.get(path)
         }
 
 
 class FossologyLicenseTestIndexer(
         NoDiskIndexer, InjectLicenseIndexer, ContentFossologyLicenseIndexer):
     """Fossology license indexer configured just enough to satisfy the
        indexing checks.
 
     """
     def prepare(self):
         """Set the indexer up on mocked storages, without reading any
         configuration file.
         """
         nomos = {
             'name': 'nomos',
             'version': '3.1.0rc2-31-ga2cbb8c',
             'configuration': {'command_line': 'nomossa <filepath>'},
         }
         self.config = {'tools': nomos}
         self.idx_storage = BasicMockIndexerStorage()
         self.log = logging.getLogger('swh.indexer')
         self.objstorage = MockObjStorage()
         self.tools = self.register_tools(nomos)
         self.tool = self.tools[0]
 
 
 class TestFossologyLicenseIndexer(CommonContentIndexerTest, unittest.TestCase):
     """Fossology license indexer test scenarios:
 
     - Known sha1s in the input list have their data indexed
     - Unknown sha1 in the input list are not indexed
 
     """
     def setUp(self):
         """Set up the indexer under test, three content ids and the
         results expected for them.
 
         """
         self.indexer = FossologyLicenseTestIndexer()
 
         self.id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5'
         self.id1 = '688a5ef812c53907562fe379d4b3851e69c7cb15'
         self.id2 = 'da39a3ee5e6b4b0d3255bfef95601890afd80709'  # empty content
         tool_id = self.indexer.tool['id']
 
         def expected(sha1):
             return {
                 'id': sha1,
                 'indexer_configuration_id': tool_id,
                 'licenses': SHA1_TO_LICENSES[sha1],
             }
 
         # then
         self.expected_results = {
             self.id0: expected(self.id0),
             self.id1: expected(self.id1),
             self.id2: expected(self.id2),
         }
 
 
 class FossologyLicenseRangeIndexerTest(
         NoDiskIndexer, InjectLicenseIndexer, FossologyLicenseRangeIndexer):
     """Fossology license range indexer set up for the tests.
 
     """
     def prepare(self):
         """Set the indexer up on mocked storages, without reading any
         configuration file.
         """
         nomos = {
             'name': 'nomos',
             'version': '3.1.0rc2-31-ga2cbb8c',
             'configuration': {'command_line': 'nomossa <filepath>'},
         }
         self.config = {'tools': nomos, 'write_batch_size': 100}
         self.idx_storage = BasicMockIndexerStorage()
         self.log = logging.getLogger('swh.indexer')
         # this hardcodes some contents, will use this to setup the storage
         self.objstorage = MockObjStorage()
         # sync objstorage and storage
         known = [{'sha1': c_id} for c_id in self.objstorage]
         self.storage = BasicMockStorage(known)
         self.tools = self.register_tools(nomos)
         self.tool = self.tools[0]
 
 
 class TestFossologyLicenseRangeIndexer(
         CommonContentIndexerRangeTest, unittest.TestCase):
     """Fossology license range indexer tests.
 
     - new data within range are indexed
     - no data outside a range are indexed
     - with filtering existing indexed data prior to compute new index
     - without filtering existing indexed data prior to compute new index
 
     """
     def setUp(self):
         """Set up the range indexer under test, the sorted known contents,
         three content ids and the results expected for them.
 
         """
         self.indexer = FossologyLicenseRangeIndexerTest()
         # will play along with the objstorage's mocked contents for now
         self.contents = sorted(self.indexer.objstorage)
         # FIXME: leverage swh.objstorage.in_memory_storage's
         # InMemoryObjStorage, swh.storage.tests's gen_contents, and
         # hypothesis to generate data to actually run indexer on those
 
         self.id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5'
         self.id1 = '02fb2c89e14f7fab46701478c83779c7beb7b069'
         self.id2 = '103bc087db1d26afc3a0283f38663d081e9b01e6'
         tool_id = self.indexer.tool['id']
 
         def expected(sha1):
             return {
                 'id': sha1,
                 'indexer_configuration_id': tool_id,
                 'licenses': SHA1_TO_LICENSES[sha1],
             }
 
         self.expected_results = {
             self.id0: expected(self.id0),
             self.id1: expected(self.id1),
             self.id2: expected(self.id2),
         }
 
 
 class FossologyLicenseIndexerUnknownToolTestStorage(
         CommonIndexerNoTool, FossologyLicenseTestIndexer):
     """Fossology license indexer with a wrong configuration."""
 
 
 class FossologyLicenseRangeIndexerUnknownToolTestStorage(
         CommonIndexerNoTool, FossologyLicenseRangeIndexerTest):
     """Fossology license range indexer with a wrong configuration."""
 
 
 class TestFossologyLicenseIndexersErrors(
         CommonIndexerWithErrorsTest, unittest.TestCase):
     """Test the indexers raise the right errors when wrongly initialized"""
     Indexer = FossologyLicenseIndexerUnknownToolTestStorage
     RangeIndexer = FossologyLicenseRangeIndexerUnknownToolTestStorage
diff --git a/swh/indexer/tests/test_mimetype.py b/swh/indexer/tests/test_mimetype.py
index 70d2e1d..c3aee8e 100644
--- a/swh/indexer/tests/test_mimetype.py
+++ b/swh/indexer/tests/test_mimetype.py
@@ -1,163 +1,188 @@
 # Copyright (C) 2017-2018  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import unittest
 import logging
 
+from unittest.mock import patch
+
 from swh.indexer.mimetype import (
-    ContentMimetypeIndexer, MimetypeRangeIndexer
+    ContentMimetypeIndexer, MimetypeRangeIndexer, compute_mimetype_encoding
 )
 
 from swh.indexer.tests.test_utils import (
     MockObjStorage, BasicMockStorage, BasicMockIndexerStorage,
     CommonContentIndexerTest, CommonContentIndexerRangeTest,
     CommonIndexerWithErrorsTest, CommonIndexerNoTool
 )
 
 
+class FakeMagicResult:
+    def __init__(self, mimetype, encoding):
+        self.mime_type = mimetype
+        self.encoding = encoding
+
+
+class BasicTest(unittest.TestCase):
+    @patch('swh.indexer.mimetype.magic')
+    def test_compute_mimetype_encoding(self, mock_magic):
+        """Compute mimetype encoding should return results"""
+        for _input, _mimetype, _encoding in [
+                (b'some-content', 'text/plain', 'utf-8'),
+                (b'raw-content', 'application/json', 'ascii')]:
+            mock_magic.detect_from_content.return_value = FakeMagicResult(
+                _mimetype, _encoding)
+
+            actual_result = compute_mimetype_encoding(_input)
+            self.assertEqual(actual_result, {
+                'mimetype': _mimetype.encode('utf-8'),
+                'encoding': _encoding.encode('utf-8'),
+            })
+
+
 class MimetypeTestIndexer(ContentMimetypeIndexer):
     """Specific mimetype indexer instance whose configuration is enough to
        satisfy the indexing tests.
 
     """
     def prepare(self):
         self.config = {
             'tools': {
                 'name': 'file',
                 'version': '1:5.30-1+deb9u1',
                 'configuration': {
                     "type": "library",
                     "debian-package": "python3-magic"
                 },
             },
         }
         self.idx_storage = BasicMockIndexerStorage()
         self.log = logging.getLogger('swh.indexer')
         self.objstorage = MockObjStorage()
         self.tools = self.register_tools(self.config['tools'])
         self.tool = self.tools[0]
 
 
 class TestMimetypeIndexer(CommonContentIndexerTest, unittest.TestCase):
     """Mimetype indexer test scenarios:
 
     - Known sha1s in the input list have their data indexed
     - Unknown sha1 in the input list are not indexed
 
     """
     def setUp(self):
         self.indexer = MimetypeTestIndexer()
 
         self.id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5'
         self.id1 = '688a5ef812c53907562fe379d4b3851e69c7cb15'
         self.id2 = 'da39a3ee5e6b4b0d3255bfef95601890afd80709'
         tool_id = self.indexer.tool['id']
         self.expected_results = {
             self.id0: {
                 'id': self.id0,
                 'indexer_configuration_id': tool_id,
                 'mimetype': b'text/plain',
                 'encoding': b'us-ascii',
             },
             self.id1: {
                 'id': self.id1,
                 'indexer_configuration_id': tool_id,
                 'mimetype': b'text/plain',
                 'encoding': b'us-ascii',
             },
             self.id2: {
                 'id': self.id2,
                 'indexer_configuration_id': tool_id,
                 'mimetype': b'application/x-empty',
                 'encoding': b'binary',
             }
         }
 
 
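 class MimetypeRangeIndexerTest(MimetypeRangeIndexer):
-    """Specific mimetype whose configuration is enough to satisfy the
-       indexing tests.
+    """Specific mimetype range indexer whose configuration is enough to
+       satisfy the indexing tests.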
 
     """
     def prepare(self):
         self.config = {
             'tools': {
                 'name': 'file',
                 'version': '1:5.30-1+deb9u1',
                 'configuration': {
                     "type": "library",
                     "debian-package": "python3-magic"
                 },
             },
             'write_batch_size': 100,
         }
         self.idx_storage = BasicMockIndexerStorage()
         self.log = logging.getLogger('swh.indexer')
         # this hardcodes some contents, will use this to setup the storage
         self.objstorage = MockObjStorage()
         # sync objstorage and storage
         contents = [{'sha1': c_id} for c_id in self.objstorage]
         self.storage = BasicMockStorage(contents)
         self.tools = self.register_tools(self.config['tools'])
         self.tool = self.tools[0]
 
 
 class TestMimetypeRangeIndexer(
         CommonContentIndexerRangeTest, unittest.TestCase):
     """Range Mimetype Indexer tests.
 
     - new data within range are indexed
     - no data outside a range are indexed
     - with filtering existing indexed data prior to compute new index
     - without filtering existing indexed data prior to compute new index
 
     """
     def setUp(self):
         self.indexer = MimetypeRangeIndexerTest()
         # will play along with the objstorage's mocked contents for now
         self.contents = sorted(self.indexer.objstorage)
         # FIXME: leverage swh.objstorage.in_memory_storage's
         # InMemoryObjStorage, swh.storage.tests's gen_contents, and
         # hypothesis to generate data to actually run indexer on those
 
         self.id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5'
         self.id1 = '02fb2c89e14f7fab46701478c83779c7beb7b069'
         self.id2 = '103bc087db1d26afc3a0283f38663d081e9b01e6'
         tool_id = self.indexer.tool['id']
 
         self.expected_results = {
             self.id0: {
                 'encoding': b'us-ascii',
                 'id': self.id0,
                 'indexer_configuration_id': tool_id,
                 'mimetype': b'text/plain'},
             self.id1: {
                 'encoding': b'us-ascii',
                 'id': self.id1,
                 'indexer_configuration_id': tool_id,
                 'mimetype': b'text/x-python'},
             self.id2: {
                 'encoding': b'us-ascii',
                 'id': self.id2,
                 'indexer_configuration_id': tool_id,
                 'mimetype': b'text/plain'}
         }
 
 
 class MimetypeIndexerUnknownToolTestStorage(
         CommonIndexerNoTool, MimetypeTestIndexer):
-    """Fossology license indexer with wrong configuration"""
+    """Mimetype indexer with wrong configuration"""
 
 
 class MimetypeRangeIndexerUnknownToolTestStorage(
         CommonIndexerNoTool, MimetypeRangeIndexerTest):
-    """Fossology license range indexer with wrong configuration"""
+    """Mimetype range indexer with wrong configuration"""
 
 
 class TestMimetypeIndexersErrors(
         CommonIndexerWithErrorsTest, unittest.TestCase):
     """Test the indexer raise the right errors when wrongly initialized"""
     Indexer = MimetypeIndexerUnknownToolTestStorage
     RangeIndexer = MimetypeRangeIndexerUnknownToolTestStorage
diff --git a/version.txt b/version.txt
index 1e222fc..d1c0402 100644
--- a/version.txt
+++ b/version.txt
@@ -1 +1 @@
-v0.0.58-0-ga8546bc
\ No newline at end of file
+v0.0.59-0-g45c8f94
\ No newline at end of file