diff --git a/PKG-INFO b/PKG-INFO index 860a729..ee4435e 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,71 +1,71 @@ Metadata-Version: 2.1 Name: swh.indexer -Version: 2.4.4 +Version: 2.5.0 Summary: Software Heritage Content Indexer Home-page: https://forge.softwareheritage.org/diffusion/78/ Author: Software Heritage developers Author-email: swh-devel@inria.fr Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-indexer Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-indexer/ Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing License-File: LICENSE License-File: AUTHORS swh-indexer ============ Tools to compute multiple indexes on SWH's raw contents: - content: - mimetype - ctags - language - fossology-license - metadata - revision: - metadata An indexer is in charge of: - looking up objects - extracting information from those objects - store those information in the swh-indexer db There are multiple indexers working on different object types: - content indexer: works with content sha1 hashes - revision indexer: works with revision sha1 hashes - origin indexer: works with origin identifiers Indexation procedure: - receive batch of ids - retrieve the associated data depending on object type - compute for that object some index - store the result to swh's storage Current content indexers: - mimetype (queue swh_indexer_content_mimetype): detect the encoding and mimetype - language (queue swh_indexer_content_language): detect the programming language - ctags (queue swh_indexer_content_ctags): compute tags information - fossology-license (queue swh_indexer_fossology_license): compute the license - metadata: translate file into translated_metadata dict Current revision indexers: - metadata: detects files containing metadata and retrieves translated_metadata in content_metadata table in storage or run content indexer to translate files. diff --git a/debian/changelog b/debian/changelog index 6a146e8..84423d5 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,1541 +1,1545 @@ -swh-indexer (2.4.4-1~swh1~bpo10+1) buster-swh; urgency=medium +swh-indexer (2.5.0-1~swh1) unstable-swh; urgency=medium - * Rebuild for buster-swh + * New upstream release 2.5.0 - (tagged by Antoine R. Dumont + (@ardumont) on 2022-08-31 18:10:38 + +0200) + * Upstream changes: - v2.5.0 - indexer.cli: Allow batch_size + configuration on journal client - -- Software Heritage autobuilder (on jenkins-debian1) Wed, 31 Aug 2022 12:59:40 +0000 + -- Software Heritage autobuilder (on jenkins-debian1) Wed, 31 Aug 2022 16:20:18 +0000 swh-indexer (2.4.4-1~swh1) unstable-swh; urgency=medium * New upstream release 2.4.4 - (tagged by Valentin Lorentz on 2022-08-31 11:26:51 +0200) * Upstream changes: - v2.4.4 - * Revert "metadata: Drop unsupported key 'type'" - * rehash: Call objstorage.content_get() with a HashDict instead of single hash -- Software Heritage autobuilder (on jenkins-debian1) Wed, 31 Aug 2022 09:36:21 +0000 swh-indexer (2.4.3-1~swh2) unstable-swh; urgency=medium * Drop blocking dependency constraint and bump new version. -- Antoine R. Dumont (@ardumont) Tue, 30 Aug 2022 15:37:01 +0200 swh-indexer (2.4.3-1~swh1) unstable-swh; urgency=medium * New upstream release 2.4.3 - (tagged by Antoine R. Dumont (@ardumont) on 2022-08-30 11:09:04 +0200) * Upstream changes: - v2.4.3 - metadata: Drop unsupported key 'type' -- Software Heritage autobuilder (on jenkins-debian1) Tue, 30 Aug 2022 09:25:28 +0000 swh-indexer (2.4.2-1~swh2) unstable-swh; urgency=medium * Bump new release -- Antoine R. Dumont (@ardumont) Thu, 25 Aug 2022 14:30:54 +0200 swh-indexer (2.4.2-1~swh1) unstable-swh; urgency=medium * New upstream release 2.4.2 - (tagged by Valentin Lorentz on 2022-08-25 13:24:10 +0200) * Upstream changes: - v2.4.2 - * Re-trigger Debian build -- Software Heritage autobuilder (on jenkins-debian1) Thu, 25 Aug 2022 11:33:19 +0000 swh-indexer (2.4.1-1~swh1) unstable-swh; urgency=medium * New upstream release 2.4.1 - (tagged by Valentin Lorentz on 2022-08-25 12:22:48 +0200) * Upstream changes: - v2.4.1 - * metadata_dictionary: Fix crash on null list item in an uri_field. -- Software Heritage autobuilder (on jenkins-debian1) Thu, 25 Aug 2022 10:32:04 +0000 swh-indexer (2.4.0-1~swh1) unstable-swh; urgency=medium * New upstream release 2.4.0 - (tagged by Valentin Lorentz on 2022-08-25 11:58:05 +0200) * Upstream changes: - v2.4.0 - * metadata_dictionary: Add mappings for "*.nuspec" files - * Refactor metadata mappings using rdflib.Graph instead of JSON-LD internally - * Other internal refactorings - * metadata_dictionary: Add mapping for SWORD/Atom with Codemeta -- Software Heritage autobuilder (on jenkins-debian1) Thu, 25 Aug 2022 10:09:34 +0000 swh-indexer (2.3.0-1~swh1) unstable-swh; urgency=medium * New upstream release 2.3.0 - (tagged by Valentin Lorentz on 2022-08-10 12:16:48 +0200) * Upstream changes: - v2.3.0 - * Tag Sentry events with object ids - * Fix crashes on incorrect URLs in `@id` - * Fix crash on null characters in JSON - * Fix support of old RawExtrinsicMetadata objects with no id -- Software Heritage autobuilder (on jenkins-debian1) Wed, 10 Aug 2022 10:26:27 +0000 swh-indexer (2.2.2-1~swh1) unstable-swh; urgency=medium * New upstream release 2.2.2 - (tagged by Antoine R. Dumont (@ardumont) on 2022-07-29 13:41:43 +0200) * Upstream changes: - v2.2.2 - indexer.metadata: Warn and skip incomplete entries from the journal -- Software Heritage autobuilder (on jenkins-debian1) Fri, 29 Jul 2022 11:52:13 +0000 swh-indexer (2.2.1-1~swh1) unstable-swh; urgency=medium * New upstream release 2.2.1 - (tagged by Antoine R. Dumont (@ardumont) on 2022-07-29 10:56:57 +0200) * Upstream changes: - v2.2.1 - Normalize journal client indexer type names -- Software Heritage autobuilder (on jenkins-debian1) Fri, 29 Jul 2022 09:07:15 +0000 swh-indexer (2.2.0-1~swh1) unstable-swh; urgency=medium * New upstream release 2.2.0 - (tagged by Antoine R. Dumont (@ardumont) on 2022-07-25 16:23:12 +0200) * Upstream changes: - v2.2.0 - cli: Add content mimetype indexer journal client support - cli: Add fossology license indexer journal client support - cli: Add extrinsic-metadata indexer journal client support - docs: Fix incorrect terminology (term -> property) - mapping: Fix inconsistent name - Drop decommissioned content indexer: ctags, language -- Software Heritage autobuilder (on jenkins-debian1) Mon, 25 Jul 2022 14:33:50 +0000 swh-indexer (2.1.0-1~swh1) unstable-swh; urgency=medium * New upstream release 2.1.0 - (tagged by Valentin Lorentz on 2022-07-21 10:23:44 +0200) * Upstream changes: - v2.1.0 - * DirectoryIndexer: Remove incorrect assumption on object types - * docs: Explain the indexation workflow for extrinsic metadata - * docs: Update description of the metadata workflow - * metadata_dictionary: Add mappings for pubspec.yaml - * Add extrinsic metadata indexer - * Add GitHub metadata mapping - * Refactor Mapping hierarchy - * cff: Add checks for value types -- Software Heritage autobuilder (on jenkins-debian1) Thu, 21 Jul 2022 08:32:23 +0000 swh-indexer (2.0.2-1~swh1) unstable-swh; urgency=medium * New upstream release 2.0.2 - (tagged by Valentin Lorentz on 2022-06-22 12:32:41 +0200) * Upstream changes: - v2.0.2 - * Fix mypy issue with swh- journal>=1.1.0 - * cff: Ignore invalid yaml files - * npm: Add workaround for mangled package descriptions - * npm: Fix crash when npm description is not a string -- Software Heritage autobuilder (on jenkins-debian1) Wed, 22 Jun 2022 10:40:25 +0000 swh-indexer (2.0.1-1~swh1) unstable-swh; urgency=medium * New upstream release 2.0.1 - (tagged by Antoine R. Dumont (@ardumont) on 2022-06-10 10:35:15 +0200) * Upstream changes: - v2.0.1 - upgrades/134: Add missing index creation -- Software Heritage autobuilder (on jenkins-debian1) Fri, 10 Jun 2022 09:17:44 +0000 swh-indexer (2.0.0-1~swh1) unstable-swh; urgency=medium * New upstream release 2.0.0 - (tagged by Antoine R. Dumont (@ardumont) on 2022-06-03 15:40:32 +0200) * Upstream changes: - v2.0.0 - Set current_version attribute to postgresql datastore - Add support for indexing from head releases - Replace RevisionMetadataIndexer with DirectoryMetadataIndexer - Add support for running the server with 'postgresql' storage cls - tests: Shorten definition of REVISION - tests: Simplify definition of ORIGINS list - tests: use stock pytest_postgresql factory function - Rewrite origin_head.py as a normal function instead of an indexer - Convert test_origin_head from unittest to pytest -- Software Heritage autobuilder (on jenkins-debian1) Fri, 03 Jun 2022 13:59:59 +0000 swh-indexer (1.2.0-1~swh1) unstable-swh; urgency=medium * New upstream release 1.2.0 - (tagged by Valentin Lorentz on 2022-06-01 16:44:30 +0200) * Upstream changes: - v1.2.0 - * cli: Add support for running "all" indexers in the journal client -- Software Heritage autobuilder (on jenkins-debian1) Wed, 01 Jun 2022 15:08:39 +0000 swh-indexer (1.1.0-1~swh1) unstable-swh; urgency=medium * New upstream release 1.1.0 - (tagged by Valentin Lorentz on 2022-05-30 15:56:19 +0200) * Upstream changes: - v1.1.0 - * Add support for indexing directly from the journal client - * cff: Do not change yaml.SafeLoader globally - * add missing sentry captures - * Change misleading documentation in swh-indexer/cli.py - * test and typing maintenance -- Software Heritage autobuilder (on jenkins-debian1) Mon, 30 May 2022 14:03:54 +0000 swh-indexer (1.0.0-1~swh1) unstable-swh; urgency=medium * New upstream release 1.0.0 - (tagged by David Douard on 2022-02-24 17:35:56 +0100) * Upstream changes: - v1.0.0 -- Software Heritage autobuilder (on jenkins-debian1) Thu, 24 Feb 2022 16:42:39 +0000 swh-indexer (0.8.2-1~swh1) unstable-swh; urgency=medium * New upstream release 0.8.2 - (tagged by Valentin Lorentz on 2022-01-12 13:53:22 +0100) * Upstream changes: - v0.8.2 - * tests: Use TimestampWithTimezone.from_datetime() instead of the constructor - * docs: Use reference instead of absolute link -- Software Heritage autobuilder (on jenkins-debian1) Wed, 12 Jan 2022 12:56:56 +0000 swh-indexer (0.8.1-1~swh1) unstable-swh; urgency=medium * New upstream release 0.8.1 - (tagged by Vincent SELLIER on 2021-12-21 16:23:37 +0100) * Upstream changes: - v0.8.1 - Changelog: - tag frozendict version to avoid segfaults on the ci -- Software Heritage autobuilder (on jenkins-debian1) Tue, 21 Dec 2021 15:28:27 +0000 swh-indexer (0.8.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.8.0 - (tagged by Antoine R. Dumont (@ardumont) on 2021-05-28 16:57:47 +0200) * Upstream changes: - v0.8.0 - metadata_dictionary: Add mapping for CITATION.cff - metadata/maven: Ignore ill-formed xml instead of failing - metadata: Fix UnboundLocalError in edge case - data/codemeta: sync with official codemeta repo - Fix SingleFileMapping case sensitivity - Use swh.core 0.14 - tox: Add sphinx environments to check sane doc build -- Software Heritage autobuilder (on jenkins-debian1) Fri, 28 May 2021 15:05:39 +0000 swh-indexer (0.7.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.7.0 - (tagged by Antoine R. Dumont (@ardumont) on 2021-02-03 14:10:16 +0100) * Upstream changes: - v0.7.0 - Adapt origin_get_latest_visit_status according to latest api change -- Software Heritage autobuilder (on jenkins-debian1) Wed, 03 Feb 2021 13:15:37 +0000 swh-indexer (0.6.4-1~swh1) unstable-swh; urgency=medium * New upstream release 0.6.4 - (tagged by Antoine R. Dumont (@ardumont) on 2021-02-01 15:06:04 +0100) * Upstream changes: - v0.6.4 - indexer: Remove pagination logic using stream_results() instead. - ContentPartitionIndexer: Do not index the same content multiple times at once. - Add a cli section in the doc - test_journal_client_cli: Send production objects to journal - test_journal_client: Migrate away from mocks - tests: Use production backends within the indexer tests -- Software Heritage autobuilder (on jenkins-debian1) Mon, 01 Feb 2021 14:10:18 +0000 swh-indexer (0.6.3-1~swh1) unstable-swh; urgency=medium * New upstream release 0.6.3 - (tagged by Antoine R. Dumont (@ardumont) on 2020-11-27 14:42:30 +0100) * Upstream changes: - v0.6.3 - storage.writer: Fix journal writer sanitizer function -- Software Heritage autobuilder (on jenkins-debian1) Fri, 27 Nov 2020 13:46:03 +0000 swh-indexer (0.6.2-1~swh1) unstable-swh; urgency=medium * New upstream release 0.6.2 - (tagged by Antoine R. Dumont (@ardumont) on 2020-11-27 13:55:53 +0100) * Upstream changes: - v0.6.2 - BaseRow.unique_key: Don't crash when indexer_configuration_id is None. - idx.storage.JournalWriter: pass value_sanitizer to get_journal_writer. -- Software Heritage autobuilder (on jenkins-debian1) Fri, 27 Nov 2020 13:00:28 +0000 swh-indexer (0.6.1-1~swh1) unstable-swh; urgency=medium * New upstream release 0.6.1 - (tagged by Antoine R. Dumont (@ardumont) on 2020-11-27 10:43:14 +0100) * Upstream changes: - v0.6.1 - Fix test within the debian package builds - refactor tests to pytest -- Software Heritage autobuilder (on jenkins-debian1) Fri, 27 Nov 2020 09:49:35 +0000 swh-indexer (0.6.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.6.0 - (tagged by Antoine R. Dumont (@ardumont) on 2020-11-26 17:08:03 +0100) * Upstream changes: - v0.6.0 - indexer.journal_client: Subscribe to OriginVisitStatus topic - swh.indexer.cli.journal_client: ensure the minimal configuration exists - Drop all deprecated uses of `args` in component factories - Drop vcversioner from requirements - Make the indexer storage write to the journal. -- Software Heritage autobuilder (on jenkins-debian1) Thu, 26 Nov 2020 16:39:45 +0000 swh-indexer (0.5.0-2~swh1) unstable-swh; urgency=medium * Move distutils package from python3-swh.indexer to python3-swh.indexer.storage. -- Nicolas Dandrimont Wed, 18 Nov 2020 20:04:23 +0100 swh-indexer (0.5.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.5.0 - (tagged by Valentin Lorentz on 2020-11-06 15:25:04 +0100) * Upstream changes: - v0.5.0 - * Remove metadata deletion endpoints and algorithms - * Remove conflict_update/policy_update option from BaseIndexer.run() - * Remove conflict_update option from _add() endpoints. -- Software Heritage autobuilder (on jenkins-debian1) Fri, 06 Nov 2020 14:28:05 +0000 swh-indexer (0.4.2-1~swh1) unstable-swh; urgency=medium * New upstream release 0.4.2 - (tagged by Antoine R. Dumont (@ardumont) on 2020-10-30 17:22:22 +0100) * Upstream changes: - v0.4.2 - tests.conftest: Fix the indexer scheduler initialization - indexer.cli: Fix missing retries_left parameter - Rename sql files according to new conventions -- Software Heritage autobuilder (on jenkins-debian1) Fri, 30 Oct 2020 16:24:14 +0000 swh-indexer (0.4.1-1~swh1) unstable-swh; urgency=medium * New upstream release 0.4.1 - (tagged by Antoine R. Dumont (@ardumont) on 2020-10-16 10:48:51 +0200) * Upstream changes: - v0.4.1 - test_cli: Remove unneeded config args parameter - api.server: Align configuration structure with clients configuration - storage.api.server: Add types to module and refactor tests -- Software Heritage autobuilder (on jenkins-debian1) Fri, 16 Oct 2020 08:59:09 +0000 swh-indexer (0.4.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.4.0 - (tagged by Antoine R. Dumont (@ardumont) on 2020-10-15 18:17:59 +0200) * Upstream changes: - v0.4.0 - swh.indexer.storage: Unify get_indexer_storage function with others -- Software Heritage autobuilder (on jenkins-debian1) Thu, 15 Oct 2020 16:19:01 +0000 swh-indexer (0.3.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.3.0 - (tagged by Valentin Lorentz on 2020-10-08 13:33:02 +0200) * Upstream changes: - v0.3.0 - * Make indexer-storage endpoints use attr-based classes instead of dicts - * Add more typing to indexers and their tests -- Software Heritage autobuilder (on jenkins-debian1) Thu, 08 Oct 2020 11:35:50 +0000 swh-indexer (0.2.4-1~swh1) unstable-swh; urgency=medium * New upstream release 0.2.4 - (tagged by David Douard on 2020-09-25 12:49:04 +0200) * Upstream changes: - v0.2.4 -- Software Heritage autobuilder (on jenkins-debian1) Fri, 25 Sep 2020 10:51:28 +0000 swh-indexer (0.2.3-1~swh1) unstable-swh; urgency=medium * New upstream release 0.2.3 - (tagged by David Douard on 2020-09-11 15:12:01 +0200) * Upstream changes: - v0.2.3 -- Software Heritage autobuilder (on jenkins-debian1) Fri, 11 Sep 2020 13:15:41 +0000 swh-indexer (0.2.2-1~swh1) unstable-swh; urgency=medium * New upstream release 0.2.2 - (tagged by Antoine R. Dumont (@ardumont) on 2020-09-04 13:21:19 +0200) * Upstream changes: - v0.2.2 - metadata: Adapt to latest storage revision_get change - Tell pytest not to recurse in dotdirs. -- Software Heritage autobuilder (on jenkins-debian1) Fri, 04 Sep 2020 11:33:41 +0000 swh-indexer (0.2.1-1~swh1) unstable-swh; urgency=medium * New upstream release 0.2.1 - (tagged by Valentin Lorentz on 2020-08-20 12:59:53 +0200) * Upstream changes: - v0.2.1 - * indexer.rehash: Adapt content_get_metadata call to content_get - * origin_head: Use snapshot_get_all_branches instead of snapshot_get. - * Import SortedList, db_transaction_generator, and db_transaction from swh- core instead of swh-storage. - * tests: remove invalid assertion -- Software Heritage autobuilder (on jenkins-debian1) Thu, 20 Aug 2020 11:03:58 +0000 swh-indexer (0.2.0-1~swh2) unstable-swh; urgency=medium * Bump dependencies -- Antoine R. Dumont (@ardumont) Wed, 06 Aug 2020 13:28:00 +0200 swh-indexer (0.2.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.2.0 - (tagged by Antoine R. Dumont (@ardumont) on 2020-08-06 15:12:44 +0200) * Upstream changes: - v0.2.0 - Make content indexer work on partition of ids -- Software Heritage autobuilder (on jenkins-debian1) Thu, 06 Aug 2020 13:14:35 +0000 swh-indexer (0.1.1-1~swh1) unstable-swh; urgency=medium * New upstream release 0.1.1 - (tagged by Antoine R. Dumont (@ardumont) on 2020-07-28 12:42:19 +0200) * Upstream changes: - v0.1.1 - setup.py: Migrate from vcversioner to setuptools-scm - MANIFEST: Include missing conftest.py requirement - metadata: Update swh.storage.origin_get call to latest api change - Drop unsupported "validate" proxy - tests: Drop deprecated storage.origin_add_one use - Drop useless use of pifpaf - Clean up the swh.scheduler and swh.storage pytest plugin imports - tests: Drop obsolete origin visit fields -- Software Heritage autobuilder (on jenkins-debian1) Tue, 28 Jul 2020 10:44:54 +0000 swh-indexer (0.1.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.1.0 - (tagged by Antoine R. Dumont (@ardumont) on 2020-06-23 15:44:15 +0200) * Upstream changes: - v0.1.0 - origin_head: Retrieve snapshot out of the last visit status - Fix tests according to latest internal api changes -- Software Heritage autobuilder (on jenkins-debian1) Tue, 23 Jun 2020 13:46:23 +0000 swh-indexer (0.0.171-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.171 - (tagged by Antoine R. Dumont (@ardumont) on 2020-04-23 16:46:52 +0200) * Upstream changes: - v0.0.171 - cli: Adapt journal client instantiation according to latest change - codemeta: Add compatibility with PyLD >= 2.0.0. - setup: Update the minimum required runtime python3 version - Add a pyproject.toml file to target py37 for black - Enable black - test: make test data properly typed - indexer.cli.journal_client: Simplify the journal client call - Remove type from origin_add calls - Rename --max-messages to --stop-after-objects. - tests: Migrate to latest swh-storage api change -- Software Heritage autobuilder (on jenkins-debian1) Thu, 23 Apr 2020 14:49:17 +0000 swh-indexer (0.0.170-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.170 - (tagged by Antoine R. Dumont (@ardumont) on 2020-03-08 09:57:39 +0100) * Upstream changes: - v0.0.170 - indexer.metadata: Make compatible old task format -- Software Heritage autobuilder (on jenkins-debian1) Sun, 08 Mar 2020 09:03:59 +0000 swh-indexer (0.0.169-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.169 - (tagged by Antoine R. Dumont (@ardumont) on 2020-03-06 15:19:21 +0100) * Upstream changes: - v0.0.169 - storage: Add @timed metrics on remaining indexer storage endpoints - indexer.storage: Use the correct metrics module - idx.storage: Add time and counter metric to idx_configuration_add - indexer.storage: Remove redundant calls to send_metric - indexer: Fix mypy issues -- Software Heritage autobuilder (on jenkins-debian1) Fri, 06 Mar 2020 14:24:50 +0000 swh-indexer (0.0.168-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.168 - (tagged by Antoine R. Dumont (@ardumont) on 2020-03-05 15:48:32 +0100) * Upstream changes: - v0.0.168 - mimetype: Make the parsing more resilient - storage.fossology_license_add: Fix one insert query too many - tests: Migrate some tests to pytest -- Software Heritage autobuilder (on jenkins-debian1) Thu, 05 Mar 2020 14:52:27 +0000 swh-indexer (0.0.167-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.167 - (tagged by Antoine R. Dumont (@ardumont) on 2020-03-04 16:33:20 +0100) * Upstream changes: - v0.0.167 - indexer (revision, origin): Fix indexer summary to output a status -- Software Heritage autobuilder (on jenkins-debian1) Wed, 04 Mar 2020 15:37:59 +0000 swh-indexer (0.0.166-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.166 - (tagged by Valentin Lorentz on 2020-03-04 15:46:37 +0100) * Upstream changes: - v0.0.166 - * Fix merging documents with @list elements. -- Software Heritage autobuilder (on jenkins-debian1) Wed, 04 Mar 2020 14:50:54 +0000 swh-indexer (0.0.165-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.165 - (tagged by Antoine R. Dumont (@ardumont) on 2020-03-04 15:29:52 +0100) * Upstream changes: - v0.0.165 - indexers: Fix summary computation for range indexers - tests: Use assertEqual instead of deprecated assertEquals -- Software Heritage autobuilder (on jenkins-debian1) Wed, 04 Mar 2020 14:33:09 +0000 swh-indexer (0.0.164-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.164 - (tagged by Antoine R. Dumont (@ardumont) on 2020-03-04 13:52:15 +0100) * Upstream changes: - v0.0.164 - range-indexers: Fix hard- coded summary key value - indexers: Improve persist_index_computations type - indexer.metadata: Fix wrong update -- Software Heritage autobuilder (on jenkins-debian1) Wed, 04 Mar 2020 13:00:18 +0000 swh-indexer (0.0.163-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.163 - (tagged by Antoine R. Dumont (@ardumont) on 2020-03-04 11:26:56 +0100) * Upstream changes: - v0.0.163 - Make indexers return a summary of their actions - swh.indexer.storage: Add metrics to add/del endpoints - indexer.storage: Make add/del endpoints sum up added objects count - indexer: Remove unused next_step pattern -- Software Heritage autobuilder (on jenkins-debian1) Wed, 04 Mar 2020 10:31:03 +0000 swh-indexer (0.0.162-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.162 - (tagged by Antoine R. Dumont (@ardumont) on 2020-02-27 11:01:29 +0100) * Upstream changes: - v0.0.162 - fossology_license: Improve add query endpoint - pgstorage: Empty temp tables instead of dropping them - indexer.metadata: Fix edge case on unknown origin -- Software Heritage autobuilder (on jenkins-debian1) Thu, 27 Feb 2020 10:09:36 +0000 swh-indexer (0.0.161-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.161 - (tagged by Antoine R. Dumont (@ardumont) on 2020-02-25 12:07:39 +0100) * Upstream changes: - v0.0.161 - sql/128: Add content_mimetype index - storage.db: Improve content range queries to actually finish - Add a new IndexerStorageArgumentException class, for exceptions caused by the client. - Use swh-storage validation proxy. - Fix type errors with hypothesis 5.5 - Add type annotations to indexer classes -- Software Heritage autobuilder (on jenkins-debian1) Tue, 25 Feb 2020 11:20:51 +0000 swh-indexer (0.0.160-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.160 - (tagged by Antoine R. Dumont (@ardumont) on 2020-02-05 18:13:16 +0100) * Upstream changes: - v0.0.160 - Fix missing import -- Software Heritage autobuilder (on jenkins-debian1) Wed, 05 Feb 2020 17:28:18 +0000 swh-indexer (0.0.159-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.159 - (tagged by Antoine R. Dumont (@ardumont) on 2020-02-05 16:01:03 +0100) * Upstream changes: - v0.0.159 - Monkey-patch backend classes instead of 'get_storage' functions. - Fix DeprecationWarning about get_storage args. - Move IndexerStorage documentation and endpoint paths to a new IndexerStorageInterface class. - conftest: Use module's `get_` to instantiate backend - docs: Fix sphinx warnings - Fix merge_documents to work with input document with an @id. - Fix support of VCSs whose HEAD branch is an alias. - Fix type of 'author' in gemspec mapping output. - Fix test_origin_metadata mistakenly broken by e50660efca - Fix several typos reported by pre-commit hooks - Add a pre-commit config file - Remove unused property-based test environment - Migrate tox.ini to extras = xxx instead of deps = .[testing] - Merge tox test environments - Drop version constraint on pytest - Include all requirements in MANIFEST.in -- Software Heritage autobuilder (on jenkins-debian1) Wed, 05 Feb 2020 15:09:42 +0000 swh-indexer (0.0.158-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.158 - (tagged by Antoine R. Dumont (@ardumont) on 2019-11-20 10:26:59 +0100) * Upstream changes: - v0.0.158 - Re-enable tests for the in- memory storage. - Truncate result list instead of doing a copy. - journal client: add support for new origin_visit schema. - Fix alter table rename column syntax on 126->127 upgrade script -- Software Heritage autobuilder (on jenkins-debian1) Wed, 20 Nov 2019 09:30:37 +0000 swh-indexer (0.0.157-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.157 - (tagged by Valentin Lorentz on 2019-11-08 16:33:36 +0100) * Upstream changes: - v0.0.157 - * migrate storage tests to pytest - * proper pagination for IndexerStorage.origin_intrinsic_metadata_search_by_producer -- Software Heritage autobuilder (on jenkins-debian1) Fri, 08 Nov 2019 15:36:48 +0000 swh-indexer (0.0.156-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.156 - (tagged by Stefano Zacchiroli on 2019-11-05 17:36:11 +0100) * Upstream changes: - v0.0.156 - * update indexer for storage 0.0.156 - * cli: fix max-message handling in the journal-client command - * tests: fix test_metadata.py for frozen entities in swh.model.model - * tests: update tests for storage>=0.0.155 - * test_metadata typing: use type-specific mappings instead of cast - * storage/db.py: drop unused format arg regconfig from query - * typing: minimal changes to make a no-op mypy run pass -- Software Heritage autobuilder (on jenkins-debian1) Tue, 05 Nov 2019 16:45:10 +0000 swh-indexer (0.0.155-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.155 - (tagged by Valentin Lorentz on 2019-10-15 14:51:28 +0200) * Upstream changes: - v0.0.155 - * Avoid spamming logs with processed %d messages every message - * tox.ini: Fix py3 environment to use packaged tests - * Remove indirection swh.indexer.storage.api.wsgi to start server - * Add a command- line tool to run metadata translation. -- Software Heritage autobuilder (on jenkins-debian1) Tue, 15 Oct 2019 12:55:33 +0000 swh-indexer (0.0.154-1~swh2) unstable-swh; urgency=medium * Force pg_ctl path -- Nicolas Dandrimont Mon, 07 Oct 2019 16:42:08 +0200 swh-indexer (0.0.154-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.154 - (tagged by Nicolas Dandrimont on 2019-10-07 16:34:20 +0200) * Upstream changes: - Release swh.indexer v0.0.154 - Remove old scheduler compat code - Clean up CLI aliases - Port to python-magic instead of file_magic -- Software Heritage autobuilder (on jenkins-debian1) Mon, 07 Oct 2019 14:38:47 +0000 swh-indexer (0.0.153-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.153 - (tagged by Antoine R. Dumont (@ardumont) on 2019-09-11 11:46:41 +0200) * Upstream changes: - v0.0.153 - indexer-storage: Send smaller batches to origin_get - Update origin_url/from_revision/metadata_tsvector when conflict_update=True - Remove concept of 'minimal set' of metadata - npm: Fix crash on invalid 'author' field - api/client: use RPCClient instead of deprecated SWHRemoteAPI - api/server: use RPCServerApp instead of deprecated SWHServerAPIApp - tests/utils: Fix various test data model issues failing validation -- Software Heritage autobuilder (on jenkins-debian1) Wed, 11 Sep 2019 09:50:58 +0000 swh-indexer (0.0.152-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.152 - (tagged by Valentin Lorentz on 2019-07-19 11:15:41 +0200) * Upstream changes: - Send smaller batches to revision_get -- Software Heritage autobuilder (on jenkins-debian1) Fri, 19 Jul 2019 09:20:34 +0000 swh-indexer (0.0.151-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.151 - (tagged by Valentin Lorentz on 2019-07-03 17:58:32 +0200) * Upstream changes: - v0.0.151 - Fix key names in the journal client; it crashed in prod. -- Software Heritage autobuilder (on jenkins-debian1) Wed, 03 Jul 2019 16:03:07 +0000 swh-indexer (0.0.150-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.150 - (tagged by Antoine R. Dumont (@ardumont) on 2019-07-03 12:09:43 +0200) * Upstream changes: - v0.0.150 - indexer.cli: Drop unused extra alias `--consumer-id` flag -- Software Heritage autobuilder (on jenkins-debian1) Wed, 03 Jul 2019 10:20:46 +0000 swh-indexer (0.0.149-1~swh2) unstable-swh; urgency=medium * No-change: Bump dependency version -- Antoine R. Dumont (@ardumont) Wed, 03 Jul 2019 10:44:12 +0200 swh-indexer (0.0.149-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.149 - (tagged by Antoine R. Dumont (@ardumont) on 2019-07-02 18:11:12 +0200) * Upstream changes: - v0.0.149 - swh.indexer.cli: Fix get_journal_client api call - sql/upgrades/125: Fix migration script -- Software Heritage autobuilder (on jenkins-debian1) Tue, 02 Jul 2019 16:26:50 +0000 swh-indexer (0.0.148-1~swh3) unstable-swh; urgency=medium * Upstream release 0.0.148: Update version dependency -- Antoine Romain Dumont (@ardumont) Mon, 01 Jul 2019 01:50:29 +0100 swh-indexer (0.0.148-1~swh2) unstable-swh; urgency=medium * Upstream release 0.0.148 -- Antoine Romain Dumont (@ardumont) Mon, 01 Jul 2019 01:50:29 +0100 swh-indexer (0.0.148-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.148 - (tagged by Antoine R. Dumont (@ardumont) on 2019-07-01 12:21:32 +0200) * Upstream changes: - v0.0.148 - Manipulate origin URLs instead of origin ids - journal: create tasks for multiple origins - Tests: Improvements -- Software Heritage autobuilder (on jenkins-debian1) Mon, 01 Jul 2019 10:34:26 +0000 swh-indexer (0.0.147-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.147 - (tagged by Antoine Lambert on 2019-05-23 11:03:02 +0200) * Upstream changes: - version 0.0.147 -- Software Heritage autobuilder (on jenkins-debian1) Thu, 23 May 2019 09:11:05 +0000 swh-indexer (0.0.146-1~swh2) unstable-swh; urgency=medium * Remove hypothesis directory -- Nicolas Dandrimont Thu, 18 Apr 2019 18:29:09 +0200 swh-indexer (0.0.146-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.146 - (tagged by Valentin Lorentz on 2019-04-11 11:08:29 +0200) * Upstream changes: - Better explain what the 'string fields' are. -- Software Heritage autobuilder (on jenkins-debian1) Thu, 11 Apr 2019 09:47:24 +0000 swh-indexer (0.0.145-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.145 - (tagged by Valentin Lorentz on 2019-03-15 11:18:25 +0100) * Upstream changes: - Add support for keywords in PKG-INFO. -- Software Heritage autobuilder (on jenkins-debian1) Fri, 15 Mar 2019 11:34:53 +0000 swh-indexer (0.0.144-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.144 - (tagged by Thibault Allançon on 2019-03-07 08:16:49 +0100) * Upstream changes: - Fix heterogeneity of names in metadata tables -- Software Heritage autobuilder (on jenkins-debian1) Thu, 14 Mar 2019 13:30:44 +0000 swh-indexer (0.0.143-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.143 - (tagged by Thibault Allançon on 2019-03-12 10:18:37 +0100) * Upstream changes: - Use hashutil.MultiHash in swh.indexer.tests.test_utils.fill_storage - Summary: Closes T1448 - Reviewers: #reviewers - Subscribers: swh-public-ci - Maniphest Tasks: T1448 - Differential Revision: https://forge.softwareheritage.org/D1235 -- Software Heritage autobuilder (on jenkins-debian1) Wed, 13 Mar 2019 10:24:37 +0000 swh-indexer (0.0.142-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.142 - (tagged by Valentin Lorentz on 2019-03-01 14:19:05 +0100) * Upstream changes: - Skip useless requests. -- Software Heritage autobuilder (on jenkins-debian1) Fri, 01 Mar 2019 13:26:06 +0000 swh-indexer (0.0.141-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.141 - (tagged by Valentin Lorentz on 2019-03-01 10:59:54 +0100) * Upstream changes: - Prevent origin metadata indexer from writing empty records -- Software Heritage autobuilder (on jenkins-debian1) Fri, 01 Mar 2019 10:10:56 +0000 swh-indexer (0.0.140-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.140 - (tagged by Valentin Lorentz on 2019-02-25 10:38:52 +0100) * Upstream changes: - Drop the 'context' and 'type' config of metadata indexers. - They are both ignored already. -- Software Heritage autobuilder (on jenkins-debian1) Mon, 25 Feb 2019 10:40:10 +0000 swh-indexer (0.0.139-1~swh2) unstable-swh; urgency=low * New release fixing debian build -- Antoine Romain Dumont (@ardumont) Fri, 22 Feb 2019 16:27:47 +0100 swh-indexer (0.0.139-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.139 - (tagged by Antoine R. Dumont (@ardumont) on 2019-02-22 15:53:22 +0100) * Upstream changes: - v0.0.139 - Clean up no longer used tasks -- Software Heritage autobuilder (on jenkins-debian1) Fri, 22 Feb 2019 14:59:40 +0000 swh-indexer (0.0.138-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.138 - (tagged by Valentin Lorentz on 2019-02-22 15:30:30 +0100) * Upstream changes: - Make the 'config' argument of OriginMetadaIndexer optional again. -- Software Heritage autobuilder (on jenkins-debian1) Fri, 22 Feb 2019 14:37:35 +0000 swh-indexer (0.0.137-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.137 - (tagged by Antoine R. Dumont (@ardumont) on 2019-02-22 10:59:53 +0100) * Upstream changes: - v0.0.137 - swh.indexer.storage.api.wsgi: Open production wsgi entrypoint - swh.indexer.cli: Move dev app entrypoint in dedicated cli - indexer.storage: Make server load explicit configuration and check - config: use already loaded swh config, if any, when instantiating an Indexer - api: Add support for filtering by tool_id to origin_intrinsic_metadata_search_by_producer. - api: Add storage endpoint to search metadata by mapping. - runtime: Remove implicit configuration from the metadata indexers. - debian: Remove debian packaging from master branch - docs: Update missing documentation -- Software Heritage autobuilder (on jenkins-debian1) Fri, 22 Feb 2019 10:11:29 +0000 swh-indexer (0.0.136-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.136 - (tagged by Valentin Lorentz on 2019-02-14 17:09:00 +0100) * Upstream changes: - Don't send 'None' as a revision id to storage.revision_get. - This error wasn't caught before because the in-mem storage - accepts None values, but the pg storage doesn't. -- Software Heritage autobuilder (on jenkins-debian1) Thu, 14 Feb 2019 16:22:41 +0000 swh-indexer (0.0.135-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.135 - (tagged by Valentin Lorentz on 2019-02-14 14:45:24 +0100) * Upstream changes: - Fix deduplication of origins when persisting origin intrinsic metadata. -- Software Heritage autobuilder (on jenkins-debian1) Thu, 14 Feb 2019 14:32:55 +0000 swh-indexer (0.0.134-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.134 - (tagged by Antoine R. Dumont (@ardumont) on 2019-02-13 23:46:44 +0100) * Upstream changes: - v0.0.134 - package: Break dependency of swh.indexer.storage on swh.indexer. - api/server: Do not read configuration at each request - metadata: Fix gemspec test - metadata: Prevent OriginMetadataIndexer from sending duplicate - revisions to revision_metadata_add. - test: Fix bugs found by hypothesis. - test: Use hypothesis to generate adversarial inputs. - Add more type checks in metadata dictionary. - Add checks in the idx_storage that the same content/rev/orig is not - present twice in the new data. -- Software Heritage autobuilder (on jenkins-debian1) Thu, 14 Feb 2019 09:16:15 +0000 swh-indexer (0.0.133-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.133 - (tagged by Antoine R. Dumont (@ardumont) on 2019-02-12 10:28:01 +0100) * Upstream changes: - v0.0.133 - Migrate BaseDB api calls from core to storage - Improve storage api calls using latest storage api - OriginIndexer: Refactoring - tests: Refactoring - metadata search: Use index - indexer metadata: Provide stats per origin - indexer metadata: Update mapping column - indexer metadata: Improve and fix issues -- Software Heritage autobuilder (on jenkins-debian1) Tue, 12 Feb 2019 09:34:43 +0000 swh-indexer (0.0.132-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.132 - (tagged by Antoine R. Dumont (@ardumont) on 2019-01-30 15:03:14 +0100) * Upstream changes: - v0.0.132 - swh/indexer/tasks: Fix range indexer tasks - Maven: Add support for empty XML nodes. - Add support for alternative call format for Gem::Specification.new. -- Software Heritage autobuilder (on jenkins-debian1) Wed, 30 Jan 2019 14:09:48 +0000 swh-indexer (0.0.131-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.131 - (tagged by Antoine R. Dumont (@ardumont) on 2019-01-30 10:56:43 +0100) * Upstream changes: - v0.0.131 - fix pep8 violations - fix misspellings -- Software Heritage autobuilder (on jenkins-debian1) Wed, 30 Jan 2019 10:01:47 +0000 swh-indexer (0.0.129-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.129 - (tagged by Valentin Lorentz on 2019-01-29 14:11:22 +0100) * Upstream changes: - Fix missing config file name change. -- Software Heritage autobuilder (on jenkins-debian1) Tue, 29 Jan 2019 13:34:17 +0000 swh-indexer (0.0.128-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.128 - (tagged by Valentin Lorentz on 2019-01-25 15:22:52 +0100) * Upstream changes: - Make metadata indexers store the mappings used to translate metadata. -- Software Heritage autobuilder (on jenkins-debian1) Tue, 29 Jan 2019 12:18:16 +0000 swh-indexer (0.0.127-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.127 - (tagged by Valentin Lorentz on 2019-01-15 15:56:49 +0100) * Upstream changes: - Prevent repository normalization from crashing on malformed input. -- Software Heritage autobuilder (on jenkins-debian1) Tue, 15 Jan 2019 16:20:32 +0000 swh-indexer (0.0.126-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.126 - (tagged by Valentin Lorentz on 2019-01-14 11:42:52 +0100) * Upstream changes: - Don't call OriginHeadIndexer.next_step when there is no revision. -- Software Heritage autobuilder (on jenkins-debian1) Mon, 14 Jan 2019 10:57:34 +0000 swh-indexer (0.0.125-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.125 - (tagged by Antoine R. Dumont (@ardumont) on 2019-01-11 12:01:42 +0100) * Upstream changes: - v0.0.125 - Add journal client that listens for origin visits and schedules - OriginHead - Fix tests to work with the new version of swh.storage -- Software Heritage autobuilder (on jenkins-debian1) Fri, 11 Jan 2019 11:08:51 +0000 swh-indexer (0.0.124-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.124 - (tagged by Antoine R. Dumont (@ardumont) on 2019-01-08 14:09:32 +0100) * Upstream changes: - v0.0.124 - indexer: Fix type check on indexing result -- Software Heritage autobuilder (on jenkins-debian1) Thu, 10 Jan 2019 17:12:07 +0000 swh-indexer (0.0.118-1~swh1) unstable-swh; urgency=medium * v0.0.118 * metadata-indexer: Fix setup initialization * tests: Refactoring -- Antoine R. Dumont (@ardumont) Fri, 30 Nov 2018 14:50:52 +0100 swh-indexer (0.0.67-1~swh1) unstable-swh; urgency=medium * v0.0.67 * mimetype: Migrate to indexed data as text -- Antoine R. Dumont (@ardumont) Wed, 28 Nov 2018 11:35:37 +0100 swh-indexer (0.0.66-1~swh1) unstable-swh; urgency=medium * v0.0.66 * range-indexer: Stream indexing range computations -- Antoine R. Dumont (@ardumont) Tue, 27 Nov 2018 11:48:24 +0100 swh-indexer (0.0.65-1~swh1) unstable-swh; urgency=medium * v0.0.65 * Fix revision metadata indexer -- Antoine R. Dumont (@ardumont) Mon, 26 Nov 2018 19:30:48 +0100 swh-indexer (0.0.64-1~swh1) unstable-swh; urgency=medium * v0.0.64 * indexer: Fix mixed identifier encodings issues * Add missing config filename for origin intrinsic metadata indexer. -- Antoine R. Dumont (@ardumont) Mon, 26 Nov 2018 12:20:01 +0100 swh-indexer (0.0.63-1~swh1) unstable-swh; urgency=medium * v0.0.63 * Make the OriginMetadataIndexer fetch rev metadata from the storage * instead of getting them via the scheduler. * Make the 'result_name' key of 'next_step' optional. * Add missing return. * doc: update index to match new swh-doc format -- Antoine R. Dumont (@ardumont) Fri, 23 Nov 2018 17:56:10 +0100 swh-indexer (0.0.62-1~swh1) unstable-swh; urgency=medium * v0.0.62 * metadata indexer: Add empty tool configuration * Add fulltext search on origin intrinsic metadata -- Antoine R. Dumont (@ardumont) Fri, 23 Nov 2018 14:25:55 +0100 swh-indexer (0.0.61-1~swh1) unstable-swh; urgency=medium * v0.0.61 * indexer: Fix origin indexer's default arguments -- Antoine R. Dumont (@ardumont) Wed, 21 Nov 2018 16:01:50 +0100 swh-indexer (0.0.60-1~swh1) unstable-swh; urgency=medium * v0.0.60 * origin_head: Make next step optional * tests: Increase coverage -- Antoine R. Dumont (@ardumont) Wed, 21 Nov 2018 12:33:13 +0100 swh-indexer (0.0.59-1~swh1) unstable-swh; urgency=medium * v0.0.59 * fossology license: Fix issue on license computation * Improve docstrings * Fix pep8 violations * Increase coverage on content indexers -- Antoine R. Dumont (@ardumont) Tue, 20 Nov 2018 14:27:20 +0100 swh-indexer (0.0.58-1~swh1) unstable-swh; urgency=medium * v0.0.58 * Add missing default configuration for fossology license indexer * tests: Remove dead code -- Antoine R. Dumont (@ardumont) Tue, 20 Nov 2018 12:06:56 +0100 swh-indexer (0.0.57-1~swh1) unstable-swh; urgency=medium * v0.0.57 * storage: Open new endpoint on fossology license range retrieval * indexer: Open new fossology license range indexer -- Antoine R. Dumont (@ardumont) Tue, 20 Nov 2018 11:44:57 +0100 swh-indexer (0.0.56-1~swh1) unstable-swh; urgency=medium * v0.0.56 * storage.api: Open new endpoints (mimetype range, fossology range) * content indexers: Open mimetype and fossology range indexers * Remove orchestrator modules * tests: Improve coverage -- Antoine R. Dumont (@ardumont) Mon, 19 Nov 2018 11:56:06 +0100 swh-indexer (0.0.55-1~swh1) unstable-swh; urgency=medium * v0.0.55 * swh.indexer: Let task reschedule itself through the scheduler * Use swh.scheduler instead of celery leaking all around * swh.indexer.orchestrator: Fix orchestrator initialization step * swh.indexer.tasks: Fix type error when no result or list result -- Antoine R. Dumont (@ardumont) Mon, 29 Oct 2018 10:41:54 +0100 swh-indexer (0.0.54-1~swh1) unstable-swh; urgency=medium * v0.0.54 * swh.indexer.tasks: Fix task to use the scheduler's -- Antoine R. Dumont (@ardumont) Thu, 25 Oct 2018 20:13:51 +0200 swh-indexer (0.0.53-1~swh1) unstable-swh; urgency=medium * v0.0.53 * swh.indexer.rehash: Migrate to latest swh.model.hashutil.MultiHash * indexer: Add the origin intrinsic metadata indexer * indexer: Add OriginIndexer and OriginHeadIndexer. * indexer.storage: Add the origin intrinsic metadata storage database * indexer.storage: Autogenerate the Indexer Storage HTTP API. * setup: prepare for pypi upload * tests: Add a tox file * tests: migrate to pytest * tests: Add tests around celery stack * docs: Improve documentation and reuse README in generated documentation -- Antoine R. Dumont (@ardumont) Thu, 25 Oct 2018 19:03:56 +0200 swh-indexer (0.0.52-1~swh1) unstable-swh; urgency=medium * v0.0.52 * swh.indexer.storage: Refactor fossology license get (first external * contribution, cf. /CONTRIBUTORS) * swh.indexer.storage: Fix typo in invariable name metadata * swh.indexer.storage: No longer use temp table when reading data * swh.indexer.storage: Clean up unused import * swh.indexer.storage: Remove dead entry points origin_metadata* * swh.indexer.storage: Update docstrings information and format -- Antoine R. Dumont (@ardumont) Wed, 13 Jun 2018 11:20:40 +0200 swh-indexer (0.0.51-1~swh1) unstable-swh; urgency=medium * Release swh.indexer v0.0.51 * Update for new db_transaction{,_generator} -- Nicolas Dandrimont Tue, 05 Jun 2018 14:10:39 +0200 swh-indexer (0.0.50-1~swh1) unstable-swh; urgency=medium * v0.0.50 * swh.indexer.api.client: Permit to specify the query timeout option -- Antoine R. Dumont (@ardumont) Thu, 24 May 2018 12:19:06 +0200 swh-indexer (0.0.49-1~swh1) unstable-swh; urgency=medium * v0.0.49 * test_storage: Instantiate the tools during tests' setUp phase * test_storage: Deallocate storage during teardown step * test_storage: Make storage test fixture connect to postgres itself * storage.api.server: Only instantiate storage backend once per import * Use thread-aware psycopg2 connection pooling for database access -- Antoine R. Dumont (@ardumont) Mon, 14 May 2018 11:09:30 +0200 swh-indexer (0.0.48-1~swh1) unstable-swh; urgency=medium * Release swh.indexer v0.0.48 * Update for new swh.storage -- Nicolas Dandrimont Sat, 12 May 2018 18:30:10 +0200 swh-indexer (0.0.47-1~swh1) unstable-swh; urgency=medium * v0.0.47 * d/control: Fix runtime typo in packaging dependency -- Antoine R. Dumont (@ardumont) Thu, 07 Dec 2017 16:54:49 +0100 swh-indexer (0.0.46-1~swh1) unstable-swh; urgency=medium * v0.0.46 * Split swh-indexer packages in 2 python3-swh.indexer.storage and * python3-swh.indexer -- Antoine R. Dumont (@ardumont) Thu, 07 Dec 2017 16:18:04 +0100 swh-indexer (0.0.45-1~swh1) unstable-swh; urgency=medium * v0.0.45 * Fix usual error raised when deploying -- Antoine R. Dumont (@ardumont) Thu, 07 Dec 2017 15:01:01 +0100 swh-indexer (0.0.44-1~swh1) unstable-swh; urgency=medium * v0.0.44 * swh.indexer: Make indexer use their own storage -- Antoine R. Dumont (@ardumont) Thu, 07 Dec 2017 13:20:44 +0100 swh-indexer (0.0.43-1~swh1) unstable-swh; urgency=medium * v0.0.43 * swh.indexer.mimetype: Work around problem in detection -- Antoine R. Dumont (@ardumont) Wed, 29 Nov 2017 10:26:11 +0100 swh-indexer (0.0.42-1~swh1) unstable-swh; urgency=medium * v0.0.42 * swh.indexer: Make indexers register tools in prepare method -- Antoine R. Dumont (@ardumont) Fri, 24 Nov 2017 11:26:03 +0100 swh-indexer (0.0.41-1~swh1) unstable-swh; urgency=medium * v0.0.41 * mimetype: Use magic library api instead of parsing `file` cli output -- Antoine R. Dumont (@ardumont) Mon, 20 Nov 2017 13:05:29 +0100 swh-indexer (0.0.39-1~swh1) unstable-swh; urgency=medium * v0.0.39 * swh.indexer.producer: Fix argument to match the abstract definition -- Antoine R. Dumont (@ardumont) Thu, 19 Oct 2017 10:03:44 +0200 swh-indexer (0.0.38-1~swh1) unstable-swh; urgency=medium * v0.0.38 * swh.indexer.indexer: Fix argument to match the abstract definition -- Antoine R. Dumont (@ardumont) Wed, 18 Oct 2017 19:57:47 +0200 swh-indexer (0.0.37-1~swh1) unstable-swh; urgency=medium * v0.0.37 * swh.indexer.indexer: Fix argument to match the abstract definition -- Antoine R. Dumont (@ardumont) Wed, 18 Oct 2017 18:59:42 +0200 swh-indexer (0.0.36-1~swh1) unstable-swh; urgency=medium * v0.0.36 * packaging: Cleanup * codemeta: Adding codemeta.json file to document metadata * swh.indexer.mimetype: Fix edge case regarding empty raw content * docs: sanitize docstrings for sphinx documentation generation * swh.indexer.metadata: Add RevisionMetadataIndexer * swh.indexer.metadata: Add ContentMetadataIndexer * swh.indexer: Refactor base class to improve inheritance * swh.indexer.metadata: First draft of the metadata content indexer * for npm (package.json) * swh.indexer.tests: Added tests for language indexer -- Antoine R. Dumont (@ardumont) Wed, 18 Oct 2017 16:24:24 +0200 swh-indexer (0.0.35-1~swh1) unstable-swh; urgency=medium * Release swh.indexer 0.0.35 * Update tasks to new swh.scheduler API -- Nicolas Dandrimont Mon, 12 Jun 2017 18:02:04 +0200 swh-indexer (0.0.34-1~swh1) unstable-swh; urgency=medium * v0.0.34 * Fix unbound local error on edge case -- Antoine R. Dumont (@ardumont) Wed, 07 Jun 2017 11:23:29 +0200 swh-indexer (0.0.33-1~swh1) unstable-swh; urgency=medium * v0.0.33 * language indexer: Improve edge case policy -- Antoine R. Dumont (@ardumont) Wed, 07 Jun 2017 11:02:47 +0200 swh-indexer (0.0.32-1~swh1) unstable-swh; urgency=medium * v0.0.32 * Update fossology license to use the latest swh-storage * Improve language indexer to deal with potential error on bad * chunking -- Antoine R. Dumont (@ardumont) Tue, 06 Jun 2017 18:13:40 +0200 swh-indexer (0.0.31-1~swh1) unstable-swh; urgency=medium * v0.0.31 * Reduce log verbosity on language indexer -- Antoine R. Dumont (@ardumont) Fri, 02 Jun 2017 19:08:52 +0200 swh-indexer (0.0.30-1~swh1) unstable-swh; urgency=medium * v0.0.30 * Fix wrong default configuration -- Antoine R. Dumont (@ardumont) Fri, 02 Jun 2017 18:01:27 +0200 swh-indexer (0.0.29-1~swh1) unstable-swh; urgency=medium * v0.0.29 * Update indexer to resolve indexer configuration identifier * Adapt language indexer to use partial raw content -- Antoine R. Dumont (@ardumont) Fri, 02 Jun 2017 16:21:27 +0200 swh-indexer (0.0.28-1~swh1) unstable-swh; urgency=medium * v0.0.28 * Add error resilience to fossology indexer -- Antoine R. Dumont (@ardumont) Mon, 22 May 2017 12:57:55 +0200 swh-indexer (0.0.27-1~swh1) unstable-swh; urgency=medium * v0.0.27 * swh.indexer.language: Incremental encoding detection -- Antoine R. Dumont (@ardumont) Wed, 17 May 2017 18:04:27 +0200 swh-indexer (0.0.26-1~swh1) unstable-swh; urgency=medium * v0.0.26 * swh.indexer.orchestrator: Add batch size option per indexer * Log caught exception in a unified manner * Add rescheduling option (not by default) on rehash + indexers -- Antoine R. Dumont (@ardumont) Wed, 17 May 2017 14:08:07 +0200 swh-indexer (0.0.25-1~swh1) unstable-swh; urgency=medium * v0.0.25 * Add reschedule on error parameter for indexers -- Antoine R. Dumont (@ardumont) Fri, 12 May 2017 12:13:15 +0200 swh-indexer (0.0.24-1~swh1) unstable-swh; urgency=medium * v0.0.24 * Make rehash indexer more resilient to errors by rescheduling contents * in error (be it reading or updating problems) -- Antoine R. Dumont (@ardumont) Thu, 04 May 2017 14:22:43 +0200 swh-indexer (0.0.23-1~swh1) unstable-swh; urgency=medium * v0.0.23 * Improve producer to optionally make it synchronous -- Antoine R. Dumont (@ardumont) Wed, 03 May 2017 15:29:44 +0200 swh-indexer (0.0.22-1~swh1) unstable-swh; urgency=medium * v0.0.22 * Improve mimetype indexer implementation * Make the chaining option in the mimetype indexer -- Antoine R. Dumont (@ardumont) Tue, 02 May 2017 16:31:14 +0200 swh-indexer (0.0.21-1~swh1) unstable-swh; urgency=medium * v0.0.21 * swh.indexer.rehash: Actually make the worker log -- Antoine R. Dumont (@ardumont) Tue, 02 May 2017 14:28:55 +0200 swh-indexer (0.0.20-1~swh1) unstable-swh; urgency=medium * v0.0.20 * swh.indexer.rehash: * Improve reading from objstorage only when needed * Fix empty file use case (which was skipped) * Add logging -- Antoine R. Dumont (@ardumont) Fri, 28 Apr 2017 09:39:09 +0200 swh-indexer (0.0.19-1~swh1) unstable-swh; urgency=medium * v0.0.19 * Fix rehash indexer's default configuration file -- Antoine R. Dumont (@ardumont) Thu, 27 Apr 2017 19:17:20 +0200 swh-indexer (0.0.18-1~swh1) unstable-swh; urgency=medium * v0.0.18 * Add new rehash indexer -- Antoine R. Dumont (@ardumont) Wed, 26 Apr 2017 15:23:02 +0200 swh-indexer (0.0.17-1~swh1) unstable-swh; urgency=medium * v0.0.17 * Add information on indexer tools (T610) -- Antoine R. Dumont (@ardumont) Fri, 02 Dec 2016 18:32:54 +0100 swh-indexer (0.0.16-1~swh1) unstable-swh; urgency=medium * v0.0.16 * bug fixes -- Antoine R. Dumont (@ardumont) Tue, 15 Nov 2016 19:31:52 +0100 swh-indexer (0.0.15-1~swh1) unstable-swh; urgency=medium * v0.0.15 * Improve message producer -- Antoine R. Dumont (@ardumont) Tue, 15 Nov 2016 18:16:42 +0100 swh-indexer (0.0.14-1~swh1) unstable-swh; urgency=medium * v0.0.14 * Update package dependency on fossology-nomossa -- Antoine R. Dumont (@ardumont) Tue, 15 Nov 2016 14:13:41 +0100 swh-indexer (0.0.13-1~swh1) unstable-swh; urgency=medium * v0.0.13 * Add new license indexer * ctags indexer: align behavior with other indexers regarding the * conflict update policy -- Antoine R. Dumont (@ardumont) Mon, 14 Nov 2016 14:13:34 +0100 swh-indexer (0.0.12-1~swh1) unstable-swh; urgency=medium * v0.0.12 * Add runtime dependency on universal-ctags -- Antoine R. Dumont (@ardumont) Fri, 04 Nov 2016 13:59:59 +0100 swh-indexer (0.0.11-1~swh1) unstable-swh; urgency=medium * v0.0.11 * Remove dependency on exuberant-ctags -- Antoine R. Dumont (@ardumont) Thu, 03 Nov 2016 16:13:26 +0100 swh-indexer (0.0.10-1~swh1) unstable-swh; urgency=medium * v0.0.10 * Add ctags indexer -- Antoine R. Dumont (@ardumont) Thu, 20 Oct 2016 16:12:42 +0200 swh-indexer (0.0.9-1~swh1) unstable-swh; urgency=medium * v0.0.9 * d/control: Bump dependency to latest python3-swh.storage api * mimetype: Use the charset to filter out data * orchestrator: Separate 2 distincts orchestrators (one for all * contents, one for text contents) * mimetype: once index computed, send text contents to text orchestrator -- Antoine R. Dumont (@ardumont) Thu, 13 Oct 2016 15:28:17 +0200 swh-indexer (0.0.8-1~swh1) unstable-swh; urgency=medium * v0.0.8 * Separate configuration file per indexer (no need for language) * Rename module file_properties to mimetype consistently with other * layers -- Antoine R. Dumont (@ardumont) Sat, 08 Oct 2016 11:46:29 +0200 swh-indexer (0.0.7-1~swh1) unstable-swh; urgency=medium * v0.0.7 * Adapt indexer language and mimetype to store result in storage. * Clean up obsolete code -- Antoine R. Dumont (@ardumont) Sat, 08 Oct 2016 10:26:08 +0200 swh-indexer (0.0.6-1~swh1) unstable-swh; urgency=medium * v0.0.6 * Fix multiple issues on production -- Antoine R. Dumont (@ardumont) Fri, 30 Sep 2016 17:00:11 +0200 swh-indexer (0.0.5-1~swh1) unstable-swh; urgency=medium * v0.0.5 * Fix debian/control dependency issue -- Antoine R. Dumont (@ardumont) Fri, 30 Sep 2016 16:06:20 +0200 swh-indexer (0.0.4-1~swh1) unstable-swh; urgency=medium * v0.0.4 * Upgrade dependencies issues -- Antoine R. Dumont (@ardumont) Fri, 30 Sep 2016 16:01:52 +0200 swh-indexer (0.0.3-1~swh1) unstable-swh; urgency=medium * v0.0.3 * Add encoding detection * Use encoding to improve language detection * bypass language detection for binary files * bypass ctags for binary files or decoding failure file -- Antoine R. Dumont (@ardumont) Fri, 30 Sep 2016 12:30:11 +0200 swh-indexer (0.0.2-1~swh1) unstable-swh; urgency=medium * v0.0.2 * Provide one possible sha1's name for the multiple tools to ease * information extrapolation * Fix debian package dependency issue -- Antoine R. Dumont (@ardumont) Thu, 29 Sep 2016 21:45:44 +0200 swh-indexer (0.0.1-1~swh1) unstable-swh; urgency=medium * Initial release * v0.0.1 * First implementation on poc -- Antoine R. Dumont (@ardumont) Wed, 28 Sep 2016 23:40:13 +0200 diff --git a/swh.indexer.egg-info/PKG-INFO b/swh.indexer.egg-info/PKG-INFO index 860a729..ee4435e 100644 --- a/swh.indexer.egg-info/PKG-INFO +++ b/swh.indexer.egg-info/PKG-INFO @@ -1,71 +1,71 @@ Metadata-Version: 2.1 Name: swh.indexer -Version: 2.4.4 +Version: 2.5.0 Summary: Software Heritage Content Indexer Home-page: https://forge.softwareheritage.org/diffusion/78/ Author: Software Heritage developers Author-email: swh-devel@inria.fr Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-indexer Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-indexer/ Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing License-File: LICENSE License-File: AUTHORS swh-indexer ============ Tools to compute multiple indexes on SWH's raw contents: - content: - mimetype - ctags - language - fossology-license - metadata - revision: - metadata An indexer is in charge of: - looking up objects - extracting information from those objects - store those information in the swh-indexer db There are multiple indexers working on different object types: - content indexer: works with content sha1 hashes - revision indexer: works with revision sha1 hashes - origin indexer: works with origin identifiers Indexation procedure: - receive batch of ids - retrieve the associated data depending on object type - compute for that object some index - store the result to swh's storage Current content indexers: - mimetype (queue swh_indexer_content_mimetype): detect the encoding and mimetype - language (queue swh_indexer_content_language): detect the programming language - ctags (queue swh_indexer_content_ctags): compute tags information - fossology-license (queue swh_indexer_fossology_license): compute the license - metadata: translate file into translated_metadata dict Current revision indexers: - metadata: detects files containing metadata and retrieves translated_metadata in content_metadata table in storage or run content indexer to translate files. diff --git a/swh/indexer/cli.py b/swh/indexer/cli.py index f5c8889..2c7318f 100644 --- a/swh/indexer/cli.py +++ b/swh/indexer/cli.py @@ -1,397 +1,407 @@ # Copyright (C) 2019-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from typing import Callable, Dict, Iterator, List, Optional # WARNING: do not import unnecessary things here to keep cli startup time under # control import click from swh.core.cli import CONTEXT_SETTINGS, AliasedGroup from swh.core.cli import swh as swh_cli_group @swh_cli_group.group( name="indexer", context_settings=CONTEXT_SETTINGS, cls=AliasedGroup ) @click.option( "--config-file", "-C", default=None, type=click.Path( exists=True, dir_okay=False, ), help="Configuration file.", ) @click.pass_context def indexer_cli_group(ctx, config_file): """Software Heritage Indexer tools. The Indexer is used to mine the content of the archive and extract derived information from archive source code artifacts. """ from swh.core import config ctx.ensure_object(dict) conf = config.read(config_file) ctx.obj["config"] = conf def _get_api(getter, config, config_key, url): if url: config[config_key] = {"cls": "remote", "url": url} elif config_key not in config: raise click.ClickException("Missing configuration for {}".format(config_key)) return getter(**config[config_key]) @indexer_cli_group.group("mapping") def mapping(): """Manage Software Heritage Indexer mappings.""" pass @mapping.command("list") def mapping_list(): """Prints the list of known mappings.""" from swh.indexer import metadata_dictionary mapping_names = [mapping.name for mapping in metadata_dictionary.MAPPINGS.values()] mapping_names.sort() for mapping_name in mapping_names: click.echo(mapping_name) @mapping.command("list-terms") @click.option( "--exclude-mapping", multiple=True, help="Exclude the given mapping from the output" ) @click.option( "--concise", is_flag=True, default=False, help="Don't print the list of mappings supporting each term.", ) def mapping_list_terms(concise, exclude_mapping): """Prints the list of known CodeMeta terms, and which mappings support them.""" from swh.indexer import metadata_dictionary properties = metadata_dictionary.list_terms() for (property_name, supported_mappings) in sorted(properties.items()): supported_mappings = {m.name for m in supported_mappings} supported_mappings -= set(exclude_mapping) if supported_mappings: if concise: click.echo(property_name) else: click.echo("{}:".format(property_name)) click.echo("\t" + ", ".join(sorted(supported_mappings))) @mapping.command("translate") @click.argument("mapping-name") @click.argument("file", type=click.File("rb")) def mapping_translate(mapping_name, file): """Translates file from mapping-name to codemeta format.""" import json from swh.indexer import metadata_dictionary mapping_cls = [ cls for cls in metadata_dictionary.MAPPINGS.values() if cls.name == mapping_name ] if not mapping_cls: raise click.ClickException("Unknown mapping {}".format(mapping_name)) assert len(mapping_cls) == 1 mapping_cls = mapping_cls[0] mapping = mapping_cls() codemeta_doc = mapping.translate(file.read()) click.echo(json.dumps(codemeta_doc, indent=4)) @indexer_cli_group.group("schedule") @click.option("--scheduler-url", "-s", default=None, help="URL of the scheduler API") @click.option( "--indexer-storage-url", "-i", default=None, help="URL of the indexer storage API" ) @click.option( "--storage-url", "-g", default=None, help="URL of the (graph) storage API" ) @click.option( "--dry-run/--no-dry-run", is_flag=True, default=False, help="List only what would be scheduled.", ) @click.pass_context def schedule(ctx, scheduler_url, storage_url, indexer_storage_url, dry_run): """Manipulate Software Heritage Indexer tasks. Via SWH Scheduler's API.""" from swh.indexer.storage import get_indexer_storage from swh.scheduler import get_scheduler from swh.storage import get_storage ctx.obj["indexer_storage"] = _get_api( get_indexer_storage, ctx.obj["config"], "indexer_storage", indexer_storage_url ) ctx.obj["storage"] = _get_api( get_storage, ctx.obj["config"], "storage", storage_url ) ctx.obj["scheduler"] = _get_api( get_scheduler, ctx.obj["config"], "scheduler", scheduler_url ) if dry_run: ctx.obj["scheduler"] = None def list_origins_by_producer(idx_storage, mappings, tool_ids) -> Iterator[str]: next_page_token = "" limit = 10000 while next_page_token is not None: result = idx_storage.origin_intrinsic_metadata_search_by_producer( page_token=next_page_token, limit=limit, ids_only=True, mappings=mappings or None, tool_ids=tool_ids or None, ) next_page_token = result.next_page_token yield from result.results @schedule.command("reindex_origin_metadata") @click.option( "--batch-size", "-b", "origin_batch_size", default=10, show_default=True, type=int, help="Number of origins per task", ) @click.option( "--tool-id", "-t", "tool_ids", type=int, multiple=True, help="Restrict search of old metadata to this/these tool ids.", ) @click.option( "--mapping", "-m", "mappings", multiple=True, help="Mapping(s) that should be re-scheduled (eg. 'npm', 'gemspec', 'maven')", ) @click.option( "--task-type", default="index-origin-metadata", show_default=True, help="Name of the task type to schedule.", ) @click.pass_context def schedule_origin_metadata_reindex( ctx, origin_batch_size, tool_ids, mappings, task_type ): """Schedules indexing tasks for origins that were already indexed.""" from swh.scheduler.cli_utils import schedule_origin_batches idx_storage = ctx.obj["indexer_storage"] scheduler = ctx.obj["scheduler"] origins = list_origins_by_producer(idx_storage, mappings, tool_ids) kwargs = {"retries_left": 1} schedule_origin_batches(scheduler, task_type, origins, origin_batch_size, kwargs) @indexer_cli_group.command("journal-client") @click.argument( "indexer", type=click.Choice( [ "origin_intrinsic_metadata", "extrinsic_metadata", "content_mimetype", "content_fossology_license", "*", ] ), required=False # TODO: remove required=False after we stop using it ) @click.option("--scheduler-url", "-s", default=None, help="URL of the scheduler API") @click.option( "--origin-metadata-task-type", default="index-origin-metadata", help="Name of the task running the origin metadata indexer.", ) @click.option( "--broker", "brokers", type=str, multiple=True, help="Kafka broker to connect to." ) @click.option( "--prefix", type=str, default=None, help="Prefix of Kafka topic names to read from." ) @click.option("--group-id", type=str, help="Consumer/group id for reading from Kafka.") @click.option( "--stop-after-objects", "-m", default=None, type=int, help="Maximum number of objects to replay. Default is to run forever.", ) +@click.option( + "--batch-size", + "-b", + default=None, + type=int, + help="Batch size. Default is 200.", +) @click.pass_context def journal_client( ctx, indexer: Optional[str], scheduler_url: str, origin_metadata_task_type: str, brokers: List[str], prefix: str, group_id: str, stop_after_objects: Optional[int], + batch_size: Optional[int], ): """ Listens for new objects from the SWH Journal, and either: * runs the indexer with the name passed as argument, if any * schedules tasks to run relevant indexers (currently, only origin_intrinsic_metadata) on these new objects otherwise. Passing '*' as indexer name runs all indexers. """ import functools import warnings from swh.indexer.indexer import BaseIndexer, ObjectsDict from swh.indexer.journal_client import process_journal_objects from swh.journal.client import get_journal_client from swh.scheduler import get_scheduler cfg = ctx.obj["config"] journal_cfg = cfg.get("journal", {}) scheduler = _get_api(get_scheduler, cfg, "scheduler", scheduler_url) brokers = brokers or journal_cfg.get("brokers") if not brokers: raise ValueError("The brokers configuration is mandatory.") prefix = prefix or journal_cfg.get("prefix") group_id = group_id or journal_cfg.get("group_id") origin_metadata_task_type = origin_metadata_task_type or journal_cfg.get( "origin_metadata_task_type" ) stop_after_objects = stop_after_objects or journal_cfg.get("stop_after_objects") + batch_size = batch_size or journal_cfg.get("batch_size", 200) object_types = set() worker_fns: List[Callable[[ObjectsDict], Dict]] = [] if indexer is None: warnings.warn( "'swh indexer journal-client' with no argument creates scheduler tasks " "to index, rather than index directly.", DeprecationWarning, ) object_types.add("origin_visit_status") worker_fns.append( functools.partial( process_journal_objects, scheduler=scheduler, task_names={ "origin_metadata": origin_metadata_task_type, }, ) ) idx: Optional[BaseIndexer] = None if indexer in ("origin_intrinsic_metadata", "*"): from swh.indexer.metadata import OriginMetadataIndexer object_types.add("origin_visit_status") idx = OriginMetadataIndexer() idx.catch_exceptions = False # don't commit offsets if indexation failed worker_fns.append(idx.process_journal_objects) if indexer in ("extrinsic_metadata", "*"): from swh.indexer.metadata import ExtrinsicMetadataIndexer object_types.add("raw_extrinsic_metadata") idx = ExtrinsicMetadataIndexer() idx.catch_exceptions = False # don't commit offsets if indexation failed worker_fns.append(idx.process_journal_objects) if indexer in ("content_mimetype", "*"): from swh.indexer.mimetype import MimetypeIndexer object_types.add("content") idx = MimetypeIndexer() idx.catch_exceptions = False # don't commit offsets if indexation failed worker_fns.append(idx.process_journal_objects) if indexer in ("content_fossology_license", "*"): from swh.indexer.fossology_license import FossologyLicenseIndexer object_types.add("content") idx = FossologyLicenseIndexer() idx.catch_exceptions = False # don't commit offsets if indexation failed worker_fns.append(idx.process_journal_objects) if not worker_fns: raise click.ClickException(f"Unknown indexer: {indexer}") client = get_journal_client( cls="kafka", brokers=brokers, prefix=prefix, group_id=group_id, object_types=list(object_types), stop_after_objects=stop_after_objects, + batch_size=batch_size, ) def worker_fn(objects: ObjectsDict): for fn in worker_fns: fn(objects) try: client.process(worker_fn) except KeyboardInterrupt: ctx.exit(0) else: print("Done.") finally: client.close() @indexer_cli_group.command("rpc-serve") @click.argument("config-path", required=True) @click.option("--host", default="0.0.0.0", help="Host to run the server") @click.option("--port", default=5007, type=click.INT, help="Binding port of the server") @click.option( "--debug/--nodebug", default=True, help="Indicates if the server should run in debug mode", ) def rpc_server(config_path, host, port, debug): """Starts a Software Heritage Indexer RPC HTTP server.""" from swh.indexer.storage.api.server import app, load_and_check_config api_cfg = load_and_check_config(config_path, type="any") app.config.update(api_cfg) app.run(host, port=int(port), debug=bool(debug)) def main(): return indexer_cli_group(auto_envvar_prefix="SWH_INDEXER") if __name__ == "__main__": main()