diff --git a/PKG-INFO b/PKG-INFO index 9732f47..43d1939 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,71 +1,71 @@ Metadata-Version: 2.1 Name: swh.indexer -Version: 0.6.0 +Version: 0.6.1 Summary: Software Heritage Content Indexer Home-page: https://forge.softwareheritage.org/diffusion/78/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-indexer Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-indexer/ Description: swh-indexer ============ Tools to compute multiple indexes on SWH's raw contents: - content: - mimetype - ctags - language - fossology-license - metadata - revision: - metadata An indexer is in charge of: - looking up objects - extracting information from those objects - store those information in the swh-indexer db There are multiple indexers working on different object types: - content indexer: works with content sha1 hashes - revision indexer: works with revision sha1 hashes - origin indexer: works with origin identifiers Indexation procedure: - receive batch of ids - retrieve the associated data depending on object type - compute for that object some index - store the result to swh's storage Current content indexers: - mimetype (queue swh_indexer_content_mimetype): detect the encoding and mimetype - language (queue swh_indexer_content_language): detect the programming language - ctags (queue swh_indexer_content_ctags): compute tags information - fossology-license (queue swh_indexer_fossology_license): compute the license - metadata: translate file into translated_metadata dict Current revision indexers: - metadata: detects files containing metadata and retrieves translated_metadata in content_metadata table in storage or run content indexer to translate files. Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing diff --git a/swh.indexer.egg-info/PKG-INFO b/swh.indexer.egg-info/PKG-INFO index 9732f47..43d1939 100644 --- a/swh.indexer.egg-info/PKG-INFO +++ b/swh.indexer.egg-info/PKG-INFO @@ -1,71 +1,71 @@ Metadata-Version: 2.1 Name: swh.indexer -Version: 0.6.0 +Version: 0.6.1 Summary: Software Heritage Content Indexer Home-page: https://forge.softwareheritage.org/diffusion/78/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-indexer Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-indexer/ Description: swh-indexer ============ Tools to compute multiple indexes on SWH's raw contents: - content: - mimetype - ctags - language - fossology-license - metadata - revision: - metadata An indexer is in charge of: - looking up objects - extracting information from those objects - store those information in the swh-indexer db There are multiple indexers working on different object types: - content indexer: works with content sha1 hashes - revision indexer: works with revision sha1 hashes - origin indexer: works with origin identifiers Indexation procedure: - receive batch of ids - retrieve the associated data depending on object type - compute for that object some index - store the result to swh's storage Current content indexers: - mimetype (queue swh_indexer_content_mimetype): detect the encoding and mimetype - language (queue swh_indexer_content_language): detect the programming language - ctags (queue swh_indexer_content_ctags): compute tags information - fossology-license (queue swh_indexer_fossology_license): compute the license - metadata: translate file into translated_metadata dict Current revision indexers: - metadata: detects files containing metadata and retrieves translated_metadata in content_metadata table in storage or run content indexer to translate files. Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing diff --git a/swh/indexer/tests/test_journal_client.py b/swh/indexer/tests/test_journal_client.py index 38e4386..21e5e0b 100644 --- a/swh/indexer/tests/test_journal_client.py +++ b/swh/indexer/tests/test_journal_client.py @@ -1,153 +1,132 @@ # Copyright (C) 2019-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import unittest + from unittest.mock import Mock, patch from swh.indexer.journal_client import process_journal_objects -class JournalClientTest(unittest.TestCase): - def test_one_origin_visit_status(self): - mock_scheduler = Mock() - messages = { - "origin_visit_status": [{"status": "full", "origin": "file:///dev/zero",},] - } - process_journal_objects( - messages, - scheduler=mock_scheduler, - task_names={"origin_metadata": "task-name"}, - ) - self.assertTrue(mock_scheduler.create_tasks.called) - call_args = mock_scheduler.create_tasks.call_args - (args, kwargs) = call_args - self.assertEqual(kwargs, {}) - del args[0][0]["next_run"] - self.assertEqual( - args, - ( - [ - { - "arguments": {"kwargs": {}, "args": (["file:///dev/zero"],),}, - "policy": "oneshot", - "type": "task-name", - "retries_left": 1, - }, - ], - ), - ) +def test_one_origin_visit_status(): + mock_scheduler = Mock() + messages = { + "origin_visit_status": [{"status": "full", "origin": "file:///dev/zero",},] + } + process_journal_objects( + messages, scheduler=mock_scheduler, task_names={"origin_metadata": "task-name"}, + ) + assert mock_scheduler.create_tasks.called is True + call_args = mock_scheduler.create_tasks.call_args + (args, kwargs) = call_args + assert kwargs == {} + del args[0][0]["next_run"] + assert args == ( + [ + { + "arguments": {"kwargs": {}, "args": (["file:///dev/zero"],),}, + "policy": "oneshot", + "type": "task-name", + "retries_left": 1, + }, + ], + ) + + +def test_origin_visit_legacy(): + mock_scheduler = Mock() + messages = { + "origin_visit_status": [ + {"status": "full", "origin": {"url": "file:///dev/zero",}}, + ] + } + process_journal_objects( + messages, scheduler=mock_scheduler, task_names={"origin_metadata": "task-name"}, + ) + assert mock_scheduler.create_tasks.called is True + call_args = mock_scheduler.create_tasks.call_args + (args, kwargs) = call_args + assert kwargs == {} + del args[0][0]["next_run"] + assert args == ( + [ + { + "arguments": {"kwargs": {}, "args": (["file:///dev/zero"],),}, + "policy": "oneshot", + "type": "task-name", + "retries_left": 1, + }, + ], + ) + - def test_origin_visit_legacy(self): - mock_scheduler = Mock() - messages = { - "origin_visit_status": [ - {"status": "full", "origin": {"url": "file:///dev/zero",}}, - ] - } - process_journal_objects( - messages, - scheduler=mock_scheduler, - task_names={"origin_metadata": "task-name"}, - ) - self.assertTrue(mock_scheduler.create_tasks.called) - call_args = mock_scheduler.create_tasks.call_args - (args, kwargs) = call_args - self.assertEqual(kwargs, {}) - del args[0][0]["next_run"] - self.assertEqual( - args, - ( - [ - { - "arguments": {"kwargs": {}, "args": (["file:///dev/zero"],),}, - "policy": "oneshot", - "type": "task-name", - "retries_left": 1, - }, - ], - ), - ) +def test_one_origin_visit_batch(): + mock_scheduler = Mock() + messages = { + "origin_visit_status": [ + {"status": "full", "origin": "file:///dev/zero",}, + {"status": "full", "origin": "file:///tmp/foobar",}, + ] + } + process_journal_objects( + messages, scheduler=mock_scheduler, task_names={"origin_metadata": "task-name"}, + ) + assert mock_scheduler.create_tasks.called is True + call_args = mock_scheduler.create_tasks.call_args + (args, kwargs) = call_args + assert kwargs == {} + del args[0][0]["next_run"] + assert args == ( + [ + { + "arguments": { + "kwargs": {}, + "args": (["file:///dev/zero", "file:///tmp/foobar"],), + }, + "policy": "oneshot", + "type": "task-name", + "retries_left": 1, + }, + ], + ) - def test_one_origin_visit_batch(self): - mock_scheduler = Mock() - messages = { - "origin_visit_status": [ - {"status": "full", "origin": "file:///dev/zero",}, - {"status": "full", "origin": "file:///tmp/foobar",}, - ] - } - process_journal_objects( - messages, - scheduler=mock_scheduler, - task_names={"origin_metadata": "task-name"}, - ) - self.assertTrue(mock_scheduler.create_tasks.called) - call_args = mock_scheduler.create_tasks.call_args - (args, kwargs) = call_args - self.assertEqual(kwargs, {}) - del args[0][0]["next_run"] - self.assertEqual( - args, - ( - [ - { - "arguments": { - "kwargs": {}, - "args": (["file:///dev/zero", "file:///tmp/foobar"],), - }, - "policy": "oneshot", - "type": "task-name", - "retries_left": 1, - }, - ], - ), - ) - @patch("swh.indexer.journal_client.MAX_ORIGINS_PER_TASK", 2) - def test_origin_visit_batches(self): - mock_scheduler = Mock() - messages = { - "origin_visit_status": [ - {"status": "full", "origin": "file:///dev/zero",}, - {"status": "full", "origin": "file:///tmp/foobar",}, - {"status": "full", "origin": "file:///tmp/spamegg",}, - ] - } - process_journal_objects( - messages, - scheduler=mock_scheduler, - task_names={"origin_metadata": "task-name"}, - ) - self.assertTrue(mock_scheduler.create_tasks.called) - call_args = mock_scheduler.create_tasks.call_args - (args, kwargs) = call_args - self.assertEqual(kwargs, {}) - del args[0][0]["next_run"] - del args[0][1]["next_run"] - self.assertEqual( - args, - ( - [ - { - "arguments": { - "kwargs": {}, - "args": (["file:///dev/zero", "file:///tmp/foobar"],), - }, - "policy": "oneshot", - "type": "task-name", - "retries_left": 1, - }, - { - "arguments": { - "kwargs": {}, - "args": (["file:///tmp/spamegg"],), - }, - "policy": "oneshot", - "type": "task-name", - "retries_left": 1, - }, - ], - ), - ) +@patch("swh.indexer.journal_client.MAX_ORIGINS_PER_TASK", 2) +def test_origin_visit_batches(): + mock_scheduler = Mock() + messages = { + "origin_visit_status": [ + {"status": "full", "origin": "file:///dev/zero",}, + {"status": "full", "origin": "file:///tmp/foobar",}, + {"status": "full", "origin": "file:///tmp/spamegg",}, + ] + } + process_journal_objects( + messages, scheduler=mock_scheduler, task_names={"origin_metadata": "task-name"}, + ) + assert mock_scheduler.create_tasks.called is True + call_args = mock_scheduler.create_tasks.call_args + (args, kwargs) = call_args + assert kwargs == {} + del args[0][0]["next_run"] + del args[0][1]["next_run"] + assert args == ( + [ + { + "arguments": { + "kwargs": {}, + "args": (["file:///dev/zero", "file:///tmp/foobar"],), + }, + "policy": "oneshot", + "type": "task-name", + "retries_left": 1, + }, + { + "arguments": {"kwargs": {}, "args": (["file:///tmp/spamegg"],),}, + "policy": "oneshot", + "type": "task-name", + "retries_left": 1, + }, + ], + ) diff --git a/swh/indexer/tests/test_mimetype.py b/swh/indexer/tests/test_mimetype.py index d3aef57..73d8d41 100644 --- a/swh/indexer/tests/test_mimetype.py +++ b/swh/indexer/tests/test_mimetype.py @@ -1,125 +1,140 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from typing import Any, Dict import unittest import pytest from swh.indexer.mimetype import ( MimetypeIndexer, MimetypePartitionIndexer, compute_mimetype_encoding, ) from swh.indexer.storage.model import ContentMimetypeRow from swh.indexer.tests.utils import ( BASE_TEST_CONFIG, CommonContentIndexerPartitionTest, CommonContentIndexerTest, fill_obj_storage, fill_storage, filter_dict, ) from swh.model.hashutil import hash_to_bytes -def test_compute_mimetype_encoding(): - """Compute mimetype encoding should return results""" - for _input, _mimetype, _encoding in [ +@pytest.mark.parametrize( + "raw_text,mimetype,encoding", + [ ("du français".encode(), "text/plain", "utf-8"), - (b"def __init__(self):", "text/x-python", "us-ascii"), + (b"def __init__(self):", ("text/x-python", "text/x-script.python"), "us-ascii"), (b"\xff\xfe\x00\x00\x00\x00\xff\xfe\xff\xff", "application/octet-stream", ""), - ]: - actual_result = compute_mimetype_encoding(_input) - assert actual_result == {"mimetype": _mimetype, "encoding": _encoding} + ], +) +def test_compute_mimetype_encoding(raw_text, mimetype, encoding): + """Compute mimetype encoding should return results""" + actual_result = compute_mimetype_encoding(raw_text) + if isinstance(mimetype, tuple): + # New magic version can return different results, this deals with such a case + expected_result = {"mimetype": mimetype[0], "encoding": encoding} + # as a fallback + fallback_expected_result = {"mimetype": mimetype[1], "encoding": encoding} + else: + expected_result = {"mimetype": mimetype, "encoding": encoding} + fallback_expected_result = expected_result + + try: + assert actual_result == expected_result + except AssertionError: + assert actual_result == fallback_expected_result CONFIG = { **BASE_TEST_CONFIG, "tools": { "name": "file", "version": "1:5.30-1+deb9u1", "configuration": {"type": "library", "debian-package": "python3-magic"}, }, } # type: Dict[str, Any] class TestMimetypeIndexer(CommonContentIndexerTest, unittest.TestCase): """Mimetype indexer test scenarios: - Known sha1s in the input list have their data indexed - Unknown sha1 in the input list are not indexed """ def get_indexer_results(self, ids): yield from self.idx_storage.content_mimetype_get(ids) def setUp(self): self.indexer = MimetypeIndexer(config=CONFIG) self.indexer.catch_exceptions = False self.idx_storage = self.indexer.idx_storage fill_storage(self.indexer.storage) fill_obj_storage(self.indexer.objstorage) self.id0 = "01c9379dfc33803963d07c1ccc748d3fe4c96bb5" self.id1 = "688a5ef812c53907562fe379d4b3851e69c7cb15" self.id2 = "da39a3ee5e6b4b0d3255bfef95601890afd80709" tool = {k.replace("tool_", ""): v for (k, v) in self.indexer.tool.items()} self.expected_results = [ ContentMimetypeRow( id=hash_to_bytes(self.id0), tool=tool, mimetype="text/plain", encoding="us-ascii", ), ContentMimetypeRow( id=hash_to_bytes(self.id1), tool=tool, mimetype="text/plain", encoding="us-ascii", ), ContentMimetypeRow( id=hash_to_bytes(self.id2), tool=tool, mimetype="application/x-empty", encoding="binary", ), ] RANGE_CONFIG = dict(list(CONFIG.items()) + [("write_batch_size", 100)]) class TestMimetypePartitionIndexer( CommonContentIndexerPartitionTest, unittest.TestCase ): """Range Mimetype Indexer tests. - new data within range are indexed - no data outside a range are indexed - with filtering existing indexed data prior to compute new index - without filtering existing indexed data prior to compute new index """ def setUp(self): super().setUp() self.indexer = MimetypePartitionIndexer(config=RANGE_CONFIG) self.indexer.catch_exceptions = False fill_storage(self.indexer.storage) fill_obj_storage(self.indexer.objstorage) def test_mimetype_w_no_tool(): with pytest.raises(ValueError): MimetypeIndexer(config=filter_dict(CONFIG, "tools")) def test_mimetype_range_w_no_tool(): with pytest.raises(ValueError): MimetypePartitionIndexer(config=filter_dict(CONFIG, "tools"))