diff --git a/swh/indexer/cli.py b/swh/indexer/cli.py
--- a/swh/indexer/cli.py
+++ b/swh/indexer/cli.py
@@ -220,6 +220,7 @@
             "origin-intrinsic-metadata",
             "extrinsic-metadata",
             "content-mimetype",
+            "content-fossology-license",
             "*",
         ]
     ),
@@ -336,6 +337,14 @@
         idx.catch_exceptions = False  # don't commit offsets if indexation failed
         worker_fns.append(idx.process_journal_objects)
 
+    if indexer in ("content-fossology-license", "*"):
+        from swh.indexer.fossology_license import FossologyLicenseIndexer
+
+        object_types.add("content")
+        idx = FossologyLicenseIndexer()
+        idx.catch_exceptions = False  # don't commit offsets if indexation failed
+        worker_fns.append(idx.process_journal_objects)
+
     if not worker_fns:
         raise click.ClickException(f"Unknown indexer: {indexer}")
 
diff --git a/swh/indexer/tests/test_cli.py b/swh/indexer/tests/test_cli.py
--- a/swh/indexer/tests/test_cli.py
+++ b/swh/indexer/tests/test_cli.py
@@ -27,7 +27,14 @@
 from swh.model.model import Content, Origin, OriginVisitStatus
 
 from .test_metadata import REMD
-from .utils import DIRECTORY2, RAW_CONTENTS, REVISION
+from .utils import (
+    DIRECTORY2,
+    RAW_CONTENT_IDS,
+    RAW_CONTENTS,
+    REVISION,
+    SHA1_TO_LICENSES,
+    mock_compute_license,
+)
 
 
 def fill_idx_storage(idx_storage: IndexerStorageInterface, nb_rows: int) -> List[int]:
@@ -786,7 +793,6 @@
 
     assert len(contents) == len(RAW_CONTENTS)
 
-    storage.content_add(contents)
     journal_writer.write_additions("content", contents)
 
     result = cli_runner.invoke(
@@ -814,6 +820,91 @@
     assert result.output == expected_output
 
     results = idx_storage.content_mimetype_get(content_ids)
-    assert len(results) > 0
+    assert len(results) == len(contents)
+    for result in results:
+        assert result in expected_results
+
+
+def test_cli_journal_client_index__fossology_license(
+    cli_runner,
+    swh_config,
+    kafka_prefix: str,
+    kafka_server,
+    consumer: Consumer,
+    idx_storage,
+    obj_storage,
+    storage,
+    mocker,
+    swh_indexer_config,
+):
+    """Test the 'swh indexer journal-client' cli tool."""
+
+    from swh.indexer import fossology_license
+
+    # Patch
+    fossology_license.compute_license = mock_compute_license
+
+    journal_writer = get_journal_writer(
+        "kafka",
+        brokers=[kafka_server],
+        prefix=kafka_prefix,
+        client_id="test producer",
+        value_sanitizer=lambda object_type, value: value,
+        flush_timeout=3,  # fail early if something is going wrong
+    )
+
+    from swh.indexer.storage.model import ContentLicenseRow
+
+    tool = {"id": 1, **swh_indexer_config["tools"]}
+
+    id0, id1, id2 = RAW_CONTENT_IDS
+
+    contents = []
+    content_ids = []
+    expected_results = []
+    for content_id, (raw_content, _, _) in RAW_CONTENTS.items():
+        content = Content.from_data(raw_content)
+        assert content_id == content.sha1
+
+        contents.append(content)
+        content_ids.append(content_id)
+
+        expected_results.extend(
+            [
+                ContentLicenseRow(id=content_id, tool=tool, license=license)
+                for license in SHA1_TO_LICENSES[content_id]
+            ]
+        )
+
+    assert len(contents) == len(RAW_CONTENTS)
+
+    journal_writer.write_additions("content", contents)
+
+    result = cli_runner.invoke(
+        indexer_cli_group,
+        [
+            "-C",
+            swh_config,
+            "journal-client",
+            "content-fossology-license",
+            "--broker",
+            kafka_server,
+            "--prefix",
+            kafka_prefix,
+            "--group-id",
+            "test-consumer",
+            "--stop-after-objects",
+            len(contents),
+        ],
+        catch_exceptions=False,
+    )
+
+    # Check the output
+    expected_output = "Done.\n"
+    assert result.exit_code == 0, result.output
+    assert result.output == expected_output
+
+    results = idx_storage.content_fossology_license_get(content_ids)
+    assert len(results) == len(expected_results)
+    for result in results:
+        assert result in expected_results
diff --git a/swh/indexer/tests/test_fossology_license.py b/swh/indexer/tests/test_fossology_license.py
--- a/swh/indexer/tests/test_fossology_license.py
+++ b/swh/indexer/tests/test_fossology_license.py
@@ -25,8 +25,8 @@
     fill_obj_storage,
     fill_storage,
     filter_dict,
+    mock_compute_license,
 )
-from swh.model.hashutil import hash_to_bytes
 
 
 class BasicTest(unittest.TestCase):
@@ -51,15 +51,6 @@
     )
 
 
-def mock_compute_license(path):
-    """path is the content identifier"""
-    if isinstance(id, bytes):
-        path = path.decode("utf-8")
-    # path is something like /tmp/tmpXXX/ so we keep only the sha1 part
-    id_ = path.split("/")[-1]
-    return {"licenses": SHA1_TO_LICENSES.get(hash_to_bytes(id_), [])}
-
-
 CONFIG = {
     **BASE_TEST_CONFIG,
     "workdir": "/tmp",
diff --git a/swh/indexer/tests/utils.py b/swh/indexer/tests/utils.py
--- a/swh/indexer/tests/utils.py
+++ b/swh/indexer/tests/utils.py
@@ -791,3 +791,12 @@
         actual_results = self.indexer.run(1, 2**512, incremental=False)
 
         assert actual_results == {"status": "uneventful"}
+
+
+def mock_compute_license(path):
+    """path is the content identifier"""
+    if isinstance(path, bytes):
+        path = path.decode("utf-8")
+    # path is something like /tmp/tmpXXX/ so we keep only the sha1 part
+    id_ = path.split("/")[-1]
+    return {"licenses": SHA1_TO_LICENSES.get(hash_to_bytes(id_), [])}