diff --git a/swh/search/tests/test_cli.py b/swh/search/tests/test_cli.py
index e237c5f..d4d2392 100644
--- a/swh/search/tests/test_cli.py
+++ b/swh/search/tests/test_cli.py
@@ -1,394 +1,397 @@
# Copyright (C) 2019-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import copy
from datetime import datetime, timezone
import tempfile

from click.testing import CliRunner
from confluent_kafka import Producer
import pytest
import yaml

from swh.journal.serializers import value_to_kafka
from swh.model.hashutil import hash_to_bytes
from swh.search import get_search
from swh.search.cli import search_cli_group
CLI_CONFIG = """
search:
cls: elasticsearch
hosts:
- '%(elasticsearch_host)s'
indexes:
origin:
index: test
read_alias: test-read
write_alias: test-write
storage:
cls: memory
"""

JOURNAL_OBJECTS_CONFIG_TEMPLATE = """
journal:
  brokers:
  - {broker}
  prefix: {prefix}
  group_id: {group_id}
"""


def invoke(catch_exceptions, args, config="", *, elasticsearch_host):
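    """Run the swh-search CLI on a temporary YAML config file built from
    CLI_CONFIG plus the given extra config, and return the click result."""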
    runner = CliRunner()
    with tempfile.NamedTemporaryFile("a", suffix=".yml") as config_fd:
        config_fd.write(
            (CLI_CONFIG + config) % {"elasticsearch_host": elasticsearch_host}
        )
        config_fd.seek(0)
        result = runner.invoke(search_cli_group, ["-C" + config_fd.name] + args)
    if not catch_exceptions and result.exception:
        print(result.output)
        raise result.exception
    return result


def test__journal_client__origin(
    swh_search, elasticsearch_host: str, kafka_prefix: str, kafka_server
):
"""Tests the re-indexing when origin_batch_size*task_batch_size is a
divisor of nb_origins."""
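    # acks="all" below: a message only counts as delivered once every
    # in-sync replica has acknowledged it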
    producer = Producer(
        {
            "bootstrap.servers": kafka_server,
            "client.id": "test search origin producer",
            "acks": "all",
        }
    )
    origin_foobar_baz = {
        "url": "http://foobar.baz",
    }
    value = value_to_kafka(origin_foobar_baz)
    topic = f"{kafka_prefix}.origin"
    producer.produce(topic=topic, key=b"bogus-origin", value=value)
+    producer.flush()
    journal_objects_config = JOURNAL_OBJECTS_CONFIG_TEMPLATE.format(
        broker=kafka_server, prefix=kafka_prefix, group_id="test-consumer"
    )
    result = invoke(
        False,
        [
            "journal-client",
            "objects",
            "--stop-after-objects",
            "1",
            "--object-type",
            "origin",
            "--prefix",
            kafka_prefix,
        ],
        journal_objects_config,
        elasticsearch_host=elasticsearch_host,
    )
    # Check the output
    expected_output = "Processed 1 messages.\nDone.\n"
    assert result.exit_code == 0, result.output
    assert result.output == expected_output
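    # Make the newly indexed documents visible to the searches below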
    swh_search.flush()
    # Searching for the origin without requiring a visit finds it
    actual_page = swh_search.origin_search(url_pattern="foobar")
    assert actual_page.next_page_token is None
    assert actual_page.results == [origin_foobar_baz]
    # The origin has no visit, so searching with with_visit=True
    # returns nothing
    actual_page = swh_search.origin_search(url_pattern="foobar", with_visit=True)
    assert actual_page.next_page_token is None
    assert actual_page.results == []


def test__journal_client__origin_visit_status(
    swh_search, elasticsearch_host, kafka_prefix: str, kafka_server
):
"""Subscribing to origin-visit-status should result in swh-search indexation
"""
origin_foobar = {"url": "http://baz.foobar"}
producer = Producer(
{
"bootstrap.servers": kafka_server,
"client.id": "test search origin visit status producer",
"acks": "all",
}
)
topic = f"{kafka_prefix}.origin_visit_status"
value = value_to_kafka(
{
"origin": origin_foobar["url"],
"visit": 1,
"type": "git",
"date": datetime.now(tz=timezone.utc),
"snapshot": None,
"status": "full",
}
)
producer.produce(topic=topic, key=b"bogus-origin-visit-status", value=value)
+    producer.flush()
    journal_objects_config = JOURNAL_OBJECTS_CONFIG_TEMPLATE.format(
        broker=kafka_server, prefix=kafka_prefix, group_id="test-consumer"
    )
    result = invoke(
        False,
        [
            "journal-client",
            "objects",
            "--stop-after-objects",
            "1",
            "--prefix",
            kafka_prefix,
            "--object-type",
            "origin_visit_status",
        ],
        journal_objects_config,
        elasticsearch_host=elasticsearch_host,
    )
    # Check the output
    expected_output = "Processed 1 messages.\nDone.\n"
    assert result.exit_code == 0, result.output
    assert result.output == expected_output
    swh_search.flush()
    # Both searches, with and without visit, return the origin
    actual_page = swh_search.origin_search(url_pattern="foobar", with_visit=False)
    assert actual_page.next_page_token is None
    assert actual_page.results == [origin_foobar]
    actual_page = swh_search.origin_search(url_pattern="foobar", with_visit=True)
    assert actual_page.next_page_token is None
    assert actual_page.results == [origin_foobar]


def test__journal_client__origin_intrinsic_metadata(
    swh_search, elasticsearch_host, kafka_prefix: str, kafka_server
):
    """Subscribing to origin-intrinsic-metadata should result in swh-search indexation
    """
origin_foobar = {"url": "https://github.com/clojure/clojure"}
origin_intrinsic_metadata = {
"id": origin_foobar["url"],
"metadata": {
"name": "clojure",
"type": "SoftwareSourceCode",
"license": "http://opensource.org/licenses/eclipse-1.0.php",
"version": "1.10.2-master-SNAPSHOT",
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"identifier": "org.clojure",
"description": "Clojure core environment and runtime library.",
"codeRepository": "https://repo.maven.apache.org/maven2/org/clojure/clojure", # noqa
},
"indexer_configuration_id": 1,
"from_revision": hash_to_bytes("f47c139e20970ee0852166f48ee2a4626632b86e"),
"mappings": ["maven"],
}
producer = Producer(
{
"bootstrap.servers": kafka_server,
"client.id": "test search origin intrinsic metadata producer",
"acks": "all",
}
)
topic = f"{kafka_prefix}.origin_intrinsic_metadata"
value = value_to_kafka(origin_intrinsic_metadata)
producer.produce(topic=topic, key=b"bogus-origin-intrinsic-metadata", value=value)
+    producer.flush()
    journal_objects_config = JOURNAL_OBJECTS_CONFIG_TEMPLATE.format(
        broker=kafka_server, prefix=kafka_prefix, group_id="test-consumer"
    )
    result = invoke(
        False,
        [
            "journal-client",
            "objects",
            "--stop-after-objects",
            "1",
            "--object-type",
            "origin_intrinsic_metadata",
        ],
        journal_objects_config,
        elasticsearch_host=elasticsearch_host,
    )
    # Check the output
    expected_output = "Processed 1 messages.\nDone.\n"
    assert result.exit_code == 0, result.output
    assert result.output == expected_output
    swh_search.flush()
    # Searching without requiring a visit returns the origin
    actual_page = swh_search.origin_search(url_pattern="clojure", with_visit=False)
    assert actual_page.next_page_token is None
    assert actual_page.results == [origin_foobar]
    # No visit is associated with the origin, so searching with
    # with_visit=True returns nothing
    actual_page = swh_search.origin_search(url_pattern="clojure", with_visit=True)
    assert actual_page.next_page_token is None
    assert actual_page.results == []


def test__journal_client__missing_main_journal_config_key(elasticsearch_host):
    """A missing journal configuration section should make the CLI raise"""
    with pytest.raises(KeyError, match="journal"):
        invoke(
            catch_exceptions=False,
            args=["journal-client", "objects", "--stop-after-objects", "1"],
            config="",  # missing config will make it raise
            elasticsearch_host=elasticsearch_host,
        )


def test__journal_client__missing_journal_config_keys(elasticsearch_host):
    """Missing mandatory keys in the journal configuration should raise"""
    kafka_prefix = "swh.journal.objects"
    journal_objects_config = JOURNAL_OBJECTS_CONFIG_TEMPLATE.format(
        broker="192.0.2.1", prefix=kafka_prefix, group_id="test-consumer"
    )
    journal_config = yaml.safe_load(journal_objects_config)
    for key in journal_config["journal"].keys():
        if key == "prefix":  # optional
            continue
        cfg = copy.deepcopy(journal_config)
        del cfg["journal"][key]  # make config incomplete
        yaml_cfg = yaml.dump(cfg)
        with pytest.raises(TypeError, match=f"{key}"):
            invoke(
                catch_exceptions=False,
                args=[
                    "journal-client",
                    "objects",
                    "--stop-after-objects",
                    "1",
                    "--prefix",
                    kafka_prefix,
                    "--object-type",
                    "origin_visit_status",
                ],
                config=yaml_cfg,  # incomplete config will make the cli raise
                elasticsearch_host=elasticsearch_host,
            )


def test__journal_client__missing_prefix_config_key(
    swh_search, elasticsearch_host, kafka_server
):
    """A missing mandatory prefix key should make the CLI raise"""
    journal_cfg_template = """
journal:
  brokers:
  - {broker}
  group_id: {group_id}
"""
    journal_cfg = journal_cfg_template.format(
        broker=kafka_server, group_id="test-consumer"
    )
    with pytest.raises(ValueError, match="prefix"):
        invoke(
            False,
            # Missing --prefix (and no config key) will make the cli raise
            [
                "journal-client",
                "objects",
                "--stop-after-objects",
                "1",
                "--object-type",
                "origin_visit_status",
            ],
            journal_cfg,
            elasticsearch_host=elasticsearch_host,
        )


def test__journal_client__missing_object_types_config_key(
    swh_search, elasticsearch_host, kafka_server
):
    """A missing mandatory object_types key should make the CLI raise"""
    journal_cfg_template = """
journal:
  brokers:
  - {broker}
  prefix: swh.journal.objects
  group_id: {group_id}
"""
    journal_cfg = journal_cfg_template.format(
        broker=kafka_server, group_id="test-consumer"
    )
    with pytest.raises(ValueError, match="object_types"):
        invoke(
            False,
            # Missing --object-types (and no config key) will make the cli raise
            ["journal-client", "objects", "--stop-after-objects", "1"],
            journal_cfg,
            elasticsearch_host=elasticsearch_host,
        )


def test__initialize__with_index_name(elasticsearch_host):
    """Initializing the index with an index name should create the right index"""
    search = get_search(
        "elasticsearch",
        hosts=[elasticsearch_host],
        indexes={"origin": {"index": "test"}},
    )
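    # Only the index name is overridden; the aliases keep their defaults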
assert search._get_origin_index() == "test"
assert search._get_origin_read_alias() == "origin-read"
assert search._get_origin_write_alias() == "origin-write"


def test__initialize__with_read_alias(elasticsearch_host):
    """Initializing the index with a search alias name should create
    the right search alias"""
    search = get_search(
        "elasticsearch",
        hosts=[elasticsearch_host],
        indexes={"origin": {"read_alias": "test"}},
    )
    assert search._get_origin_index() == "origin"
    assert search._get_origin_read_alias() == "test"
    assert search._get_origin_write_alias() == "origin-write"


def test__initialize__with_write_alias(elasticsearch_host):
    """Initializing the index with an indexing alias name should create
    the right indexing alias"""
    search = get_search(
        "elasticsearch",
        hosts=[elasticsearch_host],
        indexes={"origin": {"write_alias": "test"}},
    )
    assert search._get_origin_index() == "origin"
    assert search._get_origin_read_alias() == "origin-read"
    assert search._get_origin_write_alias() == "test"