D5064.id18076.diff

diff --git a/swh/search/elasticsearch.py b/swh/search/elasticsearch.py
--- a/swh/search/elasticsearch.py
+++ b/swh/search/elasticsearch.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2019-2020 The Software Heritage developers
+# Copyright (C) 2019-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -21,7 +21,7 @@
# Whitelist fields to be saved in Elasticsearch
res = {"url": origin.pop("url")}
- for field_name in ("intrinsic_metadata", "has_visits"):
+ for field_name in ("intrinsic_metadata", "has_visits", "visit_types"):
if field_name in origin:
res[field_name] = origin.pop(field_name)
@@ -101,6 +101,7 @@
}
},
},
+ "visit_types": {"type": "keyword"},
# used to filter out origins that were never visited
"has_visits": {"type": "boolean",},
"intrinsic_metadata": {
@@ -125,16 +126,38 @@
documents_with_sha1 = (
(origin_identifier(document), document) for document in documents
)
+ update_script = """
+ for (int i = 0; i < params.visit_types.length; ++i) {
+ if (!ctx._source.visit_types.contains(params.visit_types[i])) {
+ ctx._source.visit_types.add(params.visit_types[i]);
+ }
+ }
+ if (!ctx._source.containsKey("has_visits")) {
+ ctx._source.has_visits = params.has_visits;
+ } else {
+ ctx._source.has_visits = ctx._source.has_visits || params.has_visits;
+ }
+ """
+
actions = [
{
"_op_type": "update",
"_id": sha1,
"_index": self.origin_index,
- "doc": {**document, "sha1": sha1,},
- "doc_as_upsert": True,
+ "scripted_upsert": True,
+ "upsert": {**document, "sha1": sha1,},
+ "script": {
+ "source": update_script,
+ "lang": "painless",
+ "params": {
+ "visit_types": document.get("visit_types", []),
+ "has_visits": document.get("has_visits", False),
+ },
+ },
}
for (sha1, document) in documents_with_sha1
]
+
bulk(self._backend, actions, index=self.origin_index)
def origin_dump(self) -> Iterator[model.Origin]:
@@ -150,6 +173,7 @@
url_pattern: Optional[str] = None,
metadata_pattern: Optional[str] = None,
with_visit: bool = False,
+ visit_types: Optional[List[str]] = None,
page_token: Optional[str] = None,
limit: int = 50,
) -> PagedResult[Dict[str, Any]]:
@@ -204,6 +228,9 @@
if with_visit:
query_clauses.append({"term": {"has_visits": True,}})
+ if visit_types is not None:
+ query_clauses.append({"terms": {"visit_types": visit_types}})
+
body = {
"query": {"bool": {"must": query_clauses,}},
"sort": [{"_score": "desc"}, {"sha1": "asc"},],
diff --git a/swh/search/in_memory.py b/swh/search/in_memory.py
--- a/swh/search/in_memory.py
+++ b/swh/search/in_memory.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2019-2020 The Software Heritage developers
+# Copyright (C) 2019-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -39,7 +39,16 @@
id_ = origin_identifier(document)
if "url" in document:
document["_url_tokens"] = set(self._url_splitter.split(document["url"]))
+ if "visit_types" in self._origins[id_]:
+ document = dict(document)
+ document["visit_types"] = list(
+ set(
+ self._origins[id_]["visit_types"]
+ + document.get("visit_types", [])
+ )
+ )
self._origins[id_].update(document)
+
if id_ not in self._origin_ids:
self._origin_ids.append(id_)
@@ -49,6 +58,7 @@
url_pattern: Optional[str] = None,
metadata_pattern: Optional[str] = None,
with_visit: bool = False,
+ visit_types: Optional[List[str]] = None,
page_token: Optional[str] = None,
limit: int = 50,
) -> PagedResult[Dict[str, Any]]:
@@ -90,6 +100,12 @@
if with_visit:
hits = filter(lambda o: o.get("has_visits"), hits)
+ if visit_types is not None:
+ visit_types_set = set(visit_types)
+ hits = filter(
+ lambda o: not (visit_types_set - set(o.get("visit_types", []))), hits
+ )
+
start_at_index = int(page_token) if page_token else 0
origins = [
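
Note on the in-memory filter above: the predicate keeps an origin only when the set difference is empty, i.e. when every requested visit type is present on the origin. A small sketch of the same check in isolation:

    def matches_visit_types(origin: dict, visit_types: list) -> bool:
        # empty set difference <=> all requested types present on the origin
        return not (set(visit_types) - set(origin.get("visit_types", [])))

    assert matches_visit_types({"visit_types": ["git", "svn"]}, ["git"])
    assert not matches_visit_types({"visit_types": ["git"]}, ["git", "svn"])

This is "all of" semantics, whereas an Elasticsearch `terms` query matches as soon as any one of the given values is present; the two backends agree on the single-type and all-types queries exercised by the tests below.
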
diff --git a/swh/search/interface.py b/swh/search/interface.py
--- a/swh/search/interface.py
+++ b/swh/search/interface.py
@@ -1,9 +1,9 @@
-# Copyright (C) 2020 The Software Heritage developers
+# Copyright (C) 2020-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-from typing import Any, Dict, Iterable, Optional, TypeVar
+from typing import Any, Dict, Iterable, List, Optional, TypeVar
from swh.core.api import remote_api_endpoint
from swh.core.api.classes import PagedResult as CorePagedResult
@@ -42,6 +42,7 @@
url_pattern: Optional[str] = None,
metadata_pattern: Optional[str] = None,
with_visit: bool = False,
+ visit_types: Optional[List[str]] = None,
page_token: Optional[str] = None,
limit: int = 50,
) -> PagedResult[Dict[str, Any]]:
@@ -51,6 +52,8 @@
url_pattern: Part of the URL to search for
with_visit: Whether origins with no visit are to be
filtered out
+ visit_types: Only origins with given visit types (e.g. git, svn, pypi)
+ will be returned
page_token: Opaque value used for pagination
limit: number of results to return
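
For illustration, a hypothetical call using the new parameter (origin data and variable names are made up; `search` is any SearchInterface implementation):

    page = search.origin_search(url_pattern="foobar", visit_types=["git"], limit=10)
    for origin in page.results:
        print(origin["url"])
    if page.next_page_token is not None:
        # feed the token back into origin_search to fetch the next page
        ...
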
diff --git a/swh/search/journal_client.py b/swh/search/journal_client.py
--- a/swh/search/journal_client.py
+++ b/swh/search/journal_client.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2018-2019 The Software Heritage developers
+# Copyright (C) 2018-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -48,6 +48,7 @@
if isinstance(visit["origin"], str)
else visit["origin"]["url"]
),
+ "visit_types": [visit["type"]],
}
for visit in visits
]
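
With this change, each `origin_visit` journal message contributes its visit type to the indexed document. A sketch of the mapping, using the message shape from the tests below:

    visit = {"origin": "http://foobar.baz", "type": "git"}
    document = {"url": visit["origin"], "visit_types": [visit["type"]]}
    # origin_update merges documents per origin, so an origin visited as
    # both "git" and "hg" ends up indexed with both visit types
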
diff --git a/swh/search/tests/test_cli.py b/swh/search/tests/test_cli.py
--- a/swh/search/tests/test_cli.py
+++ b/swh/search/tests/test_cli.py
@@ -19,10 +19,9 @@
CLI_CONFIG = """
search:
cls: elasticsearch
- args:
- hosts:
- - '%(elasticsearch_host)s'
- index_prefix: test
+ hosts:
+ - '%(elasticsearch_host)s'
+ index_prefix: test
"""
JOURNAL_OBJECTS_CONFIG_TEMPLATE = """
@@ -120,7 +119,7 @@
}
)
topic = f"{kafka_prefix}.origin_visit"
- value = value_to_kafka({"origin": origin_foobar["url"]})
+ value = value_to_kafka({"origin": origin_foobar["url"], "type": "git"})
producer.produce(topic=topic, key=b"bogus-origin-visit", value=value)
journal_objects_config = JOURNAL_OBJECTS_CONFIG_TEMPLATE.format(
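
Aside on the config change above: the `args:` nesting is dropped, so the backend keyword arguments now sit directly under the `search:` key. Assuming the usual swh factory pattern, the flattened mapping is passed straight to the backend constructor (sketch, with a made-up host):

    from swh.search import get_search

    search = get_search(
        cls="elasticsearch", hosts=["localhost:9200"], index_prefix="test",
    )
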
diff --git a/swh/search/tests/test_journal_client.py b/swh/search/tests/test_journal_client.py
--- a/swh/search/tests/test_journal_client.py
+++ b/swh/search/tests/test_journal_client.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2019-2020 The Software Heritage developers
+# Copyright (C) 2019-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -32,9 +32,9 @@
worker_fn = functools.partial(process_journal_objects, search=search_mock,)
- worker_fn({"origin_visit": [{"origin": {"url": "http://foobar.baz"},}]})
+ worker_fn({"origin_visit": [{"origin": "http://foobar.baz", "type": "git"},]})
search_mock.origin_update.assert_called_once_with(
- [{"url": "http://foobar.baz"},]
+ [{"url": "http://foobar.baz", "visit_types": ["git"]},]
)
diff --git a/swh/search/tests/test_search.py b/swh/search/tests/test_search.py
--- a/swh/search/tests/test_search.py
+++ b/swh/search/tests/test_search.py
@@ -1,8 +1,10 @@
-# Copyright (C) 2019-2020 The Software Heritage developers
+# Copyright (C) 2019-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+from itertools import combinations
+
from hypothesis import given, settings, strategies
from swh.core.api.classes import stream_results
@@ -101,6 +103,71 @@
assert actual_page.next_page_token is None
assert actual_page.results == [origin_foobar_baz]
+ def test_origin_visit_types_search(self):
+ origins = [
+ {"url": "http://foobar.baz", "visit_types": ["git"]},
+ {"url": "http://barbaz.qux", "visit_types": ["svn"]},
+ {"url": "http://qux.quux", "visit_types": ["hg"]},
+ ]
+
+ self.search.origin_update(origins)
+ self.search.flush()
+
+ for origin in origins:
+ actual_page = self.search.origin_search(
+ url_pattern="http", visit_types=origin["visit_types"]
+ )
+ assert actual_page.next_page_token is None
+ results = [r["url"] for r in actual_page.results]
+ expected_results = [origin["url"]]
+ assert sorted(results) == sorted(expected_results)
+
+ actual_page = self.search.origin_search(url_pattern="http", visit_types=None)
+ assert actual_page.next_page_token is None
+ results = [r["url"] for r in actual_page.results]
+ expected_results = [origin["url"] for origin in origins]
+ assert sorted(results) == sorted(expected_results)
+
+ def test_origin_with_multiple_visit_types_search(self):
+ origins = [
+ {"url": "http://foobar.baz", "visit_types": ["git", "svn"]},
+ {"url": "http://foobar.baz", "visit_types": ["hg"]},
+ ]
+
+ self.search.origin_update(origins)
+ self.search.flush()
+
+ for visit_type in ("hg", "svn", "git"):
+ actual_page = self.search.origin_search(
+ url_pattern="http", visit_types=[visit_type]
+ )
+ assert actual_page.next_page_token is None
+ results = [r["url"] for r in actual_page.results]
+ expected_results = [origins[0]["url"]]
+ assert sorted(results) == sorted(expected_results)
+
+ def test_origins_with_multiple_visit_types_search(self):
+ origins = [
+ {"url": "http://foobar.baz", "visit_types": ["git", "hg", "svn"]},
+ {"url": "http://barbaz.qux", "visit_types": ["hg", "svn", "git"]},
+ {"url": "http://qux.quux", "visit_types": ["hg", "git", "svn"]},
+ ]
+
+ self.search.origin_update(origins)
+ self.search.flush()
+
+ visit_types = ["hg", "svn", "git"]
+
+ for i in range(len(visit_types)):
+ for types in combinations(visit_types, r=i + 1):
+ actual_page = self.search.origin_search(
+ url_pattern="http", visit_types=list(types)
+ )
+ assert actual_page.next_page_token is None
+ results = [r["url"] for r in actual_page.results]
+ expected_results = [origin["url"] for origin in origins]
+ assert sorted(results) == sorted(expected_results)
+
def test_origin_intrinsic_metadata_description(self):
origin1_nothin = {"url": "http://origin1"}
origin2_foobar = {"url": "http://origin2"}
