Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F9343524
D5064.id18076.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
12 KB
Subscribers
None
D5064.id18076.diff
View Options
diff --git a/swh/search/elasticsearch.py b/swh/search/elasticsearch.py
--- a/swh/search/elasticsearch.py
+++ b/swh/search/elasticsearch.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2019-2020 The Software Heritage developers
+# Copyright (C) 2019-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -21,7 +21,7 @@
# Whitelist fields to be saved in Elasticsearch
res = {"url": origin.pop("url")}
- for field_name in ("intrinsic_metadata", "has_visits"):
+ for field_name in ("intrinsic_metadata", "has_visits", "visit_types"):
if field_name in origin:
res[field_name] = origin.pop(field_name)
@@ -101,6 +101,7 @@
}
},
},
+ "visit_types": {"type": "keyword"},
# used to filter out origins that were never visited
"has_visits": {"type": "boolean",},
"intrinsic_metadata": {
@@ -125,16 +126,38 @@
documents_with_sha1 = (
(origin_identifier(document), document) for document in documents
)
+ update_script = """
+ for (int i = 0; i < params.visit_types.length; ++i) {
+ if (!ctx._source.visit_types.contains(params.visit_types[i])) {
+ ctx._source.visit_types.add(params.visit_types[i]);
+ }
+ }
+ if (!ctx._source.containsKey("has_visits")) {
+ ctx._source.has_visits = params.has_visits;
+ } else {
+ ctx._source.has_visits = ctx._source.has_visits || params.has_visits;
+ }
+ """
+
actions = [
{
"_op_type": "update",
"_id": sha1,
"_index": self.origin_index,
- "doc": {**document, "sha1": sha1,},
- "doc_as_upsert": True,
+ "scripted_upsert": True,
+ "upsert": {**document, "sha1": sha1,},
+ "script": {
+ "source": update_script,
+ "lang": "painless",
+ "params": {
+ "visit_types": document.get("visit_types", []),
+ "has_visits": document.get("has_visits", False),
+ },
+ },
}
for (sha1, document) in documents_with_sha1
]
+
bulk(self._backend, actions, index=self.origin_index)
def origin_dump(self) -> Iterator[model.Origin]:
@@ -150,6 +173,7 @@
url_pattern: Optional[str] = None,
metadata_pattern: Optional[str] = None,
with_visit: bool = False,
+ visit_types: Optional[List[str]] = None,
page_token: Optional[str] = None,
limit: int = 50,
) -> PagedResult[Dict[str, Any]]:
@@ -204,6 +228,9 @@
if with_visit:
query_clauses.append({"term": {"has_visits": True,}})
+ if visit_types is not None:
+ query_clauses.append({"terms": {"visit_types": visit_types}})
+
body = {
"query": {"bool": {"must": query_clauses,}},
"sort": [{"_score": "desc"}, {"sha1": "asc"},],
diff --git a/swh/search/in_memory.py b/swh/search/in_memory.py
--- a/swh/search/in_memory.py
+++ b/swh/search/in_memory.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2019-2020 The Software Heritage developers
+# Copyright (C) 2019-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -39,7 +39,16 @@
id_ = origin_identifier(document)
if "url" in document:
document["_url_tokens"] = set(self._url_splitter.split(document["url"]))
+ if "visit_types" in self._origins[id_]:
+ document = dict(document)
+ document["visit_types"] = list(
+ set(
+ self._origins[id_]["visit_types"]
+ + document.get("visit_types", [])
+ )
+ )
self._origins[id_].update(document)
+
if id_ not in self._origin_ids:
self._origin_ids.append(id_)
@@ -49,6 +58,7 @@
url_pattern: Optional[str] = None,
metadata_pattern: Optional[str] = None,
with_visit: bool = False,
+ visit_types: Optional[List[str]] = None,
page_token: Optional[str] = None,
limit: int = 50,
) -> PagedResult[Dict[str, Any]]:
@@ -90,6 +100,12 @@
if with_visit:
hits = filter(lambda o: o.get("has_visits"), hits)
+ if visit_types is not None:
+ visit_types_set = set(visit_types)
+ hits = filter(
+ lambda o: not (visit_types_set - set(o.get("visit_types", []))), hits
+ )
+
start_at_index = int(page_token) if page_token else 0
origins = [
diff --git a/swh/search/interface.py b/swh/search/interface.py
--- a/swh/search/interface.py
+++ b/swh/search/interface.py
@@ -1,9 +1,9 @@
-# Copyright (C) 2020 The Software Heritage developers
+# Copyright (C) 2020-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-from typing import Any, Dict, Iterable, Optional, TypeVar
+from typing import Any, Dict, Iterable, List, Optional, TypeVar
from swh.core.api import remote_api_endpoint
from swh.core.api.classes import PagedResult as CorePagedResult
@@ -42,6 +42,7 @@
url_pattern: Optional[str] = None,
metadata_pattern: Optional[str] = None,
with_visit: bool = False,
+ visit_types: Optional[List[str]] = None,
page_token: Optional[str] = None,
limit: int = 50,
) -> PagedResult[Dict[str, Any]]:
@@ -51,6 +52,8 @@
url_pattern: Part of the URL to search for
with_visit: Whether origins with no visit are to be
filtered out
+ visit_types: Only origins with given visit types (e.g. git, svn, pypi)
+ will be returned
page_token: Opaque value used for pagination
limit: number of results to return
diff --git a/swh/search/journal_client.py b/swh/search/journal_client.py
--- a/swh/search/journal_client.py
+++ b/swh/search/journal_client.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2018-2019 The Software Heritage developers
+# Copyright (C) 2018-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -48,6 +48,7 @@
if isinstance(visit["origin"], str)
else visit["origin"]["url"]
),
+ "visit_types": [visit["type"]],
}
for visit in visits
]
diff --git a/swh/search/tests/test_cli.py b/swh/search/tests/test_cli.py
--- a/swh/search/tests/test_cli.py
+++ b/swh/search/tests/test_cli.py
@@ -19,10 +19,9 @@
CLI_CONFIG = """
search:
cls: elasticsearch
- args:
- hosts:
- - '%(elasticsearch_host)s'
- index_prefix: test
+ hosts:
+ - '%(elasticsearch_host)s'
+ index_prefix: test
"""
JOURNAL_OBJECTS_CONFIG_TEMPLATE = """
@@ -120,7 +119,7 @@
}
)
topic = f"{kafka_prefix}.origin_visit"
- value = value_to_kafka({"origin": origin_foobar["url"]})
+ value = value_to_kafka({"origin": origin_foobar["url"], "type": "git"})
producer.produce(topic=topic, key=b"bogus-origin-visit", value=value)
journal_objects_config = JOURNAL_OBJECTS_CONFIG_TEMPLATE.format(
diff --git a/swh/search/tests/test_journal_client.py b/swh/search/tests/test_journal_client.py
--- a/swh/search/tests/test_journal_client.py
+++ b/swh/search/tests/test_journal_client.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2019-2020 The Software Heritage developers
+# Copyright (C) 2019-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -32,9 +32,9 @@
worker_fn = functools.partial(process_journal_objects, search=search_mock,)
- worker_fn({"origin_visit": [{"origin": {"url": "http://foobar.baz"},}]})
+ worker_fn({"origin_visit": [{"origin": "http://foobar.baz", "type": "git"},]})
search_mock.origin_update.assert_called_once_with(
- [{"url": "http://foobar.baz"},]
+ [{"url": "http://foobar.baz", "visit_types": ["git"]},]
)
diff --git a/swh/search/tests/test_search.py b/swh/search/tests/test_search.py
--- a/swh/search/tests/test_search.py
+++ b/swh/search/tests/test_search.py
@@ -1,8 +1,10 @@
-# Copyright (C) 2019-2020 The Software Heritage developers
+# Copyright (C) 2019-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+from itertools import combinations
+
from hypothesis import given, settings, strategies
from swh.core.api.classes import stream_results
@@ -101,6 +103,71 @@
assert actual_page.next_page_token is None
assert actual_page.results == [origin_foobar_baz]
+ def test_origin_visit_types_search(self):
+ origins = [
+ {"url": "http://foobar.baz", "visit_types": ["git"]},
+ {"url": "http://barbaz.qux", "visit_types": ["svn"]},
+ {"url": "http://qux.quux", "visit_types": ["hg"]},
+ ]
+
+ self.search.origin_update(origins)
+ self.search.flush()
+
+ for origin in origins:
+ actual_page = self.search.origin_search(
+ url_pattern="http", visit_types=origin["visit_types"]
+ )
+ assert actual_page.next_page_token is None
+ results = [r["url"] for r in actual_page.results]
+ expected_results = [origin["url"]]
+ assert sorted(results) == sorted(expected_results)
+
+ actual_page = self.search.origin_search(url_pattern="http", visit_types=None)
+ assert actual_page.next_page_token is None
+ results = [r["url"] for r in actual_page.results]
+ expected_results = [origin["url"] for origin in origins]
+ assert sorted(results) == sorted(expected_results)
+
+ def test_origin_with_multiple_visit_types_search(self):
+ origins = [
+ {"url": "http://foobar.baz", "visit_types": ["git", "svn"]},
+ {"url": "http://foobar.baz", "visit_types": ["hg"]},
+ ]
+
+ self.search.origin_update(origins)
+ self.search.flush()
+
+ for visit_type in ("hg", "svn", "git"):
+ actual_page = self.search.origin_search(
+ url_pattern="http", visit_types=[visit_type]
+ )
+ assert actual_page.next_page_token is None
+ results = [r["url"] for r in actual_page.results]
+ expected_results = [origins[0]["url"]]
+ assert sorted(results) == sorted(expected_results)
+
+ def test_origins_with_multiple_visit_types_search(self):
+ origins = [
+ {"url": "http://foobar.baz", "visit_types": ["git", "hg", "svn"]},
+ {"url": "http://barbaz.qux", "visit_types": ["hg", "svn", "git"]},
+ {"url": "http://qux.quux", "visit_types": ["hg", "git", "svn"]},
+ ]
+
+ self.search.origin_update(origins)
+ self.search.flush()
+
+ visit_types = ["hg", "svn", "git"]
+
+ for i in range(len(visit_types)):
+ for types in combinations(visit_types, r=i + 1):
+ actual_page = self.search.origin_search(
+ url_pattern="http", visit_types=list(types)
+ )
+ assert actual_page.next_page_token is None
+ results = [r["url"] for r in actual_page.results]
+ expected_results = [origin["url"] for origin in origins]
+ assert sorted(results) == sorted(expected_results)
+
def test_origin_intrinsic_metadata_description(self):
origin1_nothin = {"url": "http://origin1"}
origin2_foobar = {"url": "http://origin2"}
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Thu, Jul 3, 1:37 PM (1 w, 4 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3232101
Attached To
D5064: Enable filtering of searched origins by visit types
Event Timeline
Log In to Comment