D5064.id18076.diff

diff --git a/swh/search/elasticsearch.py b/swh/search/elasticsearch.py
--- a/swh/search/elasticsearch.py
+++ b/swh/search/elasticsearch.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2019-2020 The Software Heritage developers
+# Copyright (C) 2019-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -21,7 +21,7 @@
# Whitelist fields to be saved in Elasticsearch
res = {"url": origin.pop("url")}
- for field_name in ("intrinsic_metadata", "has_visits"):
+ for field_name in ("intrinsic_metadata", "has_visits", "visit_types"):
if field_name in origin:
res[field_name] = origin.pop(field_name)
@@ -101,6 +101,7 @@
}
},
},
+ "visit_types": {"type": "keyword"},
# used to filter out origins that were never visited
"has_visits": {"type": "boolean",},
"intrinsic_metadata": {
@@ -125,16 +126,38 @@
documents_with_sha1 = (
(origin_identifier(document), document) for document in documents
)
+ update_script = """
+ for (int i = 0; i < params.visit_types.length; ++i) {
+ if (!ctx._source.visit_types.contains(params.visit_types[i])) {
+ ctx._source.visit_types.add(params.visit_types[i]);
+ }
+ }
+ if (!ctx._source.containsKey("has_visits")) {
+ ctx._source.has_visits = params.has_visits;
+ } else {
+ ctx._source.has_visits = ctx._source.has_visits || params.has_visits;
+ }
+ """
+
actions = [
{
"_op_type": "update",
"_id": sha1,
"_index": self.origin_index,
- "doc": {**document, "sha1": sha1,},
- "doc_as_upsert": True,
+ "scripted_upsert": True,
+ "upsert": {**document, "sha1": sha1,},
+ "script": {
+ "source": update_script,
+ "lang": "painless",
+ "params": {
+ "visit_types": document.get("visit_types", []),
+ "has_visits": document.get("has_visits", False),
+ },
+ },
}
for (sha1, document) in documents_with_sha1
]
+
bulk(self._backend, actions, index=self.origin_index)
def origin_dump(self) -> Iterator[model.Origin]:
@@ -150,6 +173,7 @@
url_pattern: Optional[str] = None,
metadata_pattern: Optional[str] = None,
with_visit: bool = False,
+ visit_types: Optional[List[str]] = None,
page_token: Optional[str] = None,
limit: int = 50,
) -> PagedResult[Dict[str, Any]]:
@@ -204,6 +228,9 @@
if with_visit:
query_clauses.append({"term": {"has_visits": True,}})
+ if visit_types is not None:
+ query_clauses.append({"terms": {"visit_types": visit_types}})
+
body = {
"query": {"bool": {"must": query_clauses,}},
"sort": [{"_score": "desc"}, {"sha1": "asc"},],
diff --git a/swh/search/in_memory.py b/swh/search/in_memory.py
--- a/swh/search/in_memory.py
+++ b/swh/search/in_memory.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2019-2020 The Software Heritage developers
+# Copyright (C) 2019-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -39,7 +39,16 @@
id_ = origin_identifier(document)
if "url" in document:
document["_url_tokens"] = set(self._url_splitter.split(document["url"]))
+ if "visit_types" in self._origins[id_]:
+ document = dict(document)
+ document["visit_types"] = list(
+ set(
+ self._origins[id_]["visit_types"]
+ + document.get("visit_types", [])
+ )
+ )
self._origins[id_].update(document)
+
if id_ not in self._origin_ids:
self._origin_ids.append(id_)
@@ -49,6 +58,7 @@
url_pattern: Optional[str] = None,
metadata_pattern: Optional[str] = None,
with_visit: bool = False,
+ visit_types: Optional[List[str]] = None,
page_token: Optional[str] = None,
limit: int = 50,
) -> PagedResult[Dict[str, Any]]:
@@ -90,6 +100,12 @@
if with_visit:
hits = filter(lambda o: o.get("has_visits"), hits)
+ if visit_types is not None:
+ visit_types_set = set(visit_types)
+ hits = filter(
+ lambda o: not (visit_types_set - set(o.get("visit_types", []))), hits
+ )
+
start_at_index = int(page_token) if page_token else 0
origins = [
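
Note on the in-memory filter above: the predicate keeps an origin only when the set difference is empty, i.e. when every requested visit type is present on the origin. A small sketch of the same check in isolation:

    def matches_visit_types(origin: dict, visit_types: list) -> bool:
        # empty set difference <=> all requested types present on the origin
        return not (set(visit_types) - set(origin.get("visit_types", [])))

    assert matches_visit_types({"visit_types": ["git", "svn"]}, ["git"])
    assert not matches_visit_types({"visit_types": ["git"]}, ["git", "svn"])

This is "all of" semantics, whereas an Elasticsearch `terms` query matches as soon as any one of the given values is present; the two backends agree on the single-type and all-types queries exercised by the tests below.
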
diff --git a/swh/search/interface.py b/swh/search/interface.py
--- a/swh/search/interface.py
+++ b/swh/search/interface.py
@@ -1,9 +1,9 @@
-# Copyright (C) 2020 The Software Heritage developers
+# Copyright (C) 2020-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-from typing import Any, Dict, Iterable, Optional, TypeVar
+from typing import Any, Dict, Iterable, List, Optional, TypeVar
from swh.core.api import remote_api_endpoint
from swh.core.api.classes import PagedResult as CorePagedResult
@@ -42,6 +42,7 @@
url_pattern: Optional[str] = None,
metadata_pattern: Optional[str] = None,
with_visit: bool = False,
+ visit_types: Optional[List[str]] = None,
page_token: Optional[str] = None,
limit: int = 50,
) -> PagedResult[Dict[str, Any]]:
@@ -51,6 +52,8 @@
url_pattern: Part of the URL to search for
with_visit: Whether origins with no visit are to be
filtered out
+ visit_types: Only origins with given visit types (e.g. git, svn, pypi)
+ will be returned
page_token: Opaque value used for pagination
limit: number of results to return
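
For illustration, a hypothetical call using the new parameter (origin data and variable names are made up; `search` is any SearchInterface implementation):

    page = search.origin_search(url_pattern="foobar", visit_types=["git"], limit=10)
    for origin in page.results:
        print(origin["url"])
    if page.next_page_token is not None:
        # feed the token back into origin_search to fetch the next page
        ...
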
diff --git a/swh/search/journal_client.py b/swh/search/journal_client.py
--- a/swh/search/journal_client.py
+++ b/swh/search/journal_client.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2018-2019 The Software Heritage developers
+# Copyright (C) 2018-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -48,6 +48,7 @@
if isinstance(visit["origin"], str)
else visit["origin"]["url"]
),
+ "visit_types": [visit["type"]],
}
for visit in visits
]
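
With this change, each `origin_visit` journal message contributes its visit type to the indexed document. A sketch of the mapping, using the message shape from the tests below:

    visit = {"origin": "http://foobar.baz", "type": "git"}
    document = {"url": visit["origin"], "visit_types": [visit["type"]]}
    # origin_update merges documents per origin, so an origin visited as
    # both "git" and "hg" ends up indexed with both visit types
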
diff --git a/swh/search/tests/test_cli.py b/swh/search/tests/test_cli.py
--- a/swh/search/tests/test_cli.py
+++ b/swh/search/tests/test_cli.py
@@ -19,10 +19,9 @@
CLI_CONFIG = """
search:
cls: elasticsearch
- args:
- hosts:
- - '%(elasticsearch_host)s'
- index_prefix: test
+ hosts:
+ - '%(elasticsearch_host)s'
+ index_prefix: test
"""
JOURNAL_OBJECTS_CONFIG_TEMPLATE = """
@@ -120,7 +119,7 @@
}
)
topic = f"{kafka_prefix}.origin_visit"
- value = value_to_kafka({"origin": origin_foobar["url"]})
+ value = value_to_kafka({"origin": origin_foobar["url"], "type": "git"})
producer.produce(topic=topic, key=b"bogus-origin-visit", value=value)
journal_objects_config = JOURNAL_OBJECTS_CONFIG_TEMPLATE.format(
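
Aside on the config change above: the `args:` nesting is dropped, so the backend keyword arguments now sit directly under the `search:` key. Assuming the usual swh factory pattern, the flattened mapping is passed straight to the backend constructor (sketch, with a made-up host):

    from swh.search import get_search

    search = get_search(
        cls="elasticsearch", hosts=["localhost:9200"], index_prefix="test",
    )
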
diff --git a/swh/search/tests/test_journal_client.py b/swh/search/tests/test_journal_client.py
--- a/swh/search/tests/test_journal_client.py
+++ b/swh/search/tests/test_journal_client.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2019-2020 The Software Heritage developers
+# Copyright (C) 2019-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -32,9 +32,9 @@
worker_fn = functools.partial(process_journal_objects, search=search_mock,)
- worker_fn({"origin_visit": [{"origin": {"url": "http://foobar.baz"},}]})
+ worker_fn({"origin_visit": [{"origin": "http://foobar.baz", "type": "git"},]})
search_mock.origin_update.assert_called_once_with(
- [{"url": "http://foobar.baz"},]
+ [{"url": "http://foobar.baz", "visit_types": ["git"]},]
)
diff --git a/swh/search/tests/test_search.py b/swh/search/tests/test_search.py
--- a/swh/search/tests/test_search.py
+++ b/swh/search/tests/test_search.py
@@ -1,8 +1,10 @@
-# Copyright (C) 2019-2020 The Software Heritage developers
+# Copyright (C) 2019-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+from itertools import combinations
+
from hypothesis import given, settings, strategies
from swh.core.api.classes import stream_results
@@ -101,6 +103,71 @@
assert actual_page.next_page_token is None
assert actual_page.results == [origin_foobar_baz]
+ def test_origin_visit_types_search(self):
+ origins = [
+ {"url": "http://foobar.baz", "visit_types": ["git"]},
+ {"url": "http://barbaz.qux", "visit_types": ["svn"]},
+ {"url": "http://qux.quux", "visit_types": ["hg"]},
+ ]
+
+ self.search.origin_update(origins)
+ self.search.flush()
+
+ for origin in origins:
+ actual_page = self.search.origin_search(
+ url_pattern="http", visit_types=origin["visit_types"]
+ )
+ assert actual_page.next_page_token is None
+ results = [r["url"] for r in actual_page.results]
+ expected_results = [origin["url"]]
+ assert sorted(results) == sorted(expected_results)
+
+ actual_page = self.search.origin_search(url_pattern="http", visit_types=None)
+ assert actual_page.next_page_token is None
+ results = [r["url"] for r in actual_page.results]
+ expected_results = [origin["url"] for origin in origins]
+ assert sorted(results) == sorted(expected_results)
+
+ def test_origin_with_multiple_visit_types_search(self):
+ origins = [
+ {"url": "http://foobar.baz", "visit_types": ["git", "svn"]},
+ {"url": "http://foobar.baz", "visit_types": ["hg"]},
+ ]
+
+ self.search.origin_update(origins)
+ self.search.flush()
+
+ for visit_type in ("hg", "svn", "git"):
+ actual_page = self.search.origin_search(
+ url_pattern="http", visit_types=[visit_type]
+ )
+ assert actual_page.next_page_token is None
+ results = [r["url"] for r in actual_page.results]
+ expected_results = [origins[0]["url"]]
+ assert sorted(results) == sorted(expected_results)
+
+ def test_origins_with_multiple_visit_types_search(self):
+ origins = [
+ {"url": "http://foobar.baz", "visit_types": ["git", "hg", "svn"]},
+ {"url": "http://barbaz.qux", "visit_types": ["hg", "svn", "git"]},
+ {"url": "http://qux.quux", "visit_types": ["hg", "git", "svn"]},
+ ]
+
+ self.search.origin_update(origins)
+ self.search.flush()
+
+ visit_types = ["hg", "svn", "git"]
+
+ for i in range(len(visit_types)):
+ for types in combinations(visit_types, r=i + 1):
+ actual_page = self.search.origin_search(
+ url_pattern="http", visit_types=list(types)
+ )
+ assert actual_page.next_page_token is None
+ results = [r["url"] for r in actual_page.results]
+ expected_results = [origin["url"] for origin in origins]
+ assert sorted(results) == sorted(expected_results)
+
def test_origin_intrinsic_metadata_description(self):
origin1_nothin = {"url": "http://origin1"}
origin2_foobar = {"url": "http://origin2"}
