diff --git a/swh/search/elasticsearch.py b/swh/search/elasticsearch.py --- a/swh/search/elasticsearch.py +++ b/swh/search/elasticsearch.py @@ -20,7 +20,7 @@ res = { 'url': origin.pop('url') } - for field_name in ('intrinsic_metadata',): + for field_name in ('intrinsic_metadata', 'has_visits'): if field_name in origin: res[field_name] = origin.pop(field_name) return res @@ -63,6 +63,9 @@ } } }, + 'has_visits': { + 'type': 'boolean', + }, 'intrinsic_metadata': { 'type': 'nested', 'properties': { @@ -107,14 +110,16 @@ def origin_search( self, *, url_pattern: str = None, metadata_pattern: str = None, + with_visit: bool = False, scroll_token: str = None, count: int = 50 ) -> Dict[str, object]: """Searches for origins matching the `url_pattern`. Args: url_pattern (str): Part of thr URL to search for - scroll_token (str): `scroll_token` is an opaque value used for - pagination. + with_visit (bool): Whether origins with no visit are to be + filtered out + scroll_token (str): Opaque value used for pagination. count (int): number of results to return. Returns: @@ -159,10 +164,17 @@ 'At least one of url_pattern and metadata_pattern ' 'must be provided.') + if with_visit: + query_clauses.append({ + 'term': { + 'has_visits': True, + } + }) + body = { 'query': { 'bool': { - 'should': query_clauses, # TODO: must? + 'must': query_clauses, } }, 'size': count, diff --git a/swh/search/in_memory.py b/swh/search/in_memory.py --- a/swh/search/in_memory.py +++ b/swh/search/in_memory.py @@ -61,6 +61,7 @@ def origin_search( self, *, url_pattern: str = None, metadata_pattern: str = None, + with_visit: bool = False, scroll_token: str = None, count: int = 50 ) -> Dict[str, object]: matches = (self._origins[id_] for id_ in self._origin_ids) @@ -91,6 +92,9 @@ 'At least one of url_pattern and metadata_pattern ' 'must be provided.') + if with_visit: + matches = filter(lambda o: o.get('has_visits'), matches) + if scroll_token: scroll_token = msgpack.loads(base64.b64decode(scroll_token)) start_at_index = scroll_token[b'start_at_index'] diff --git a/swh/search/tests/test_search.py b/swh/search/tests/test_search.py --- a/swh/search/tests/test_search.py +++ b/swh/search/tests/test_search.py @@ -54,6 +54,34 @@ expected_results = ['http://barbaz.qux', 'http://qux.quux'] assert sorted(results) == sorted(expected_results) + def test_origin_with_visit(self): + self.search.origin_update([ + {'url': 'http://foobar.baz', 'has_visits': True}, + ]) + + results = self.search.origin_search( + url_pattern='foobar', with_visit=True) + assert results == {'scroll_token': None, 'results': [ + {'url': 'http://foobar.baz'}]} + + def test_origin_with_visit_added(self): + self.search.origin_update([ + {'url': 'http://foobar.baz'}, + ]) + + results = self.search.origin_search( + url_pattern='foobar', with_visit=True) + assert results == {'scroll_token': None, 'results': []} + + self.search.origin_update([ + {'url': 'http://foobar.baz', 'has_visits': True}, + ]) + + results = self.search.origin_search( + url_pattern='foobar', with_visit=True) + assert results == {'scroll_token': None, 'results': [ + {'url': 'http://foobar.baz'}]} + def test_origin_intrinsic_metadata_description(self): self.search.origin_update([ {