diff --git a/swh/search/elasticsearch.py b/swh/search/elasticsearch.py --- a/swh/search/elasticsearch.py +++ b/swh/search/elasticsearch.py @@ -20,7 +20,7 @@ res = { 'url': origin.pop('url') } - for field_name in ('intrinsic_metadata',): + for field_name in ('intrinsic_metadata', 'has_visits'): if field_name in origin: res[field_name] = origin.pop(field_name) return res @@ -63,6 +63,10 @@ } } }, + 'has_visits': { + 'type': 'boolean', + 'store': True, + }, 'intrinsic_metadata': { 'type': 'nested', 'properties': { @@ -107,14 +111,16 @@ def origin_search( self, *, url_pattern: str = None, metadata_pattern: str = None, + with_visit: bool = False, scroll_token: str = None, count: int = 50 ) -> Dict[str, object]: """Searches for origins matching the `url_pattern`. Args: url_pattern (str): Part of thr URL to search for - scroll_token (str): `scroll_token` is an opaque value used for - pagination. + with_visit (bool): Whether origins with no visit are to be + filtered out + scroll_token (str): Opaque value used for pagination. count (int): number of results to return. Returns: @@ -159,10 +165,17 @@ 'At least one of url_pattern and metadata_pattern ' 'must be provided.') + if with_visit: + query_clauses.append({ + 'term': { + 'has_visits': True, + } + }) + body = { 'query': { 'bool': { - 'should': query_clauses, # TODO: must? + 'must': query_clauses, } }, 'size': count, diff --git a/swh/search/in_memory.py b/swh/search/in_memory.py --- a/swh/search/in_memory.py +++ b/swh/search/in_memory.py @@ -61,6 +61,7 @@ def origin_search( self, *, url_pattern: str = None, metadata_pattern: str = None, + with_visit: bool = False, scroll_token: str = None, count: int = 50 ) -> Dict[str, object]: matches = (self._origins[id_] for id_ in self._origin_ids) @@ -91,6 +92,9 @@ 'At least one of url_pattern and metadata_pattern ' 'must be provided.') + if with_visit: + matches = filter(lambda o: o.get('has_visits'), matches) + if scroll_token: scroll_token = msgpack.loads(base64.b64decode(scroll_token)) start_at_index = scroll_token[b'start_at_index'] diff --git a/swh/search/journal_client.py b/swh/search/journal_client.py --- a/swh/search/journal_client.py +++ b/swh/search/journal_client.py @@ -6,9 +6,9 @@ import logging -MAX_ORIGINS_PER_TASK = 100 - -EXPECTED_MESSAGE_TYPES = {'origin', 'origin_intrinsic_metadata'} +EXPECTED_MESSAGE_TYPES = { + 'origin', 'origin_visit', 'origin_intrinsic_metadata', +} def process_journal_objects(messages, *, search): @@ -19,6 +19,9 @@ if 'origin' in messages: process_origins(messages['origin'], search) + if 'origin_visit' in messages: + process_origin_visits(messages['origin_visit'], search) + if 'origin_intrinsic_metadata' in messages: process_origin_intrinsic_metadata( messages['origin_intrinsic_metadata'], search) @@ -30,6 +33,18 @@ search.origin_update(origins) +def process_origin_visits(visits, search): + logging.debug('processing origin visits %r', visits) + + search.origin_update([ + { + 'url': visit['origin']['url'], + 'has_visits': True + } + for visit in visits + ]) + + def process_origin_intrinsic_metadata(origin_metadata, search): logging.debug('processing origin intrinsic_metadata %r', origin_metadata) diff --git a/swh/search/tests/test_journal_client.py b/swh/search/tests/test_journal_client.py --- a/swh/search/tests/test_journal_client.py +++ b/swh/search/tests/test_journal_client.py @@ -37,6 +37,23 @@ {'url': 'http://barbaz.qux'}, ]) + def test_origin_visit_from_journal(self): + search_mock = MagicMock() + + worker_fn = functools.partial( + process_journal_objects, + search=search_mock, + ) + + worker_fn({'origin_visit': [ + { + 'origin': {'url': 'http://foobar.baz'}, + } + ]}) + search_mock.origin_update.assert_called_once_with([ + {'url': 'http://foobar.baz', 'has_visits': True}, + ]) + def test_origin_metadata_from_journal(self): search_mock = MagicMock() diff --git a/swh/search/tests/test_search.py b/swh/search/tests/test_search.py --- a/swh/search/tests/test_search.py +++ b/swh/search/tests/test_search.py @@ -54,6 +54,34 @@ expected_results = ['http://barbaz.qux', 'http://qux.quux'] assert sorted(results) == sorted(expected_results) + def test_origin_with_visit(self): + self.search.origin_update([ + {'url': 'http://foobar.baz', 'has_visits': True}, + ]) + + results = self.search.origin_search( + url_pattern='foobar', with_visit=True) + assert results == {'scroll_token': None, 'results': [ + {'url': 'http://foobar.baz'}]} + + def test_origin_with_visit_added(self): + self.search.origin_update([ + {'url': 'http://foobar.baz'}, + ]) + + results = self.search.origin_search( + url_pattern='foobar', with_visit=True) + assert results == {'scroll_token': None, 'results': []} + + self.search.origin_update([ + {'url': 'http://foobar.baz', 'has_visits': True}, + ]) + + results = self.search.origin_search( + url_pattern='foobar', with_visit=True) + assert results == {'scroll_token': None, 'results': [ + {'url': 'http://foobar.baz'}]} + def test_origin_intrinsic_metadata_description(self): self.search.origin_update([ {