"""Print the URLs requested on the origin endpoints of the swh web application.

The script queries the ``systemlogs-*`` Elasticsearch indices for log entries
emitted by the ``gunicorn-swh-webapp.service`` systemd unit on host ``moma``
and prints the ``swh_atoms_u`` field of every matching entry.  Results are
fetched page by page through the Elasticsearch scroll API.
"""
import json
from pprint import pprint

import requests

ES_BASE_URL = 'http://esnode1.internal.softwareheritage.org:9200'
# How long Elasticsearch keeps the scroll context alive between two pages.
SCROLL_KEEP_ALIVE = '1m'
SEARCH_URL = '{}/systemlogs-*/_search?scroll={}'.format(
    ES_BASE_URL, SCROLL_KEEP_ALIVE)
SCROLL_URL = '{}/_search/scroll'.format(ES_BASE_URL)
# (connect, read) timeouts in seconds, so that a stalled node raises an
# exception instead of blocking the script forever.
REQUEST_TIMEOUT = (10, 300)

query = {
    # Number of hits returned per page.
    'size': 1000,
    'sort': ['@timestamp'],
    'query': {
        'bool': {
            'must': [
                {
                    'query_string': {
                        'query': 'hostname:moma AND systemd_unit:"gunicorn-swh-webapp.service"',
                        'analyze_wildcard': True,
                        'default_field': '*'
                    }
                },
                # NOTE(review): the two match_phrase clauses below look
                # redundant with the query_string above; kept unchanged.
                {
                    'match_phrase': {
                        'hostname': {
                            'query': 'moma'
                        }
                    }
                },
                {
                    'match_phrase': {
                        'systemd_unit': {
                            'query': 'gunicorn-swh-webapp.service'
                        }
                    }
                },
                {
                    'query_string': {
                        # get requests to origin endpoints, discarding
                        # the pid resolving ones
                        'query': 'origin AND NOT swh',
                        'default_field': 'swh_atoms_u'
                    }
                },
                {
                    'range': {
                        # 365-day window expressed in epoch milliseconds
                        # (2018-09-09 to 2019-09-09, UTC).
                        '@timestamp': {
                            'gte': 1536487687063,
                            'lte': 1568023687063,
                            'format': 'epoch_millis'
                        }
                    }
                }
            ]
        }
    }
}


def print_requested_urls(response):
    """Print the ``swh_atoms_u`` field of every hit found in a response.

    Args:
        response: HTTP response whose ``text`` attribute holds the JSON body
            returned by an Elasticsearch search or scroll request.

    Returns:
        dict: the decoded JSON body, so that the caller can read the
        ``_scroll_id`` and check whether any hits remain.

    Raises:
        KeyError: if the body has no ``hits`` section or a hit lacks the
            ``swh_atoms_u`` field.
    """
    results = json.loads(response.text)
    for result in results['hits']['hits']:
        print(result['_source']['swh_atoms_u'])
    return results


def _post(session, url, payload):
    """POST a JSON payload and return the response, raising on HTTP errors."""
    response = session.post(url, json=payload, timeout=REQUEST_TIMEOUT)
    # An Elasticsearch error body has no 'hits' key; fail here with the HTTP
    # status rather than later with an unrelated KeyError.
    response.raise_for_status()
    return response


def main():
    """Run the search and print the requested URLs of every result page."""
    # A single session reuses the same connection for all scroll requests.
    with requests.Session() as session:
        results = print_requested_urls(_post(session, SEARCH_URL, query))
        # An empty page of hits means the scroll is exhausted.
        while results['hits']['hits']:
            scroll_query = {
                'scroll': SCROLL_KEEP_ALIVE,
                'scroll_id': results['_scroll_id']
            }
            results = print_requested_urls(
                _post(session, SCROLL_URL, scroll_query))


if __name__ == '__main__':
    main()