Changeset View
Changeset View
Standalone View
Standalone View
swh/storage/algos/origin.py
# Copyright (C) 2019 The Software Heritage developers | # Copyright (C) 2019 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
def iter_origins(storage, origin_from=1, origin_to=None, batch_size=10000):
    """Iterates over all origins in the storage.

    Origins are fetched in batches through ``storage.origin_get_range`` and
    yielded one by one, without their ``id`` key.

    Args:
        storage: the storage object used for queries.
        origin_from: id passed as ``origin_from`` to the first query.
        origin_to: optional upper bound on origin ids. Each query asks for
            at most ``origin_to - <current start id>`` origins, so the bound
            is presumably exclusive (depends on ``origin_get_range``
            semantics -- to be confirmed). A falsy value (``None`` or ``0``)
            means no upper bound.
        batch_size: number of origins per query

    Yields:
        dict: the origin dictionary with the keys:

        - type: origin's type
        - url: origin's url

    """
    start = origin_from
    while True:
        if origin_to:
            origin_count = min(origin_to - start, batch_size)
            if origin_count <= 0:
                # Nothing left below the upper bound: stop here instead of
                # sending a zero or negative count to the storage.
                return
        else:
            origin_count = batch_size

        origins = list(storage.origin_get_range(
            origin_from=start, origin_count=origin_count))
        if not origins:
            return

        # The next batch starts right after the last id seen in this one.
        start = origins[-1]['id'] + 1

        for origin in origins:
            # Yield a copy without the id rather than deleting the key in
            # place, so dicts owned by the storage are never mutated.
            yield {key: value for key, value in origin.items()
                   if key != 'id'}