# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information


def iter_origins(storage, batch_size=10000):
    """Lazily walk over every origin held by the storage, batch by batch.

    The storage is queried through ``storage.origin_get_range`` with at most
    ``batch_size`` origins per call; iteration ends on the first empty batch.

    Args:
        storage: the storage object used for queries; it must provide an
            ``origin_get_range(origin_from=..., origin_count=...)`` method.
        batch_size: number of origins requested per query.

    Yields:
        dict: the origin dictionary with the keys:

        - id: origin's id
        - type: origin's type
        - url: origin's url
    """
    next_id = 1
    batch = list(storage.origin_get_range(
        origin_from=next_id, origin_count=batch_size))
    while batch:
        # Resume just past the id of the last origin of this batch.
        # NOTE(review): presumably batches come back sorted by id, so the
        # last element carries the highest id -- confirm against the storage.
        next_id = batch[-1]['id'] + 1
        yield from batch
        batch = list(storage.origin_get_range(
            origin_from=next_id, origin_count=batch_size))