diff --git a/requirements-test.txt b/requirements-test.txt
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -1,3 +1,4 @@
 Click
 dulwich
 pytest
+pytz
diff --git a/swh/model/tests/generate_testdata.py b/swh/model/tests/generate_testdata.py
new file mode 100644
--- /dev/null
+++ b/swh/model/tests/generate_testdata.py
@@ -0,0 +1,82 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+"""Generators of random origin and content dicts for tests."""
+
+from datetime import datetime
+from random import choice, randint, random, shuffle
+from typing import Dict, Iterator, List
+
+from pytz import all_timezones, timezone
+
+from swh.model.hashutil import MultiHash
+
+
+PROTOCOLS = ['git', 'http', 'https', 'deb', 'svn', 'mock']
+DOMAINS = ['example.com', 'some.long.host.name', 'xn--n28h.tld']
+PATHS = ['', '/', '/stuff', '/stuff/',
+         '/path/to/resource',
+         '/path/with/anchor#id=42',
+         '/path/with/qargs?q=1&b']
+CONTENT_STATUS = ['visible', 'hidden', 'absent']
+MAX_DATE = 3e9  # around 2065
+
+
+def gen_all_origins() -> Iterator[Dict[str, str]]:
+    """Yields one origin dict for each protocol/domain/path combination."""
+    for protocol in PROTOCOLS:
+        for domain in DOMAINS:
+            for urlpath in PATHS:
+                yield {'url': '%s://%s%s' % (protocol, domain, urlpath)}
+
+
+ORIGINS = list(gen_all_origins())
+
+
+def gen_origins(n: int = 100) -> List[Dict[str, str]]:
+    """Returns a list of at most n randomly picked, distinct origins suitable
+    for using as Storage.add_origin() argument.
+
+    The origins are drawn from ORIGINS, so the list never holds more than
+    len(ORIGINS) items. A negative n gives an empty list.
+
+    """
+    origins = ORIGINS[:]
+    shuffle(origins)
+    # Clamp n: a negative slice bound would otherwise return all but the
+    # last |n| origins instead of none.
+    return origins[:max(n, 0)]
+
+
+def gen_content() -> Dict:
+    """Returns one randomly generated content object (as dict).
+
+    The dict holds the random 'data' bytes, their 'length', a random
+    'status' and 'ctime', plus the entries of MultiHash.digest().
+    """
+    size = randint(1, 10 * 1024)
+    data = bytes(randint(0, 255) for _ in range(size))
+    status = choice(CONTENT_STATUS)
+    h = MultiHash.from_data(data)
+    ctime = datetime.fromtimestamp(
+        random() * MAX_DATE, timezone(choice(all_timezones)))
+    content = {'data': data,
+               'status': status,
+               'length': size,
+               'ctime': ctime,
+               **h.digest()}
+    if status == 'absent':
+        # The data is blanked, while length and hashes still describe the
+        # bytes generated above.
+        content['reason'] = 'why not'
+        content['data'] = b''
+    return content
+
+
+def gen_contents(n: int = 20) -> List[Dict]:
+    """Returns a list of n randomly generated content objects (as dict)
+    suitable for using as Storage.content_add() argument.
+    """
+    return [gen_content() for _ in range(n)]
diff --git a/swh/model/tests/test_generate_testdata.py b/swh/model/tests/test_generate_testdata.py
new file mode 100644
--- /dev/null
+++ b/swh/model/tests/test_generate_testdata.py
@@ -0,0 +1,61 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from .generate_testdata import gen_contents, gen_origins, ORIGINS
+
+from swh.model.model import Origin, Content
+
+
+def test_gen_origins_empty():
+    origins = gen_origins(0)
+    assert not origins
+
+
+def test_gen_origins_negative():
+    # a negative count must yield nothing, not a negative slice of ORIGINS
+    origins = gen_origins(-1)
+    assert not origins
+
+
+def test_gen_origins_one():
+    origins = gen_origins(1)
+    assert len(origins) == 1
+    assert [Origin.from_dict(d) for d in origins]
+
+
+def test_gen_origins_default():
+    origins = gen_origins()
+    assert len(origins) == 100
+    urls = {Origin.from_dict(d).url for d in origins}
+    # ensure we did not generate the same origin twice
+    assert len(origins) == len(urls)
+
+
+def test_gen_origins_max():
+    nmax = len(ORIGINS)
+    origins = gen_origins(nmax + 1)
+    assert len(origins) == nmax
+    urls = {Origin.from_dict(d).url for d in origins}
+    # ensure we did not generate the same origin twice
+    assert len(origins) == len(urls)
+
+
+def test_gen_contents_empty():
+    contents = gen_contents(0)
+    assert not contents
+
+
+def test_gen_contents_one():
+    contents = gen_contents(1)
+    assert len(contents) == 1
+    assert [Content.from_dict(d) for d in contents]
+
+
+def test_gen_contents_default():
+    contents = gen_contents()
+    assert len(contents) == 20
+    models = {Content.from_dict(d) for d in contents}
+    # ensure we did not generate the same content twice
+    assert len(contents) == len(models)