diff --git a/MANIFEST.in b/MANIFEST.in index b13dc0f..81c908b 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,5 +1,6 @@ include Makefile include README include requirements.txt include requirements-swh.txt +include requirements-test.txt include version.txt diff --git a/PKG-INFO b/PKG-INFO index 333fe3a..08d1b5d 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,10 +1,10 @@ Metadata-Version: 1.0 Name: swh.lister -Version: 0.0.9 +Version: 0.0.10 Summary: Software Heritage GitHub lister Home-page: https://forge.softwareheritage.org/diffusion/DLSGH/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Description: UNKNOWN Platform: UNKNOWN diff --git a/debian/control b/debian/control index 367b702..ab4b458 100644 --- a/debian/control +++ b/debian/control @@ -1,27 +1,31 @@ Source: swh-lister Maintainer: Software Heritage developers Section: python Priority: optional Build-Depends: debhelper (>= 9), dh-python (>= 2), python3-all, python3-dateutil, + python3-debian, python3-nose, + python3-requests-mock, python3-setuptools, python3-sqlalchemy (>= 1.0), python3-swh.core, python3-swh.scheduler (>= 0.0.14~), python3-swh.storage, python3-swh.storage.schemata, - python3-vcversioner + python3-testing.postgresql, + python3-vcversioner, + python3-xmltodict Standards-Version: 3.9.6 Homepage: https://forge.softwareheritage.org/source/swh-lister/ Package: python3-swh.lister Architecture: all Depends: python3-swh.scheduler (>= 0.0.14~), ${misc:Depends}, ${python3:Depends} Breaks: python3-swh.lister.github Replaces: python3-swh.lister.github Description: Software Heritage lister diff --git a/requirements-swh.txt b/requirements-swh.txt index 47175d4..c08589d 100644 --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,3 +1,3 @@ swh.core -swh.storage >= 0.0.76 -swh.scheduler[schemata] >= 0.0.14 +swh.storage[schemata] >= 0.0.76 +swh.scheduler >= 0.0.14 diff --git a/requirements-test.txt b/requirements-test.txt new file mode 100644 index 0000000..3eb878a --- /dev/null +++ b/requirements-test.txt @@ -0,0 +1,3 @@ +nose +requests_mock +testing.postgresql diff --git a/requirements.txt b/requirements.txt index 9a284d8..4f3f588 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,6 @@ +SQLAlchemy arrow -nose +python_debian requests -requests_mock setuptools -SQLAlchemy -testing.postgresql xmltodict diff --git a/setup.py b/setup.py index e1d86ac..42e154f 100644 --- a/setup.py +++ b/setup.py @@ -1,30 +1,41 @@ #!/usr/bin/env python3 +import os + from setuptools import setup, find_packages -def parse_requirements(): +def parse_requirements(name=None): + if name: + reqf = 'requirements-%s.txt' % name + else: + reqf = 'requirements.txt' + requirements = [] - for reqf in ('requirements.txt', 'requirements-swh.txt'): - with open(reqf) as f: - for line in f.readlines(): - line = line.strip() - if not line or line.startswith('#'): - continue - requirements.append(line) + if not os.path.exists(reqf): + return requirements + + with open(reqf) as f: + for line in f.readlines(): + line = line.strip() + if not line or line.startswith('#'): + continue + requirements.append(line) return requirements setup( name='swh.lister', description='Software Heritage GitHub lister', author='Software Heritage developers', author_email='swh-devel@inria.fr', url='https://forge.softwareheritage.org/diffusion/DLSGH/', packages=find_packages(), scripts=['bin/ghlister'], - install_requires=parse_requirements(), + install_requires=parse_requirements() + parse_requirements('swh'), + test_requires=parse_requirements('test'), + test_suite='nose.collector', setup_requires=['vcversioner'], vcversioner={'version_module_paths': ['swh/lister/_version.py']}, include_package_data=True, ) diff --git a/swh.lister.egg-info/PKG-INFO b/swh.lister.egg-info/PKG-INFO index 333fe3a..08d1b5d 100644 --- a/swh.lister.egg-info/PKG-INFO +++ b/swh.lister.egg-info/PKG-INFO @@ -1,10 +1,10 @@ Metadata-Version: 1.0 Name: swh.lister -Version: 0.0.9 +Version: 0.0.10 Summary: Software Heritage GitHub lister Home-page: https://forge.softwareheritage.org/diffusion/DLSGH/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Description: UNKNOWN Platform: UNKNOWN diff --git a/swh.lister.egg-info/SOURCES.txt b/swh.lister.egg-info/SOURCES.txt index 91618ee..f8ca453 100644 --- a/swh.lister.egg-info/SOURCES.txt +++ b/swh.lister.egg-info/SOURCES.txt @@ -1,62 +1,72 @@ .gitignore ACKNOWLEDGEMENTS LICENSE MANIFEST.in Makefile README TODO requirements-swh.txt +requirements-test.txt requirements.txt setup.py version.txt bin/batch bin/ghlister bin/reset.sh bin/status debian/changelog debian/clean debian/compat debian/control debian/copyright debian/rules debian/source/format docs/.gitignore docs/Makefile docs/conf.py docs/index.rst docs/_static/.placeholder docs/_templates/.placeholder etc/crontab sql/crawler.sql sql/pimp_db.sql swh/__init__.py swh.lister.egg-info/PKG-INFO swh.lister.egg-info/SOURCES.txt swh.lister.egg-info/dependency_links.txt swh.lister.egg-info/requires.txt swh.lister.egg-info/top_level.txt +swh/lister/__init__.py +swh/lister/_version.py +swh/lister/bitbucket/__init__.py swh/lister/bitbucket/lister.py swh/lister/bitbucket/models.py swh/lister/bitbucket/tasks.py +swh/lister/bitbucket/tests/__init__.py swh/lister/bitbucket/tests/api_empty_response.json swh/lister/bitbucket/tests/api_response.json swh/lister/bitbucket/tests/test_bb_lister.py +swh/lister/core/__init__.py swh/lister/core/abstractattribute.py swh/lister/core/db_utils.py swh/lister/core/indexing_lister.py swh/lister/core/lister_base.py swh/lister/core/lister_transports.py swh/lister/core/models.py swh/lister/core/tasks.py +swh/lister/core/tests/__init__.py swh/lister/core/tests/test_abstractattribute.py swh/lister/core/tests/test_lister.py swh/lister/core/tests/test_model.py +swh/lister/debian/__init__.py swh/lister/debian/lister.py swh/lister/debian/tasks.py swh/lister/debian/utils.py +swh/lister/github/__init__.py swh/lister/github/lister.py swh/lister/github/models.py swh/lister/github/tasks.py +swh/lister/github/tests/__init__.py swh/lister/github/tests/api_empty_response.json swh/lister/github/tests/api_response.json swh/lister/github/tests/test_gh_lister.py \ No newline at end of file diff --git a/swh.lister.egg-info/requires.txt b/swh.lister.egg-info/requires.txt index 1dd5d45..dfd633a 100644 --- a/swh.lister.egg-info/requires.txt +++ b/swh.lister.egg-info/requires.txt @@ -1,11 +1,9 @@ SQLAlchemy arrow -nose +python_debian requests -requests_mock setuptools swh.core -swh.scheduler[schemata]>=0.0.14 -swh.storage>=0.0.76 -testing.postgresql +swh.scheduler>=0.0.14 +swh.storage[schemata]>=0.0.76 xmltodict diff --git a/swh/lister/__init__.py b/swh/lister/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/swh/lister/_version.py b/swh/lister/_version.py new file mode 100644 index 0000000..9af018d --- /dev/null +++ b/swh/lister/_version.py @@ -0,0 +1,5 @@ + +# This file is automatically generated by setup.py. +__version__ = '0.0.10' +__sha__ = 'g9b58ecb' +__revision__ = 'g9b58ecb' diff --git a/swh/lister/bitbucket/__init__.py b/swh/lister/bitbucket/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/swh/lister/bitbucket/tests/__init__.py b/swh/lister/bitbucket/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/swh/lister/core/__init__.py b/swh/lister/core/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/swh/lister/core/tests/__init__.py b/swh/lister/core/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/swh/lister/core/tests/test_lister.py b/swh/lister/core/tests/test_lister.py index d158058..6bc0259 100644 --- a/swh/lister/core/tests/test_lister.py +++ b/swh/lister/core/tests/test_lister.py @@ -1,229 +1,231 @@ # Copyright (C) 2017 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import abc import time from unittest import TestCase from unittest.mock import Mock, patch import requests_mock -import testing.postgresql +from testing.postgresql import Postgresql from nose.tools import istest from sqlalchemy import create_engine from swh.lister.core.abstractattribute import AbstractAttribute def noop(*args, **kwargs): pass @requests_mock.Mocker() class IndexingHttpListerTesterBase(abc.ABC): """Base testing class for subclasses of swh.lister.core.indexing_lister.SWHIndexingHttpLister. See swh.lister.github.tests.test_gh_lister for an example of how to customize for a specific listing service. """ Lister = AbstractAttribute('The lister class to test') test_re = AbstractAttribute('Compiled regex matching the server url. Must' ' capture the index value.') lister_subdir = AbstractAttribute('bitbucket, github, etc.') good_api_response_file = AbstractAttribute('Example good response body') bad_api_response_file = AbstractAttribute('Example bad response body') first_index = AbstractAttribute('First index in good_api_response') last_index = AbstractAttribute('Last index in good_api_response') entries_per_page = AbstractAttribute('Number of results in good response') # May need to override this if the headers are used for something def response_headers(self, request): return {} # May need to override this if the server uses non-standard rate limiting # method. # Please keep the requested retry delay reasonably low. def mock_rate_quota(self, n, request, context): self.rate_limit += 1 context.status_code = 429 context.headers['Retry-After'] = '1' return '{"error":"dummy"}' def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.rate_limit = 1 self.response = None self.fl = None self.helper = None if self.__class__ != IndexingHttpListerTesterBase: self.run = TestCase.run.__get__(self, self.__class__) else: self.run = noop def request_index(self, request): m = self.test_re.search(request.path_url) if m and (len(m.groups()) > 0): return m.group(1) else: return None def mock_response(self, request, context): self.fl.reset_backoff() self.rate_limit = 1 context.status_code = 200 custom_headers = self.response_headers(request) context.headers.update(custom_headers) if self.request_index(request) == str(self.first_index): with open('swh/lister/%s/tests/%s' % (self.lister_subdir, self.good_api_response_file), - 'r') as r: + 'r', encoding='utf-8') as r: return r.read() else: with open('swh/lister/%s/tests/%s' % (self.lister_subdir, self.bad_api_response_file), - 'r') as r: + 'r', encoding='utf-8') as r: return r.read() def mock_limit_n_response(self, n, request, context): self.fl.reset_backoff() if self.rate_limit <= n: return self.mock_rate_quota(n, request, context) else: return self.mock_response(request, context) def mock_limit_once_response(self, request, context): return self.mock_limit_n_response(1, request, context) def mock_limit_twice_response(self, request, context): return self.mock_limit_n_response(2, request, context) def get_fl(self, override_config=None): if override_config or self.fl is None: with patch( 'swh.scheduler.backend.SchedulerBackend.reconnect', noop ): self.fl = self.Lister(lister_name='fakelister', api_baseurl='https://fakeurl', override_config=override_config) self.fl.INITIAL_BACKOFF = 1 self.fl.reset_backoff() return self.fl def get_api_response(self): fl = self.get_fl() if self.response is None: self.response = fl.safely_issue_request(self.first_index) return self.response @istest def test_is_within_bounds(self, http_mocker): fl = self.get_fl() self.assertFalse(fl.is_within_bounds(1, 2, 3)) self.assertTrue(fl.is_within_bounds(2, 1, 3)) self.assertTrue(fl.is_within_bounds(1, 1, 1)) self.assertTrue(fl.is_within_bounds(1, None, None)) self.assertTrue(fl.is_within_bounds(1, None, 2)) self.assertTrue(fl.is_within_bounds(1, 0, None)) self.assertTrue(fl.is_within_bounds("b", "a", "c")) self.assertFalse(fl.is_within_bounds("a", "b", "c")) self.assertTrue(fl.is_within_bounds("a", None, "c")) self.assertTrue(fl.is_within_bounds("a", None, None)) self.assertTrue(fl.is_within_bounds("b", "a", None)) self.assertFalse(fl.is_within_bounds("a", "b", None)) self.assertTrue(fl.is_within_bounds("aa:02", "aa:01", "aa:03")) self.assertFalse(fl.is_within_bounds("aa:12", None, "aa:03")) with self.assertRaises(TypeError): fl.is_within_bounds(1.0, "b", None) with self.assertRaises(TypeError): fl.is_within_bounds("A:B", "A::B", None) @istest def test_api_request(self, http_mocker): http_mocker.get(self.test_re, text=self.mock_limit_twice_response) with patch.object(time, 'sleep', wraps=time.sleep) as sleepmock: self.get_api_response() self.assertEqual(sleepmock.call_count, 2) @istest def test_repos_list(self, http_mocker): http_mocker.get(self.test_re, text=self.mock_response) li = self.get_fl().transport_response_simplified( self.get_api_response() ) self.assertIsInstance(li, list) self.assertEqual(len(li), self.entries_per_page) @istest def test_model_map(self, http_mocker): http_mocker.get(self.test_re, text=self.mock_response) fl = self.get_fl() li = fl.transport_response_simplified(self.get_api_response()) di = li[0] self.assertIsInstance(di, dict) pubs = [k for k in vars(fl.MODEL).keys() if not k.startswith('_')] for k in pubs: if k not in ['last_seen', 'task_id', 'origin_id']: self.assertIn(k, di) def disable_storage_and_scheduler(self, fl): fl.create_missing_origins_and_tasks = Mock(return_value=None) def disable_db(self, fl): fl.winnow_models = Mock(return_value=[]) fl.db_inject_repo = Mock(return_value=fl.MODEL()) fl.disable_deleted_repo_tasks = Mock(return_value=None) @istest def test_fetch_none_nodb(self, http_mocker): http_mocker.get(self.test_re, text=self.mock_response) fl = self.get_fl() self.disable_storage_and_scheduler(fl) self.disable_db(fl) fl.run(min_index=1, max_index=1) # stores no results @istest def test_fetch_one_nodb(self, http_mocker): http_mocker.get(self.test_re, text=self.mock_response) fl = self.get_fl() self.disable_storage_and_scheduler(fl) self.disable_db(fl) fl.run(min_index=self.first_index, max_index=self.first_index) @istest def test_fetch_multiple_pages_nodb(self, http_mocker): http_mocker.get(self.test_re, text=self.mock_response) fl = self.get_fl() self.disable_storage_and_scheduler(fl) self.disable_db(fl) fl.run(min_index=self.first_index) def init_db(self, db, model): engine = create_engine(db.url()) model.metadata.create_all(engine) @istest def test_fetch_multiple_pages_yesdb(self, http_mocker): http_mocker.get(self.test_re, text=self.mock_response) - db = testing.postgresql.Postgresql() + initdb_args = Postgresql.DEFAULT_SETTINGS['initdb_args'] + initdb_args = ' '.join([initdb_args, '-E UTF-8']) + db = Postgresql(initdb_args=initdb_args) fl = self.get_fl(override_config={'lister_db_url': db.url()}) self.init_db(db, fl.MODEL) self.disable_storage_and_scheduler(fl) fl.run(min_index=self.first_index) self.assertEqual(fl.db_last_index(), self.last_index) partitions = fl.db_partition_indices(5) self.assertGreater(len(partitions), 0) for k in partitions: self.assertLessEqual(len(k), 5) self.assertGreater(len(k), 0) diff --git a/swh/lister/debian/__init__.py b/swh/lister/debian/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/swh/lister/debian/tasks.py b/swh/lister/debian/tasks.py index 39aa6a2..cdac167 100644 --- a/swh/lister/debian/tasks.py +++ b/swh/lister/debian/tasks.py @@ -1,16 +1,18 @@ # Copyright (C) 2017 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from swh.lister.core.tasks import ListerTaskBase from .lister import DebianLister class DebianListerTask(ListerTaskBase): + task_queue = 'swh_lister_debian' + def new_lister(self): return DebianLister() def run_task(self, distribution): lister = self.new_lister() return lister.run(distribution) diff --git a/swh/lister/github/__init__.py b/swh/lister/github/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/swh/lister/github/tests/__init__.py b/swh/lister/github/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/version.txt b/version.txt index a83902a..88856ca 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -v0.0.9-0-g458a9e6 \ No newline at end of file +v0.0.10-0-g9b58ecb \ No newline at end of file