diff --git a/swh/lister/bitbucket/lister.py b/swh/lister/bitbucket/lister.py index 45b573c..5da11b1 100644 --- a/swh/lister/bitbucket/lister.py +++ b/swh/lister/bitbucket/lister.py @@ -1,79 +1,79 @@ # Copyright (C) 2017-2019 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import logging import iso8601 -from datetime import datetime +from datetime import datetime, timezone from urllib import parse from swh.lister.bitbucket.models import BitBucketModel from swh.lister.core.indexing_lister import IndexingHttpLister logger = logging.getLogger(__name__) class BitBucketLister(IndexingHttpLister): PATH_TEMPLATE = '/repositories?after=%s' MODEL = BitBucketModel LISTER_NAME = 'bitbucket' DEFAULT_URL = 'https://api.bitbucket.org/2.0' instance = 'bitbucket' - default_min_bound = datetime.utcfromtimestamp(0) + default_min_bound = datetime.fromtimestamp(0, timezone.utc) def __init__(self, url=None, override_config=None, per_page=100): super().__init__(url=url, override_config=override_config) per_page = self.config.get('per_page', per_page) self.PATH_TEMPLATE = '%s&pagelen=%s' % ( self.PATH_TEMPLATE, per_page) def get_model_from_repo(self, repo): return { 'uid': repo['uuid'], 'indexable': iso8601.parse_date(repo['created_on']), 'name': repo['name'], 'full_name': repo['full_name'], 'html_url': repo['links']['html']['href'], 'origin_url': repo['links']['clone'][0]['href'], 'origin_type': repo['scm'], } def get_next_target_from_response(self, response): """This will read the 'next' link from the api response if any and return it as a datetime. Args: response (Response): requests' response from api call Returns: next date as a datetime """ body = response.json() next_ = body.get('next') if next_ is not None: next_ = parse.urlparse(next_) return iso8601.parse_date(parse.parse_qs(next_.query)['after'][0]) def transport_response_simplified(self, response): repos = response.json()['values'] return [self.get_model_from_repo(repo) for repo in repos] def request_uri(self, identifier): identifier = parse.quote(identifier.isoformat()) return super().request_uri(identifier or '1970-01-01') def is_within_bounds(self, inner, lower=None, upper=None): # values are expected to be datetimes if lower is None and upper is None: ret = True elif lower is None: ret = inner <= upper elif upper is None: ret = inner >= lower else: ret = lower <= inner <= upper return ret diff --git a/swh/lister/bitbucket/tests/test_bb_lister.py b/swh/lister/bitbucket/tests/test_bb_lister.py index e13df23..72ee097 100644 --- a/swh/lister/bitbucket/tests/test_bb_lister.py +++ b/swh/lister/bitbucket/tests/test_bb_lister.py @@ -1,63 +1,73 @@ # Copyright (C) 2017-2019 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import re import unittest from datetime import timedelta from urllib.parse import unquote import iso8601 import requests_mock from swh.lister.bitbucket.lister import BitBucketLister from swh.lister.core.tests.test_lister import HttpListerTester def convert_type(req_index): """Convert the req_index to its right type according to the model's "indexable" column. """ return iso8601.parse_date(unquote(req_index)) class BitBucketListerTester(HttpListerTester, unittest.TestCase): Lister = BitBucketLister test_re = re.compile(r'/repositories\?after=([^?&]+)') lister_subdir = 'bitbucket' good_api_response_file = 'api_response.json' bad_api_response_file = 'api_empty_response.json' first_index = convert_type('2008-07-12T07:44:01.476818+00:00') last_index = convert_type('2008-07-19T06:16:43.044743+00:00') entries_per_page = 10 convert_type = staticmethod(convert_type) + def request_index(self, request): + """(Override) This is needed to emulate the listing bootstrap + when no min_bound is provided to run + """ + m = self.test_re.search(request.path_url) + idx = convert_type(m.group(1)) + if idx == self.Lister.default_min_bound: + idx = self.first_index + return idx + @requests_mock.Mocker() def test_fetch_none_nodb(self, http_mocker): """Overridden because index is not an integer nor a string """ http_mocker.get(self.test_re, text=self.mock_response) fl = self.get_fl() self.disable_scheduler(fl) self.disable_db(fl) # stores no results fl.run(min_bound=self.first_index - timedelta(days=3), max_bound=self.first_index) def test_is_within_bounds(self): fl = self.get_fl() self.assertTrue(fl.is_within_bounds( iso8601.parse_date('2008-07-15'), self.first_index, self.last_index)) self.assertFalse(fl.is_within_bounds( iso8601.parse_date('2008-07-20'), self.first_index, self.last_index)) self.assertFalse(fl.is_within_bounds( iso8601.parse_date('2008-07-11'), self.first_index, self.last_index)) diff --git a/swh/lister/core/tests/test_lister.py b/swh/lister/core/tests/test_lister.py index 7caff6a..08138cf 100644 --- a/swh/lister/core/tests/test_lister.py +++ b/swh/lister/core/tests/test_lister.py @@ -1,375 +1,386 @@ # Copyright (C) 2019 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import abc import datetime import time from unittest import TestCase from unittest.mock import Mock, patch import requests_mock from sqlalchemy import create_engine from swh.lister.core.abstractattribute import AbstractAttribute from swh.lister.tests.test_utils import init_db def noop(*args, **kwargs): pass class HttpListerTesterBase(abc.ABC): """Testing base class for listers. This contains methods for both :class:`HttpSimpleListerTester` and :class:`HttpListerTester`. See :class:`swh.lister.gitlab.tests.test_lister` for an example of how to customize for a specific listing service. """ Lister = AbstractAttribute('The lister class to test') lister_subdir = AbstractAttribute('bitbucket, github, etc.') good_api_response_file = AbstractAttribute('Example good response body') LISTER_NAME = 'fake-lister' # May need to override this if the headers are used for something def response_headers(self, request): return {} # May need to override this if the server uses non-standard rate limiting # method. # Please keep the requested retry delay reasonably low. def mock_rate_quota(self, n, request, context): self.rate_limit += 1 context.status_code = 429 context.headers['Retry-After'] = '1' return '{"error":"dummy"}' def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.rate_limit = 1 self.response = None self.fl = None self.helper = None self.scheduler_tasks = [] if self.__class__ != HttpListerTesterBase: self.run = TestCase.run.__get__(self, self.__class__) else: self.run = noop def mock_limit_n_response(self, n, request, context): self.fl.reset_backoff() if self.rate_limit <= n: return self.mock_rate_quota(n, request, context) else: return self.mock_response(request, context) def mock_limit_twice_response(self, request, context): return self.mock_limit_n_response(2, request, context) def get_api_response(self, identifier): fl = self.get_fl() if self.response is None: self.response = fl.safely_issue_request(identifier) return self.response def get_fl(self, override_config=None): """Retrieve an instance of fake lister (fl). """ if override_config or self.fl is None: self.fl = self.Lister(url='https://fakeurl', override_config=override_config) self.fl.INITIAL_BACKOFF = 1 self.fl.reset_backoff() self.scheduler_tasks = [] return self.fl def disable_scheduler(self, fl): fl.schedule_missing_tasks = Mock(return_value=None) def mock_scheduler(self, fl): def _create_tasks(tasks): task_id = 0 current_nb_tasks = len(self.scheduler_tasks) if current_nb_tasks > 0: task_id = self.scheduler_tasks[-1]['id'] + 1 for task in tasks: scheduler_task = dict(task) scheduler_task.update({ 'status': 'next_run_not_scheduled', 'retries_left': 0, 'priority': None, 'id': task_id, 'current_interval': datetime.timedelta(days=64) }) self.scheduler_tasks.append(scheduler_task) task_id = task_id + 1 return self.scheduler_tasks[current_nb_tasks:] def _disable_tasks(task_ids): for task_id in task_ids: self.scheduler_tasks[task_id]['status'] = 'disabled' fl.scheduler.create_tasks = Mock(wraps=_create_tasks) fl.scheduler.disable_tasks = Mock(wraps=_disable_tasks) def disable_db(self, fl): fl.winnow_models = Mock(return_value=[]) fl.db_inject_repo = Mock(return_value=fl.MODEL()) fl.disable_deleted_repo_tasks = Mock(return_value=None) def init_db(self, db, model): engine = create_engine(db.url()) model.metadata.create_all(engine) @requests_mock.Mocker() def test_is_within_bounds(self, http_mocker): fl = self.get_fl() self.assertFalse(fl.is_within_bounds(1, 2, 3)) self.assertTrue(fl.is_within_bounds(2, 1, 3)) self.assertTrue(fl.is_within_bounds(1, 1, 1)) self.assertTrue(fl.is_within_bounds(1, None, None)) self.assertTrue(fl.is_within_bounds(1, None, 2)) self.assertTrue(fl.is_within_bounds(1, 0, None)) self.assertTrue(fl.is_within_bounds("b", "a", "c")) self.assertFalse(fl.is_within_bounds("a", "b", "c")) self.assertTrue(fl.is_within_bounds("a", None, "c")) self.assertTrue(fl.is_within_bounds("a", None, None)) self.assertTrue(fl.is_within_bounds("b", "a", None)) self.assertFalse(fl.is_within_bounds("a", "b", None)) self.assertTrue(fl.is_within_bounds("aa:02", "aa:01", "aa:03")) self.assertFalse(fl.is_within_bounds("aa:12", None, "aa:03")) with self.assertRaises(TypeError): fl.is_within_bounds(1.0, "b", None) with self.assertRaises(TypeError): fl.is_within_bounds("A:B", "A::B", None) class HttpListerTester(HttpListerTesterBase, abc.ABC): """Base testing class for subclass of :class:`swh.lister.core.indexing_lister.IndexingHttpLister` See :class:`swh.lister.github.tests.test_gh_lister` for an example of how to customize for a specific listing service. """ last_index = AbstractAttribute('Last index in good_api_response') first_index = AbstractAttribute('First index in good_api_response') bad_api_response_file = AbstractAttribute('Example bad response body') entries_per_page = AbstractAttribute('Number of results in good response') test_re = AbstractAttribute('Compiled regex matching the server url. Must' ' capture the index value.') convert_type = str """static method used to convert the "request_index" to its right type (for indexing listers for example, this is in accordance with the model's "indexable" column). """ def mock_response(self, request, context): self.fl.reset_backoff() self.rate_limit = 1 context.status_code = 200 custom_headers = self.response_headers(request) context.headers.update(custom_headers) req_index = self.request_index(request) if req_index == self.first_index: response_file = self.good_api_response_file else: response_file = self.bad_api_response_file with open('swh/lister/%s/tests/%s' % (self.lister_subdir, response_file), 'r', encoding='utf-8') as r: return r.read() def request_index(self, request): m = self.test_re.search(request.path_url) if m and (len(m.groups()) > 0): return self.convert_type(m.group(1)) def create_fl_with_db(self, http_mocker): http_mocker.get(self.test_re, text=self.mock_response) db = init_db() fl = self.get_fl(override_config={ 'lister': { 'cls': 'local', 'args': {'db': db.url()} } }) fl.db = db self.init_db(db, fl.MODEL) self.mock_scheduler(fl) return fl + @requests_mock.Mocker() + def test_fetch_no_bounds_yesdb(self, http_mocker): + fl = self.create_fl_with_db(http_mocker) + + fl.run() + + self.assertEqual(fl.db_last_index(), self.last_index) + ingested_repos = list(fl.db_query_range(self.first_index, + self.last_index)) + self.assertEqual(len(ingested_repos), self.entries_per_page) + @requests_mock.Mocker() def test_fetch_multiple_pages_yesdb(self, http_mocker): fl = self.create_fl_with_db(http_mocker) fl.run(min_bound=self.first_index) self.assertEqual(fl.db_last_index(), self.last_index) partitions = fl.db_partition_indices(5) self.assertGreater(len(partitions), 0) for k in partitions: self.assertLessEqual(len(k), 5) self.assertGreater(len(k), 0) @requests_mock.Mocker() def test_fetch_none_nodb(self, http_mocker): http_mocker.get(self.test_re, text=self.mock_response) fl = self.get_fl() self.disable_scheduler(fl) self.disable_db(fl) fl.run(min_bound=1, max_bound=1) # stores no results # FIXME: Determine what this method tries to test and add checks to # actually test @requests_mock.Mocker() def test_fetch_one_nodb(self, http_mocker): http_mocker.get(self.test_re, text=self.mock_response) fl = self.get_fl() self.disable_scheduler(fl) self.disable_db(fl) fl.run(min_bound=self.first_index, max_bound=self.first_index) # FIXME: Determine what this method tries to test and add checks to # actually test @requests_mock.Mocker() def test_fetch_multiple_pages_nodb(self, http_mocker): http_mocker.get(self.test_re, text=self.mock_response) fl = self.get_fl() self.disable_scheduler(fl) self.disable_db(fl) fl.run(min_bound=self.first_index) # FIXME: Determine what this method tries to test and add checks to # actually test @requests_mock.Mocker() def test_repos_list(self, http_mocker): """Test the number of repos listed by the lister """ http_mocker.get(self.test_re, text=self.mock_response) li = self.get_fl().transport_response_simplified( self.get_api_response(self.first_index) ) self.assertIsInstance(li, list) self.assertEqual(len(li), self.entries_per_page) @requests_mock.Mocker() def test_model_map(self, http_mocker): """Check if all the keys of model are present in the model created by the `transport_response_simplified` """ http_mocker.get(self.test_re, text=self.mock_response) fl = self.get_fl() li = fl.transport_response_simplified( self.get_api_response(self.first_index)) di = li[0] self.assertIsInstance(di, dict) pubs = [k for k in vars(fl.MODEL).keys() if not k.startswith('_')] for k in pubs: if k not in ['last_seen', 'task_id', 'id']: self.assertIn(k, di) @requests_mock.Mocker() def test_api_request(self, http_mocker): """Test API request for rate limit handling """ http_mocker.get(self.test_re, text=self.mock_limit_twice_response) with patch.object(time, 'sleep', wraps=time.sleep) as sleepmock: self.get_api_response(self.first_index) self.assertEqual(sleepmock.call_count, 2) class HttpSimpleListerTester(HttpListerTesterBase, abc.ABC): """Base testing class for subclass of :class:`swh.lister.core.simple)_lister.SimpleLister` See :class:`swh.lister.pypi.tests.test_lister` for an example of how to customize for a specific listing service. """ entries = AbstractAttribute('Number of results in good response') PAGE = AbstractAttribute("The server api's unique page to retrieve and " "parse for information") def get_fl(self, override_config=None): """Retrieve an instance of fake lister (fl). """ if override_config or self.fl is None: self.fl = self.Lister( override_config=override_config) self.fl.INITIAL_BACKOFF = 1 self.fl.reset_backoff() return self.fl def mock_response(self, request, context): self.fl.reset_backoff() self.rate_limit = 1 context.status_code = 200 custom_headers = self.response_headers(request) context.headers.update(custom_headers) response_file = self.good_api_response_file with open('swh/lister/%s/tests/%s' % (self.lister_subdir, response_file), 'r', encoding='utf-8') as r: return r.read() @requests_mock.Mocker() def test_api_request(self, http_mocker): """Test API request for rate limit handling """ http_mocker.get(self.PAGE, text=self.mock_limit_twice_response) with patch.object(time, 'sleep', wraps=time.sleep) as sleepmock: self.get_api_response(0) self.assertEqual(sleepmock.call_count, 2) @requests_mock.Mocker() def test_model_map(self, http_mocker): """Check if all the keys of model are present in the model created by the `transport_response_simplified` """ http_mocker.get(self.PAGE, text=self.mock_response) fl = self.get_fl() li = fl.list_packages(self.get_api_response(0)) li = fl.transport_response_simplified(li) di = li[0] self.assertIsInstance(di, dict) pubs = [k for k in vars(fl.MODEL).keys() if not k.startswith('_')] for k in pubs: if k not in ['last_seen', 'task_id', 'id']: self.assertIn(k, di) @requests_mock.Mocker() def test_repos_list(self, http_mocker): """Test the number of packages listed by the lister """ http_mocker.get(self.PAGE, text=self.mock_response) li = self.get_fl().list_packages( self.get_api_response(0) ) self.assertIsInstance(li, list) self.assertEqual(len(li), self.entries) diff --git a/swh/lister/github/lister.py b/swh/lister/github/lister.py index 173eb9d..4c0ed69 100644 --- a/swh/lister/github/lister.py +++ b/swh/lister/github/lister.py @@ -1,50 +1,51 @@ # Copyright (C) 2017-2019 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import re import time from swh.lister.core.indexing_lister import IndexingHttpLister from swh.lister.github.models import GitHubModel class GitHubLister(IndexingHttpLister): PATH_TEMPLATE = '/repositories?since=%d' MODEL = GitHubModel DEFAULT_URL = 'https://api.github.com' API_URL_INDEX_RE = re.compile(r'^.*/repositories\?since=(\d+)') LISTER_NAME = 'github' instance = 'github' # There is only 1 instance of such lister + default_min_bound = 0 def get_model_from_repo(self, repo): return { 'uid': repo['id'], 'indexable': repo['id'], 'name': repo['name'], 'full_name': repo['full_name'], 'html_url': repo['html_url'], 'origin_url': repo['html_url'], 'origin_type': 'git', 'fork': repo['fork'], } def transport_quota_check(self, response): reqs_remaining = int(response.headers['X-RateLimit-Remaining']) if response.status_code == 403 and reqs_remaining == 0: reset_at = int(response.headers['X-RateLimit-Reset']) delay = min(reset_at - time.time(), 3600) return True, delay return False, 0 def get_next_target_from_response(self, response): if 'next' in response.links: next_url = response.links['next']['url'] return int(self.API_URL_INDEX_RE.match(next_url).group(1)) def transport_response_simplified(self, response): repos = response.json() return [self.get_model_from_repo(repo) for repo in repos] def request_headers(self): return {'Accept': 'application/vnd.github.v3+json'} diff --git a/swh/lister/github/tests/test_gh_lister.py b/swh/lister/github/tests/test_gh_lister.py index ea0ff4b..07fa007 100644 --- a/swh/lister/github/tests/test_gh_lister.py +++ b/swh/lister/github/tests/test_gh_lister.py @@ -1,47 +1,47 @@ # Copyright (C) 2017-2019 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import re import unittest from datetime import datetime, timedelta from swh.lister.core.tests.test_lister import HttpListerTester from swh.lister.github.lister import GitHubLister class GitHubListerTester(HttpListerTester, unittest.TestCase): Lister = GitHubLister test_re = re.compile(r'/repositories\?since=([^?&]+)') lister_subdir = 'github' good_api_response_file = 'api_response.json' bad_api_response_file = 'api_empty_response.json' - first_index = 26 + first_index = 0 last_index = 368 entries_per_page = 100 convert_type = int def response_headers(self, request): headers = {'X-RateLimit-Remaining': '1'} if self.request_index(request) == self.first_index: headers.update({ 'Link': ';' ' rel="next",' ';' ' rel="first"' }) else: headers.update({ 'Link': ';' ' rel="first"' }) return headers def mock_rate_quota(self, n, request, context): self.rate_limit += 1 context.status_code = 403 context.headers['X-RateLimit-Remaining'] = '0' one_second = int((datetime.now() + timedelta(seconds=1.5)).timestamp()) context.headers['X-RateLimit-Reset'] = str(one_second) return '{"error":"dummy"}' diff --git a/swh/lister/phabricator/tests/test_lister.py b/swh/lister/phabricator/tests/test_lister.py index 8568041..ca752a9 100644 --- a/swh/lister/phabricator/tests/test_lister.py +++ b/swh/lister/phabricator/tests/test_lister.py @@ -1,139 +1,128 @@ # Copyright (C) 2019 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import re import json import unittest import requests_mock from swh.lister.core.tests.test_lister import HttpListerTester from swh.lister.phabricator.lister import PhabricatorLister from swh.lister.phabricator.lister import get_repo_url class PhabricatorListerTester(HttpListerTester, unittest.TestCase): Lister = PhabricatorLister # first request will have the after parameter empty test_re = re.compile(r'\&after=([^?&]*)') lister_subdir = 'phabricator' good_api_response_file = 'api_first_response.json' good_api_response_undefined_protocol = 'api_response_undefined_'\ 'protocol.json' bad_api_response_file = 'api_empty_response.json' # first_index must be retrieved through a bootstrap process for Phabricator first_index = None last_index = 12 entries_per_page = 10 convert_type = int def request_index(self, request): """(Override) This is needed to emulate the listing bootstrap when no min_bound is provided to run """ m = self.test_re.search(request.path_url) idx = m.group(1) if idx not in ('', 'None'): return int(idx) def get_fl(self, override_config=None): """(Override) Retrieve an instance of fake lister (fl). """ if override_config or self.fl is None: credentials = {'phabricator': {'fake': [ {'password': 'toto'} ]}} override_config = dict(credentials=credentials, **(override_config or {})) self.fl = self.Lister(url='https://fakeurl', instance='fake', override_config=override_config) self.fl.INITIAL_BACKOFF = 1 self.fl.reset_backoff() return self.fl def test_get_repo_url(self): f = open('swh/lister/%s/tests/%s' % (self.lister_subdir, self.good_api_response_file)) api_response = json.load(f) repos = api_response['result']['data'] for repo in repos: self.assertEqual( 'https://forge.softwareheritage.org/source/%s.git' % (repo['fields']['shortName']), get_repo_url(repo['attachments']['uris']['uris'])) f = open('swh/lister/%s/tests/%s' % (self.lister_subdir, self.good_api_response_undefined_protocol)) repo = json.load(f) self.assertEqual( 'https://svn.blender.org/svnroot/bf-blender/', get_repo_url(repo['attachments']['uris']['uris'])) - @requests_mock.Mocker() - def test_full_listing(self, http_mocker): - fl = self.create_fl_with_db(http_mocker) - - fl.run() - - self.assertEqual(fl.db_last_index(), self.last_index) - ingested_repos = list(fl.db_query_range(self.first_index, - self.last_index)) - self.assertEqual(len(ingested_repos), self.entries_per_page) - @requests_mock.Mocker() def test_scheduled_tasks(self, http_mocker): fl = self.create_fl_with_db(http_mocker) # process first page of repositories listing fl.run() # process second page of repositories listing prev_last_index = self.last_index self.first_index = self.last_index self.last_index = 23 self.good_api_response_file = 'api_next_response.json' fl.run(min_bound=prev_last_index) # check expected number of ingested repos and loading tasks ingested_repos = list(fl.db_query_range(0, self.last_index)) self.assertEqual(len(ingested_repos), len(self.scheduler_tasks)) self.assertEqual(len(ingested_repos), 2 * self.entries_per_page) # check tasks are not disabled for task in self.scheduler_tasks: self.assertTrue(task['status'] != 'disabled') @requests_mock.Mocker() def test_scheduled_tasks_multiple_instances(self, http_mocker): fl = self.create_fl_with_db(http_mocker) # list first Phabricator instance fl.run() fl.instance = 'other_fake' fl.config['credentials'] = { 'phabricator': { 'other_fake': [{ 'password': 'foo' }] } } # list second Phabricator instance hosting repositories having # same ids as those listed from the first instance self.good_api_response_file = 'api_first_response_other_instance.json' self.last_index = 13 fl.run() # check expected number of loading tasks self.assertEqual(len(self.scheduler_tasks), 2 * self.entries_per_page) # check tasks are not disabled for task in self.scheduler_tasks: self.assertTrue(task['status'] != 'disabled')