diff --git a/swh/lister/bitbucket/lister.py b/swh/lister/bitbucket/lister.py index a5cf91b..5877c8d 100644 --- a/swh/lister/bitbucket/lister.py +++ b/swh/lister/bitbucket/lister.py @@ -1,85 +1,83 @@ # Copyright (C) 2017-2019 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import logging import iso8601 from datetime import datetime - from urllib import parse from swh.lister.bitbucket.models import BitBucketModel from swh.lister.core.indexing_lister import IndexingHttpLister logger = logging.getLogger(__name__) - DEFAULT_BITBUCKET_PAGE = 10 class BitBucketLister(IndexingHttpLister): PATH_TEMPLATE = '/repositories?after=%s' MODEL = BitBucketModel LISTER_NAME = 'bitbucket' instance = 'bitbucket' - default_min_bound = datetime.utcfromtimestamp(0).isoformat() + default_min_bound = datetime.utcfromtimestamp(0) def __init__(self, api_baseurl, override_config=None, per_page=100): super().__init__( api_baseurl=api_baseurl, override_config=override_config) if per_page != DEFAULT_BITBUCKET_PAGE: self.PATH_TEMPLATE = '%s&pagelen=%s' % ( self.PATH_TEMPLATE, per_page) # to stay consistent with prior behavior (20 * 10 repositories then) self.flush_packet_db = int( (self.flush_packet_db * DEFAULT_BITBUCKET_PAGE) / per_page) def get_model_from_repo(self, repo): return { 'uid': repo['uuid'], - 'indexable': repo['created_on'], + 'indexable': iso8601.parse_date(repo['created_on']), 'name': repo['name'], 'full_name': repo['full_name'], 'html_url': repo['links']['html']['href'], 'origin_url': repo['links']['clone'][0]['href'], 'origin_type': repo['scm'], } def get_next_target_from_response(self, response): + """This will read the 'next' link from the api response if any + and return it as a datetime. 
+ + Args: + response (Response): requests' response from api call + + Returns: + next date as a datetime + + """ body = response.json() - if 'next' in body: - return parse.unquote(body['next'].split('after=')[1]) + next_ = body.get('next') + if next_ is not None: + next_ = parse.urlparse(next_) + return iso8601.parse_date(parse.parse_qs(next_.query)['after'][0]) def transport_response_simplified(self, response): repos = response.json()['values'] return [self.get_model_from_repo(repo) for repo in repos] def request_uri(self, identifier): + identifier = parse.quote(identifier.isoformat()) return super().request_uri(identifier or '1970-01-01') def is_within_bounds(self, inner, lower=None, upper=None): - # values are expected to be str dates - try: - inner = iso8601.parse_date(inner) - if lower: - lower = iso8601.parse_date(lower) - if upper: - upper = iso8601.parse_date(upper) - if lower is None and upper is None: - return True - elif lower is None: - ret = inner <= upper - elif upper is None: - ret = inner >= lower - else: - ret = lower <= inner <= upper - except Exception as e: - logger.error(str(e) + ': %s, %s, %s', - ('inner=%s%s' % (type(inner), inner)), - ('lower=%s%s' % (type(lower), lower)), - ('upper=%s%s' % (type(upper), upper))) - raise - + # values are expected to be datetimes + if lower is None and upper is None: + ret = True + elif lower is None: + ret = inner <= upper + elif upper is None: + ret = inner >= lower + else: + ret = lower <= inner <= upper return ret diff --git a/swh/lister/bitbucket/models.py b/swh/lister/bitbucket/models.py index 053ae0b..d299b5b 100644 --- a/swh/lister/bitbucket/models.py +++ b/swh/lister/bitbucket/models.py @@ -1,15 +1,15 @@ # Copyright (C) 2017-2019 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from sqlalchemy import Column, String +from sqlalchemy import Column, String, DateTime from swh.lister.core.models import 
IndexingModelBase class BitBucketModel(IndexingModelBase): """a BitBucket repository""" __tablename__ = 'bitbucket_repo' uid = Column(String, primary_key=True) - indexable = Column(String, index=True) + indexable = Column(DateTime(timezone=True), index=True) diff --git a/swh/lister/bitbucket/tests/test_bb_lister.py b/swh/lister/bitbucket/tests/test_bb_lister.py index 3db69dc..e13df23 100644 --- a/swh/lister/bitbucket/tests/test_bb_lister.py +++ b/swh/lister/bitbucket/tests/test_bb_lister.py @@ -1,29 +1,63 @@ -# Copyright (C) 2017-2018 the Software Heritage developers +# Copyright (C) 2017-2019 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import re import unittest +from datetime import timedelta + +from urllib.parse import unquote + +import iso8601 +import requests_mock + from swh.lister.bitbucket.lister import BitBucketLister from swh.lister.core.tests.test_lister import HttpListerTester +def convert_type(req_index): + """Convert the req_index to its right type according to the model's + "indexable" column. 
+ + """ + return iso8601.parse_date(unquote(req_index)) + + class BitBucketListerTester(HttpListerTester, unittest.TestCase): Lister = BitBucketLister test_re = re.compile(r'/repositories\?after=([^?&]+)') lister_subdir = 'bitbucket' good_api_response_file = 'api_response.json' bad_api_response_file = 'api_empty_response.json' - first_index = '2008-07-12T07:44:01.476818+00:00' - last_index = '2008-07-19T06:16:43.044743+00:00' + first_index = convert_type('2008-07-12T07:44:01.476818+00:00') + last_index = convert_type('2008-07-19T06:16:43.044743+00:00') entries_per_page = 10 + convert_type = staticmethod(convert_type) + + @requests_mock.Mocker() + def test_fetch_none_nodb(self, http_mocker): + """Overridden because index is not an integer nor a string + + """ + http_mocker.get(self.test_re, text=self.mock_response) + fl = self.get_fl() + + self.disable_scheduler(fl) + self.disable_db(fl) + + # stores no results + fl.run(min_bound=self.first_index - timedelta(days=3), + max_bound=self.first_index) def test_is_within_bounds(self): fl = self.get_fl() self.assertTrue(fl.is_within_bounds( - '2008-07-15', self.first_index, self.last_index)) + iso8601.parse_date('2008-07-15'), + self.first_index, self.last_index)) self.assertFalse(fl.is_within_bounds( - '2008-07-20', self.first_index, self.last_index)) + iso8601.parse_date('2008-07-20'), + self.first_index, self.last_index)) self.assertFalse(fl.is_within_bounds( - '2008-07-11', self.first_index, self.last_index)) + iso8601.parse_date('2008-07-11'), + self.first_index, self.last_index)) diff --git a/swh/lister/core/tests/test_lister.py b/swh/lister/core/tests/test_lister.py index 03588d9..5b4d666 100644 --- a/swh/lister/core/tests/test_lister.py +++ b/swh/lister/core/tests/test_lister.py @@ -1,233 +1,239 @@ # Copyright (C) 2017-2018 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import abc import time from unittest 
import TestCase from unittest.mock import Mock, patch import requests_mock from sqlalchemy import create_engine from testing.postgresql import Postgresql from swh.lister.core.abstractattribute import AbstractAttribute def noop(*args, **kwargs): pass @requests_mock.Mocker() class HttpListerTesterBase(abc.ABC): """Base testing class for subclasses of swh.lister.core.indexing_lister.IndexingHttpLister. swh.lister.core.page_by_page_lister.PageByPageHttpLister See swh.lister.github.tests.test_gh_lister for an example of how to customize for a specific listing service. """ Lister = AbstractAttribute('The lister class to test') test_re = AbstractAttribute('Compiled regex matching the server url. Must' ' capture the index value.') lister_subdir = AbstractAttribute('bitbucket, github, etc.') good_api_response_file = AbstractAttribute('Example good response body') bad_api_response_file = AbstractAttribute('Example bad response body') first_index = AbstractAttribute('First index in good_api_response') entries_per_page = AbstractAttribute('Number of results in good response') LISTER_NAME = 'fake-lister' + convert_type = str + """static method used to convert the "request_index" to its right type (for + indexing listers for example, this is in accordance with the model's + "indexable" column). + + """ # May need to override this if the headers are used for something def response_headers(self, request): return {} # May need to override this if the server uses non-standard rate limiting # method. # Please keep the requested retry delay reasonably low. 
def mock_rate_quota(self, n, request, context): self.rate_limit += 1 context.status_code = 429 context.headers['Retry-After'] = '1' return '{"error":"dummy"}' def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.rate_limit = 1 self.response = None self.fl = None self.helper = None if self.__class__ != HttpListerTesterBase: self.run = TestCase.run.__get__(self, self.__class__) else: self.run = noop def request_index(self, request): m = self.test_re.search(request.path_url) if m and (len(m.groups()) > 0): - return m.group(1) + return self.convert_type(m.group(1)) def mock_response(self, request, context): self.fl.reset_backoff() self.rate_limit = 1 context.status_code = 200 custom_headers = self.response_headers(request) context.headers.update(custom_headers) req_index = self.request_index(request) - if req_index == str(self.first_index): + if req_index == self.first_index: response_file = self.good_api_response_file else: response_file = self.bad_api_response_file with open('swh/lister/%s/tests/%s' % (self.lister_subdir, response_file), 'r', encoding='utf-8') as r: return r.read() def mock_limit_n_response(self, n, request, context): self.fl.reset_backoff() if self.rate_limit <= n: return self.mock_rate_quota(n, request, context) else: return self.mock_response(request, context) def mock_limit_once_response(self, request, context): return self.mock_limit_n_response(1, request, context) def mock_limit_twice_response(self, request, context): return self.mock_limit_n_response(2, request, context) def get_fl(self, override_config=None): """Retrieve an instance of fake lister (fl). 
""" if override_config or self.fl is None: self.fl = self.Lister(api_baseurl='https://fakeurl', override_config=override_config) self.fl.INITIAL_BACKOFF = 1 self.fl.reset_backoff() return self.fl def get_api_response(self): fl = self.get_fl() if self.response is None: self.response = fl.safely_issue_request(self.first_index) return self.response def test_is_within_bounds(self, http_mocker): fl = self.get_fl() self.assertFalse(fl.is_within_bounds(1, 2, 3)) self.assertTrue(fl.is_within_bounds(2, 1, 3)) self.assertTrue(fl.is_within_bounds(1, 1, 1)) self.assertTrue(fl.is_within_bounds(1, None, None)) self.assertTrue(fl.is_within_bounds(1, None, 2)) self.assertTrue(fl.is_within_bounds(1, 0, None)) self.assertTrue(fl.is_within_bounds("b", "a", "c")) self.assertFalse(fl.is_within_bounds("a", "b", "c")) self.assertTrue(fl.is_within_bounds("a", None, "c")) self.assertTrue(fl.is_within_bounds("a", None, None)) self.assertTrue(fl.is_within_bounds("b", "a", None)) self.assertFalse(fl.is_within_bounds("a", "b", None)) self.assertTrue(fl.is_within_bounds("aa:02", "aa:01", "aa:03")) self.assertFalse(fl.is_within_bounds("aa:12", None, "aa:03")) with self.assertRaises(TypeError): fl.is_within_bounds(1.0, "b", None) with self.assertRaises(TypeError): fl.is_within_bounds("A:B", "A::B", None) def test_api_request(self, http_mocker): http_mocker.get(self.test_re, text=self.mock_limit_twice_response) with patch.object(time, 'sleep', wraps=time.sleep) as sleepmock: self.get_api_response() self.assertEqual(sleepmock.call_count, 2) def test_repos_list(self, http_mocker): http_mocker.get(self.test_re, text=self.mock_response) li = self.get_fl().transport_response_simplified( self.get_api_response() ) self.assertIsInstance(li, list) self.assertEqual(len(li), self.entries_per_page) def test_model_map(self, http_mocker): http_mocker.get(self.test_re, text=self.mock_response) fl = self.get_fl() li = fl.transport_response_simplified(self.get_api_response()) di = li[0] self.assertIsInstance(di, 
dict) pubs = [k for k in vars(fl.MODEL).keys() if not k.startswith('_')] for k in pubs: if k not in ['last_seen', 'task_id', 'id']: self.assertIn(k, di) def disable_scheduler(self, fl): fl.schedule_missing_tasks = Mock(return_value=None) def disable_db(self, fl): fl.winnow_models = Mock(return_value=[]) fl.db_inject_repo = Mock(return_value=fl.MODEL()) fl.disable_deleted_repo_tasks = Mock(return_value=None) def test_fetch_none_nodb(self, http_mocker): http_mocker.get(self.test_re, text=self.mock_response) fl = self.get_fl() self.disable_scheduler(fl) self.disable_db(fl) fl.run(min_bound=1, max_bound=1) # stores no results def test_fetch_one_nodb(self, http_mocker): http_mocker.get(self.test_re, text=self.mock_response) fl = self.get_fl() self.disable_scheduler(fl) self.disable_db(fl) fl.run(min_bound=self.first_index, max_bound=self.first_index) def test_fetch_multiple_pages_nodb(self, http_mocker): http_mocker.get(self.test_re, text=self.mock_response) fl = self.get_fl() self.disable_scheduler(fl) self.disable_db(fl) fl.run(min_bound=self.first_index) def init_db(self, db, model): engine = create_engine(db.url()) model.metadata.create_all(engine) class HttpListerTester(HttpListerTesterBase, abc.ABC): last_index = AbstractAttribute('Last index in good_api_response') @requests_mock.Mocker() def test_fetch_multiple_pages_yesdb(self, http_mocker): http_mocker.get(self.test_re, text=self.mock_response) initdb_args = Postgresql.DEFAULT_SETTINGS['initdb_args'] initdb_args = ' '.join([initdb_args, '-E UTF-8']) db = Postgresql(initdb_args=initdb_args) fl = self.get_fl(override_config={ 'lister': { 'cls': 'local', 'args': {'db': db.url()} } }) self.init_db(db, fl.MODEL) self.disable_scheduler(fl) fl.run(min_bound=self.first_index) self.assertEqual(fl.db_last_index(), self.last_index) partitions = fl.db_partition_indices(5) self.assertGreater(len(partitions), 0) for k in partitions: self.assertLessEqual(len(k), 5) self.assertGreater(len(k), 0) diff --git 
a/swh/lister/github/tests/test_gh_lister.py b/swh/lister/github/tests/test_gh_lister.py index 768a247..ea0ff4b 100644 --- a/swh/lister/github/tests/test_gh_lister.py +++ b/swh/lister/github/tests/test_gh_lister.py @@ -1,46 +1,47 @@ -# Copyright (C) 2017-2018 the Software Heritage developers +# Copyright (C) 2017-2019 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import re import unittest from datetime import datetime, timedelta from swh.lister.core.tests.test_lister import HttpListerTester from swh.lister.github.lister import GitHubLister class GitHubListerTester(HttpListerTester, unittest.TestCase): Lister = GitHubLister test_re = re.compile(r'/repositories\?since=([^?&]+)') lister_subdir = 'github' good_api_response_file = 'api_response.json' bad_api_response_file = 'api_empty_response.json' first_index = 26 last_index = 368 entries_per_page = 100 + convert_type = int def response_headers(self, request): headers = {'X-RateLimit-Remaining': '1'} - if self.request_index(request) == str(self.first_index): + if self.request_index(request) == self.first_index: headers.update({ 'Link': ';' ' rel="next",' ';' ' rel="first"' }) else: headers.update({ 'Link': ';' ' rel="first"' }) return headers def mock_rate_quota(self, n, request, context): self.rate_limit += 1 context.status_code = 403 context.headers['X-RateLimit-Remaining'] = '0' one_second = int((datetime.now() + timedelta(seconds=1.5)).timestamp()) context.headers['X-RateLimit-Reset'] = str(one_second) return '{"error":"dummy"}' diff --git a/swh/lister/gitlab/tests/test_gitlab_lister.py b/swh/lister/gitlab/tests/test_gitlab_lister.py index cbe0f4d..9e43816 100644 --- a/swh/lister/gitlab/tests/test_gitlab_lister.py +++ b/swh/lister/gitlab/tests/test_gitlab_lister.py @@ -1,37 +1,38 @@ -# Copyright (C) 2017-2018 the Software Heritage developers +# Copyright (C) 2017-2019 the Software Heritage developers # 
License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import re import unittest from datetime import datetime, timedelta from swh.lister.core.tests.test_lister import HttpListerTesterBase from swh.lister.gitlab.lister import GitLabLister class GitLabListerTester(HttpListerTesterBase, unittest.TestCase): Lister = GitLabLister test_re = re.compile(r'^.*/projects.*page=(\d+).*') lister_subdir = 'gitlab' good_api_response_file = 'api_response.json' bad_api_response_file = 'api_empty_response.json' first_index = 1 entries_per_page = 10 + convert_type = int def response_headers(self, request): headers = {'RateLimit-Remaining': '1'} - if self.request_index(request) == str(self.first_index): + if self.request_index(request) == self.first_index: headers.update({ 'x-next-page': '3', }) return headers def mock_rate_quota(self, n, request, context): self.rate_limit += 1 context.status_code = 403 context.headers['RateLimit-Remaining'] = '0' one_second = int((datetime.now() + timedelta(seconds=1.5)).timestamp()) context.headers['RateLimit-Reset'] = str(one_second) return '{"error":"dummy"}' diff --git a/swh/lister/npm/tests/test_npm_lister.py b/swh/lister/npm/tests/test_npm_lister.py index 73aa3f4..01fe595 100644 --- a/swh/lister/npm/tests/test_npm_lister.py +++ b/swh/lister/npm/tests/test_npm_lister.py @@ -1,44 +1,44 @@ -# Copyright (C) 2018 the Software Heritage developers +# Copyright (C) 2018-2019 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import re import requests_mock import unittest from swh.lister.core.tests.test_lister import HttpListerTesterBase from swh.lister.npm.lister import NpmLister, NpmIncrementalLister class NpmListerTester(HttpListerTesterBase, unittest.TestCase): Lister = NpmLister test_re = re.compile(r'^.*/_all_docs\?startkey=%22(.+)%22.*') lister_subdir = 'npm' good_api_response_file = 
'api_response.json' bad_api_response_file = 'api_empty_response.json' first_index = 'jquery' entries_per_page = 100 @requests_mock.Mocker() def test_is_within_bounds(self, http_mocker): # disable this test from HttpListerTesterBase as # it can not succeed for the npm lister due to the # overriding of the string_pattern_check method pass class NpmIncrementalListerTester(HttpListerTesterBase, unittest.TestCase): Lister = NpmIncrementalLister test_re = re.compile(r'^.*/_changes\?since=([0-9]+).*') lister_subdir = 'npm' good_api_response_file = 'api_inc_response.json' bad_api_response_file = 'api_inc_empty_response.json' - first_index = 6920642 + first_index = '6920642' entries_per_page = 100 @requests_mock.Mocker() def test_is_within_bounds(self, http_mocker): # disable this test from HttpListerTesterBase as # it can not succeed for the npm lister due to the # overriding of the string_pattern_check method pass diff --git a/swh/lister/phabricator/tests/test_lister.py b/swh/lister/phabricator/tests/test_lister.py index 6f565ab..e4d6cce 100644 --- a/swh/lister/phabricator/tests/test_lister.py +++ b/swh/lister/phabricator/tests/test_lister.py @@ -1,55 +1,56 @@ # Copyright (C) 2019 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import re import json import unittest from swh.lister.core.tests.test_lister import HttpListerTester from swh.lister.phabricator.lister import PhabricatorLister from swh.lister.phabricator.lister import get_repo_url class PhabricatorListerTester(HttpListerTester, unittest.TestCase): Lister = PhabricatorLister test_re = re.compile(r'\&after=([^?&]+)') lister_subdir = 'phabricator' good_api_response_file = 'api_response.json' good_api_response_undefined_protocol = 'api_response_undefined_'\ 'protocol.json' bad_api_response_file = 'api_empty_response.json' first_index = 1 last_index = 12 entries_per_page = 10 + convert_type = int def get_fl(self, 
override_config=None): """(Override) Retrieve an instance of fake lister (fl). """ if override_config or self.fl is None: self.fl = self.Lister(forge_url='https://fakeurl', instance='fake', api_token='a-1', override_config=override_config) self.fl.INITIAL_BACKOFF = 1 self.fl.reset_backoff() return self.fl def test_get_repo_url(self): f = open('swh/lister/%s/tests/%s' % (self.lister_subdir, self.good_api_response_file)) api_response = json.load(f) repos = api_response['result']['data'] for repo in repos: self.assertEqual( 'https://forge.softwareheritage.org/source/%s.git' % (repo['fields']['shortName']), get_repo_url(repo['attachments']['uris']['uris'])) f = open('swh/lister/%s/tests/%s' % (self.lister_subdir, self.good_api_response_undefined_protocol)) repo = json.load(f) self.assertEqual( 'https://svn.blender.org/svnroot/bf-blender/', get_repo_url(repo['attachments']['uris']['uris']))