diff --git a/swh/lister/core/lister_base.py b/swh/lister/core/lister_base.py
--- a/swh/lister/core/lister_base.py
+++ b/swh/lister/core/lister_base.py
@@ -150,6 +150,25 @@
         """
         return models_list
 
+    def do_additional_checks(self, models_list):
+        """Execute some additional checks on the model list. For example, to
+        check for existing repositories in the db.
+
+        MAY BE OVERRIDDEN if an intermediate Lister class needs to
+        run further checks on the results before injection.
+
+        No check is done by default: models_list is returned as is.
+
+        Args:
+            models_list: list of dicts returned by
+                         transport_response_simplified.
+
+        Returns:
+            models_list with entries if checks ok, False otherwise
+
+        """
+        return models_list
+
     def is_within_bounds(self, inner, lower=None, upper=None):
         """See if a sortable value is inside the range [lower,upper].
 
@@ -453,18 +472,23 @@
                 [self.task_dict(m['origin_type'], m['origin_url'])]
             )[0]['id']
 
-    def ingest_data(self, identifier):
+    def ingest_data(self, identifier, checks=False):
         """The core data fetch sequence. Request server endpoint. Simplify and
         filter response list of repositories. Inject repo information into
         local db. Queue loader tasks for linked repositories.
 
         Args:
             identifier: Resource identifier.
+            checks (bool): run do_additional_checks before injecting
         """
         # Request (partial?) list of repositories info
         response = self.safely_issue_request(identifier)
         models_list = self.transport_response_simplified(response)
         models_list = self.filter_before_inject(models_list)
+        if checks:
+            models_list = self.do_additional_checks(models_list)
+            if not models_list:
+                return response, []
         # inject into local db
         injected = self.inject_repo_data_into_db(models_list)
         # queue workers
diff --git a/swh/lister/core/paging_lister.py b/swh/lister/core/paging_lister.py
--- a/swh/lister/core/paging_lister.py
+++ b/swh/lister/core/paging_lister.py
@@ -79,14 +79,18 @@
 
     # You probably don't need to override anything below this line.
 
-    def check_existence(self, injected_repos):
-        """Given a list of injected repos, check if we already have them.
+    def do_additional_checks(self, models_list):
+        """Potentially check for existence of repositories in models_list.
 
-        Attribute 'instance' variable is assumed to be populated.
+        Called only when run() is given check_existence=True. Returns False
+        as soon as db_query_equal matches a uid, models_list otherwise.
 
         """
-        # FIXME: Implement the check
-        return False
+        for m in models_list:
+            sql_repo = self.db_query_equal('uid', m['uid'])
+            if sql_repo:
+                return False
+        return models_list
 
     def run(self, min_bound=None, max_bound=None, check_existence=False):
         """Main entry function. Sequentially fetches repository data from the
@@ -111,16 +115,17 @@
 
         self.min_page = min_bound
         self.max_page = max_bound
 
-        already_seen = False
         while self.is_within_bounds(page, self.min_page, self.max_page):
             logging.info('listing repos starting at %s' % page)
 
-            response, injected_repos = self.ingest_data(page)
-            next_page = self.get_next_target_from_response(response)
+            response, injected_repos = self.ingest_data(page,
+                                                        checks=check_existence)
+            if check_existence and not injected_repos:
+                logging.info('Repositories already seen, stopping')
+                break
 
-            if check_existence:
-                already_seen = self.check_existence(injected_repos)
+            next_page = self.get_next_target_from_response(response)
 
             # termination condition
 
@@ -128,9 +133,6 @@
                 logging.info('stopping after page %s, no next link found' %
                              page)
                 break
-            elif already_seen:
-                logging.info('Repositories already seen, stopping')
-                break
             else:
                 page = next_page
 