diff --git a/PKG-INFO b/PKG-INFO index d321e06..075dc56 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,237 +1,237 @@ Metadata-Version: 2.1 Name: swh.lister -Version: 0.0.25 +Version: 0.0.26 Summary: Software Heritage lister Home-page: https://forge.softwareheritage.org/diffusion/DLSGH/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-lister Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Description: swh-lister ========== This component from the Software Heritage stack aims to produce listings of software origins and their urls hosted on various public developer platforms or package managers. As these operations are quite similar, it provides a set of Python modules abstracting common software origins listing behaviors. It also provides several lister implementations, contained in the following Python modules: - `swh.lister.bitbucket` - `swh.lister.debian` - `swh.lister.github` - `swh.lister.gitlab` - `swh.lister.gnu` - `swh.lister.pypi` - `swh.lister.npm` - `swh.lister.phabricator` - `swh.lister.cran` Dependencies ------------ All required dependencies can be found in the `requirements*.txt` files located at the root of the repository. Local deployment ---------------- ## lister configuration Each lister implemented so far by Software Heritage (`github`, `gitlab`, `debian`, `pypi`, `npm`) must be configured by following the instructions below (please note that you have to replace `` by one of the lister name introduced above). ### Preparation steps 1. `mkdir ~/.config/swh/ ~/.cache/swh/lister//` 2. create configuration file `~/.config/swh/lister_.yml` 3. Bootstrap the db instance schema ```lang=bash $ createdb lister- $ python3 -m swh.lister.cli --db-url postgres:///lister- ``` Note: This bootstraps a minimum data set needed for the lister to run. ### Configuration file sample Minimalistic configuration shared by all listers to add in file `~/.config/swh/lister_.yml`: ```lang=yml storage: cls: 'remote' args: url: 'http://localhost:5002/' scheduler: cls: 'remote' args: url: 'http://localhost:5008/' lister: cls: 'local' args: # see http://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls db: 'postgresql:///lister-' credentials: [] cache_responses: True cache_dir: /home/user/.cache/swh/lister// ``` Note: This expects storage (5002) and scheduler (5008) services to run locally ## lister-github Once configured, you can execute a GitHub lister using the following instructions in a `python3` script: ```lang=python import logging from swh.lister.github.tasks import range_github_lister logging.basicConfig(level=logging.DEBUG) range_github_lister(364, 365) ... ``` ## lister-gitlab Once configured, you can execute a GitLab lister using the instructions detailed in the `python3` scripts below: ```lang=python import logging from swh.lister.gitlab.tasks import range_gitlab_lister logging.basicConfig(level=logging.DEBUG) range_gitlab_lister(1, 2, { 'instance': 'debian', 'api_baseurl': 'https://salsa.debian.org/api/v4', 'sort': 'asc', 'per_page': 20 }) ``` ```lang=python import logging from swh.lister.gitlab.tasks import full_gitlab_relister logging.basicConfig(level=logging.DEBUG) full_gitlab_relister({ 'instance': '0xacab', 'api_baseurl': 'https://0xacab.org/api/v4', 'sort': 'asc', 'per_page': 20 }) ``` ```lang=python import logging from swh.lister.gitlab.tasks import incremental_gitlab_lister logging.basicConfig(level=logging.DEBUG) incremental_gitlab_lister({ 'instance': 'freedesktop.org', 'api_baseurl': 'https://gitlab.freedesktop.org/api/v4', 'sort': 'asc', 'per_page': 20 }) ``` ## lister-debian Once configured, you can execute a Debian lister using the following instructions in a `python3` script: ```lang=python import logging from swh.lister.debian.tasks import debian_lister logging.basicConfig(level=logging.DEBUG) debian_lister('Debian') ``` ## lister-pypi Once configured, you can execute a PyPI lister using the following instructions in a `python3` script: ```lang=python import logging from swh.lister.pypi.tasks import pypi_lister logging.basicConfig(level=logging.DEBUG) pypi_lister() ``` ## lister-npm Once configured, you can execute a npm lister using the following instructions in a `python3` REPL: ```lang=python import logging from swh.lister.npm.tasks import npm_lister logging.basicConfig(level=logging.DEBUG) npm_lister() ``` ## lister-phabricator Once configured, you can execute a Phabricator lister using the following instructions in a `python3` script: ```lang=python import logging from swh.lister.phabricator.tasks import incremental_phabricator_lister logging.basicConfig(level=logging.DEBUG) incremental_phabricator_lister(forge_url='https://forge.softwareheritage.org', api_token='XXXX') ``` ## lister-gnu Once configured, you can execute a PyPI lister using the following instructions in a `python3` script: ```lang=python import logging from swh.lister.gnu.tasks import gnu_lister logging.basicConfig(level=logging.DEBUG) gnu_lister() ## lister-cran Once configured, you can execute a RCRAN lister using the following instructions in a `python3` script: ```lang=python import logging from swh.lister.cran.tasks import cran_lister logging.basicConfig(level=logging.DEBUG) cran_lister() ``` Licensing --------- This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. See top-level LICENSE file for the full text of the GNU General Public License along with this program. Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Description-Content-Type: text/markdown Provides-Extra: testing diff --git a/swh.lister.egg-info/PKG-INFO b/swh.lister.egg-info/PKG-INFO index d321e06..075dc56 100644 --- a/swh.lister.egg-info/PKG-INFO +++ b/swh.lister.egg-info/PKG-INFO @@ -1,237 +1,237 @@ Metadata-Version: 2.1 Name: swh.lister -Version: 0.0.25 +Version: 0.0.26 Summary: Software Heritage lister Home-page: https://forge.softwareheritage.org/diffusion/DLSGH/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-lister Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Description: swh-lister ========== This component from the Software Heritage stack aims to produce listings of software origins and their urls hosted on various public developer platforms or package managers. As these operations are quite similar, it provides a set of Python modules abstracting common software origins listing behaviors. It also provides several lister implementations, contained in the following Python modules: - `swh.lister.bitbucket` - `swh.lister.debian` - `swh.lister.github` - `swh.lister.gitlab` - `swh.lister.gnu` - `swh.lister.pypi` - `swh.lister.npm` - `swh.lister.phabricator` - `swh.lister.cran` Dependencies ------------ All required dependencies can be found in the `requirements*.txt` files located at the root of the repository. Local deployment ---------------- ## lister configuration Each lister implemented so far by Software Heritage (`github`, `gitlab`, `debian`, `pypi`, `npm`) must be configured by following the instructions below (please note that you have to replace `` by one of the lister name introduced above). ### Preparation steps 1. `mkdir ~/.config/swh/ ~/.cache/swh/lister//` 2. create configuration file `~/.config/swh/lister_.yml` 3. Bootstrap the db instance schema ```lang=bash $ createdb lister- $ python3 -m swh.lister.cli --db-url postgres:///lister- ``` Note: This bootstraps a minimum data set needed for the lister to run. ### Configuration file sample Minimalistic configuration shared by all listers to add in file `~/.config/swh/lister_.yml`: ```lang=yml storage: cls: 'remote' args: url: 'http://localhost:5002/' scheduler: cls: 'remote' args: url: 'http://localhost:5008/' lister: cls: 'local' args: # see http://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls db: 'postgresql:///lister-' credentials: [] cache_responses: True cache_dir: /home/user/.cache/swh/lister// ``` Note: This expects storage (5002) and scheduler (5008) services to run locally ## lister-github Once configured, you can execute a GitHub lister using the following instructions in a `python3` script: ```lang=python import logging from swh.lister.github.tasks import range_github_lister logging.basicConfig(level=logging.DEBUG) range_github_lister(364, 365) ... ``` ## lister-gitlab Once configured, you can execute a GitLab lister using the instructions detailed in the `python3` scripts below: ```lang=python import logging from swh.lister.gitlab.tasks import range_gitlab_lister logging.basicConfig(level=logging.DEBUG) range_gitlab_lister(1, 2, { 'instance': 'debian', 'api_baseurl': 'https://salsa.debian.org/api/v4', 'sort': 'asc', 'per_page': 20 }) ``` ```lang=python import logging from swh.lister.gitlab.tasks import full_gitlab_relister logging.basicConfig(level=logging.DEBUG) full_gitlab_relister({ 'instance': '0xacab', 'api_baseurl': 'https://0xacab.org/api/v4', 'sort': 'asc', 'per_page': 20 }) ``` ```lang=python import logging from swh.lister.gitlab.tasks import incremental_gitlab_lister logging.basicConfig(level=logging.DEBUG) incremental_gitlab_lister({ 'instance': 'freedesktop.org', 'api_baseurl': 'https://gitlab.freedesktop.org/api/v4', 'sort': 'asc', 'per_page': 20 }) ``` ## lister-debian Once configured, you can execute a Debian lister using the following instructions in a `python3` script: ```lang=python import logging from swh.lister.debian.tasks import debian_lister logging.basicConfig(level=logging.DEBUG) debian_lister('Debian') ``` ## lister-pypi Once configured, you can execute a PyPI lister using the following instructions in a `python3` script: ```lang=python import logging from swh.lister.pypi.tasks import pypi_lister logging.basicConfig(level=logging.DEBUG) pypi_lister() ``` ## lister-npm Once configured, you can execute a npm lister using the following instructions in a `python3` REPL: ```lang=python import logging from swh.lister.npm.tasks import npm_lister logging.basicConfig(level=logging.DEBUG) npm_lister() ``` ## lister-phabricator Once configured, you can execute a Phabricator lister using the following instructions in a `python3` script: ```lang=python import logging from swh.lister.phabricator.tasks import incremental_phabricator_lister logging.basicConfig(level=logging.DEBUG) incremental_phabricator_lister(forge_url='https://forge.softwareheritage.org', api_token='XXXX') ``` ## lister-gnu Once configured, you can execute a PyPI lister using the following instructions in a `python3` script: ```lang=python import logging from swh.lister.gnu.tasks import gnu_lister logging.basicConfig(level=logging.DEBUG) gnu_lister() ## lister-cran Once configured, you can execute a RCRAN lister using the following instructions in a `python3` script: ```lang=python import logging from swh.lister.cran.tasks import cran_lister logging.basicConfig(level=logging.DEBUG) cran_lister() ``` Licensing --------- This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. See top-level LICENSE file for the full text of the GNU General Public License along with this program. Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Description-Content-Type: text/markdown Provides-Extra: testing diff --git a/swh/lister/_version.py b/swh/lister/_version.py index d706c27..0ed3d3b 100644 --- a/swh/lister/_version.py +++ b/swh/lister/_version.py @@ -1,5 +1,5 @@ # This file is automatically generated by setup.py. -__version__ = '0.0.25' -__sha__ = 'g72e208a' -__revision__ = 'g72e208a' +__version__ = '0.0.26' +__sha__ = 'g6d11705' +__revision__ = 'g6d11705' diff --git a/swh/lister/core/lister_transports.py b/swh/lister/core/lister_transports.py index 75bc6ad..a2202bd 100644 --- a/swh/lister/core/lister_transports.py +++ b/swh/lister/core/lister_transports.py @@ -1,207 +1,222 @@ # Copyright (C) 2017-2018 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import abc import random from datetime import datetime from email.utils import parsedate from pprint import pformat import logging import requests import xmltodict try: from swh.lister._version import __version__ except ImportError: __version__ = 'devel' from .abstractattribute import AbstractAttribute from .lister_base import FetchError logger = logging.getLogger(__name__) class SWHListerHttpTransport(abc.ABC): """Use the Requests library for making Lister endpoint requests. To be used in conjunction with SWHListerBase or a subclass of it. """ PATH_TEMPLATE = AbstractAttribute('string containing a python string' ' format pattern that produces the API' ' endpoint path for listing stored' ' repositories when given an index.' ' eg. "/repositories?after=%s".' 'To be implemented in the API-specific' ' class inheriting this.') EXPECTED_STATUS_CODES = (200, 429, 403, 404) def request_headers(self): """Returns dictionary of any request headers needed by the server. MAY BE OVERRIDDEN if request headers are needed. """ return { 'User-Agent': 'Software Heritage lister (%s)' % self.lister_version } + def request_instance_credentials(self): + """Returns dictionary of any credentials configuration needed by the + forge instance to list. + + Returns: + dict of credentials per instance lister or {} if none. + + """ + all_creds = self.config.get('credentials') + if not all_creds: + return {} + lister_creds = all_creds.get(self.LISTER_NAME, {}) + creds = lister_creds.get(self.instance, {}) + return creds + def request_uri(self, identifier): """Get the full request URI given the transport_request identifier. MAY BE OVERRIDDEN if something more complex than the PATH_TEMPLATE is required. """ path = self.PATH_TEMPLATE % identifier return self.api_baseurl + path def request_params(self, identifier): """Get the full parameters passed to requests given the transport_request identifier. This uses credentials if any are provided. The 'credentials' configuration is expected to be a dict of multiple levels. The first level is the lister's name, the second is the lister's instance name. For example: credentials: github: # github lister github: # has only one instance (so far) - username: some password: somekey - username: one password: onekey - ... gitlab: # gitlab lister riseup: # has many instances - username: someone password: ... - ... gitlab: - username: someone password: ... - ... MAY BE OVERRIDDEN if something more complex than the request headers is needed. """ params = {} params['headers'] = self.request_headers() or {} - all_creds = self.config['credentials'] - lister_creds = all_creds.get(self.LISTER_NAME, {}) - creds = lister_creds.get(self.instance, {}) + creds = self.request_instance_credentials() + if not creds: + return params auth = random.choice(creds) if creds else None if auth: params['auth'] = (auth['username'], auth['password']) return params def transport_quota_check(self, response): """Implements SWHListerBase.transport_quota_check with standard 429 code check for HTTP with Requests library. MAY BE OVERRIDDEN if the server notifies about rate limits in a non-standard way that doesn't use HTTP 429 and the Retry-After response header. ( https://tools.ietf.org/html/rfc6585#section-4 ) """ if response.status_code == 429: # HTTP too many requests retry_after = response.headers.get('Retry-After', self.back_off()) try: # might be seconds return True, float(retry_after) except Exception: # might be http-date at_date = datetime(*parsedate(retry_after)[:6]) from_now = (at_date - datetime.today()).total_seconds() + 5 return True, max(0, from_now) else: # response ok self.reset_backoff() return False, 0 def __init__(self, api_baseurl=None): if not api_baseurl: raise NameError('HTTP Lister Transport requires api_baseurl.') self.api_baseurl = api_baseurl # eg. 'https://api.github.com' self.session = requests.Session() self.lister_version = __version__ def _transport_action(self, identifier, method='get'): """Permit to ask information to the api prior to actually executing query. """ path = self.request_uri(identifier) params = self.request_params(identifier) try: if method == 'head': response = self.session.head(path, **params) else: response = self.session.get(path, **params) except requests.exceptions.ConnectionError as e: logger.warning('Failed to fetch %s: %s', path, e) raise FetchError(e) else: if response.status_code not in self.EXPECTED_STATUS_CODES: raise FetchError(response) return response def transport_head(self, identifier): """Retrieve head information on api. """ return self._transport_action(identifier, method='head') def transport_request(self, identifier): """Implements SWHListerBase.transport_request for HTTP using Requests. Retrieve get information on api. """ return self._transport_action(identifier) def transport_response_to_string(self, response): """Implements SWHListerBase.transport_response_to_string for HTTP given Requests responses. """ s = pformat(response.request.path_url) s += '\n#\n' + pformat(response.request.headers) s += '\n#\n' + pformat(response.status_code) s += '\n#\n' + pformat(response.headers) s += '\n#\n' try: # json? s += pformat(response.json()) except Exception: # not json try: # xml? s += pformat(xmltodict.parse(response.text)) except Exception: # not xml s += pformat(response.text) return s class ListerOnePageApiTransport(SWHListerHttpTransport): """Leverage requests library to retrieve basic html page and parse result. To be used in conjunction with SWHListerBase or a subclass of it. """ PAGE = AbstractAttribute("The server api's unique page to retrieve and " "parse for information") PATH_TEMPLATE = None # we do not use it def __init__(self, api_baseurl=None): self.session = requests.Session() self.lister_version = __version__ def request_uri(self, _): """Get the full request URI given the transport_request identifier. """ return self.PAGE diff --git a/swh/lister/gitlab/lister.py b/swh/lister/gitlab/lister.py index 41cb7c6..e463fc4 100644 --- a/swh/lister/gitlab/lister.py +++ b/swh/lister/gitlab/lister.py @@ -1,113 +1,84 @@ # Copyright (C) 2018-2019 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import random import time from urllib3.util import parse_url from ..core.page_by_page_lister import PageByPageHttpLister from .models import GitLabModel class GitLabLister(PageByPageHttpLister): # Template path expecting an integer that represents the page id PATH_TEMPLATE = '/projects?page=%d&order_by=id' MODEL = GitLabModel LISTER_NAME = 'gitlab' def __init__(self, api_baseurl, instance=None, override_config=None, sort='asc', per_page=20): super().__init__(api_baseurl=api_baseurl, override_config=override_config) if instance is None: instance = parse_url(api_baseurl).host self.instance = instance self.PATH_TEMPLATE = '%s&sort=%s' % (self.PATH_TEMPLATE, sort) if per_page != 20: self.PATH_TEMPLATE = '%s&per_page=%s' % ( self.PATH_TEMPLATE, per_page) - def request_params(self, identifier): - """Get the full parameters passed to requests given the - transport_request identifier. - - For the gitlab lister, the 'credentials' entries is configured - per instance. For example:: - - - credentials: - - gitlab.com: - - username: user0 - password: - - username: user1 - password: - - ... - - other-gitlab-instance: - ... - - """ - params = { - 'headers': self.request_headers() or {} - } - creds_lister = self.config['credentials'].get(self.instance) - if creds_lister: - auth = random.choice(creds_lister) - if auth: - params['auth'] = (auth['username'], auth['password']) - return params - def uid(self, repo): return '%s/%s' % (self.instance, repo['path_with_namespace']) def get_model_from_repo(self, repo): return { 'instance': self.instance, 'uid': self.uid(repo), 'name': repo['name'], 'full_name': repo['path_with_namespace'], 'html_url': repo['web_url'], 'origin_url': repo['http_url_to_repo'], 'origin_type': 'git', 'description': repo['description'], } def transport_quota_check(self, response): """Deal with rate limit if any. """ # not all gitlab instance have rate limit if 'RateLimit-Remaining' in response.headers: reqs_remaining = int(response.headers['RateLimit-Remaining']) if response.status_code == 403 and reqs_remaining == 0: reset_at = int(response.headers['RateLimit-Reset']) delay = min(reset_at - time.time(), 3600) return True, delay return False, 0 def _get_int(self, headers, key): _val = headers.get(key) if _val: return int(_val) def get_next_target_from_response(self, response): """Determine the next page identifier. """ return self._get_int(response.headers, 'x-next-page') def get_pages_information(self): """Determine pages information. """ response = self.transport_head(identifier=1) if not response.ok: raise ValueError( 'Problem during information fetch: %s' % response.status_code) h = response.headers return (self._get_int(h, 'x-total'), self._get_int(h, 'x-total-pages'), self._get_int(h, 'x-per-page')) def transport_response_simplified(self, response): repos = response.json() return [self.get_model_from_repo(repo) for repo in repos] diff --git a/swh/lister/phabricator/lister.py b/swh/lister/phabricator/lister.py index c02103d..7009c10 100644 --- a/swh/lister/phabricator/lister.py +++ b/swh/lister/phabricator/lister.py @@ -1,143 +1,178 @@ # Copyright (C) 2019 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +import logging + import urllib.parse from swh.lister.core.indexing_lister import SWHIndexingHttpLister from swh.lister.phabricator.models import PhabricatorModel from collections import defaultdict +logger = logging.getLogger(__name__) -class PhabricatorLister(SWHIndexingHttpLister): - PATH_TEMPLATE = '&order=oldest&attachments[uris]=1&after=%s' +class PhabricatorLister(SWHIndexingHttpLister): + PATH_TEMPLATE = '?order=oldest&attachments[uris]=1&after=%s' MODEL = PhabricatorModel LISTER_NAME = 'phabricator' - def __init__(self, forge_url, api_token, instance=None, + def __init__(self, forge_url, instance=None, api_token=None, override_config=None): if forge_url.endswith("/"): forge_url = forge_url[:-1] self.forge_url = forge_url - api_endpoint = ('api/diffusion.repository.' - 'search?api.token=%s') % api_token - api_baseurl = '%s/%s' % (forge_url, api_endpoint) + api_baseurl = '%s/api/diffusion.repository.search' % forge_url + self.api_token = api_token if not instance: instance = urllib.parse.urlparse(forge_url).hostname self.instance = instance super().__init__(api_baseurl=api_baseurl, override_config=override_config) + def _build_query_params(self, params, api_token): + """Build query params to include the forge's api token + + Returns: + updated params dict with 'params' entry. + + """ + params.update({'params': {'api.token': api_token}}) + return params + + def request_params(self, identifier): + """Override the default params behavior to retrieve the api token + + Credentials are stored as: + + credentials: + phabricator: + : + - username: + password: + + """ + params = {} + params['headers'] = self.request_headers() or {} + if self.api_token: + return self._build_query_params(params, self.api_token) + instance_creds = self.request_instance_credentials() + if not instance_creds: + raise ValueError( + 'Phabricator forge needs authentication credential to list.') + api_token = instance_creds[0]['password'] + return self._build_query_params(params, api_token) + def request_headers(self): """ (Override) Set requests headers to send when querying the Phabricator API """ return {'User-Agent': 'Software Heritage phabricator lister', 'Accept': 'application/json'} def get_model_from_repo(self, repo): url = get_repo_url(repo['attachments']['uris']['uris']) if url is None: return None return { 'uid': self.forge_url + str(repo['id']), 'indexable': repo['id'], 'name': repo['fields']['shortName'], 'full_name': repo['fields']['name'], 'html_url': url, 'origin_url': url, 'description': None, 'origin_type': repo['fields']['vcs'] } def get_next_target_from_response(self, response): body = response.json()['result']['cursor'] if body['after'] != 'null': return body['after'] else: return None def transport_response_simplified(self, response): repos = response.json() if repos['result'] is None: raise ValueError( 'Problem during information fetch: %s' % repos['error_code']) repos = repos['result']['data'] return [self.get_model_from_repo(repo) for repo in repos] def filter_before_inject(self, models_list): """ (Overrides) SWHIndexingLister.filter_before_inject Bounds query results by this Lister's set max_index. """ models_list = [m for m in models_list if m is not None] return super().filter_before_inject(models_list) def _bootstrap_repositories_listing(self): """ Method called when no min_bound value has been provided to the lister. Its purpose is to: 1. get the first repository data hosted on the Phabricator instance 2. inject them into the lister database 3. return the first repository index to start the listing after that value Returns: int: The first repository index """ params = '&order=oldest&limit=1' response = self.safely_issue_request(params) models_list = self.transport_response_simplified(response) self.max_index = models_list[0]['indexable'] models_list = self.filter_before_inject(models_list) injected = self.inject_repo_data_into_db(models_list) self.schedule_missing_tasks(models_list, injected) return self.max_index def run(self, min_bound=None, max_bound=None): """ (Override) Run the lister on the specified Phabricator instance Args: min_bound (int): Optional repository index to start the listing after it max_bound (int): Optional repository index to stop the listing after it """ # initial call to the lister, we need to bootstrap it in that case if min_bound is None: min_bound = self._bootstrap_repositories_listing() super().run(min_bound, max_bound) def get_repo_url(attachments): """ Return url for a hosted repository from its uris attachments according to the following priority lists: * protocol: https > http * identifier: shortname > callsign > id """ processed_urls = defaultdict(dict) for uri in attachments: protocol = uri['fields']['builtin']['protocol'] url = uri['fields']['uri']['effective'] identifier = uri['fields']['builtin']['identifier'] if protocol in ('http', 'https'): processed_urls[protocol][identifier] = url elif protocol is None: for protocol in ('https', 'http'): if url.startswith(protocol): processed_urls[protocol]['undefined'] = url break for protocol in ['https', 'http']: for identifier in ['shortname', 'callsign', 'id', 'undefined']: if (protocol in processed_urls and identifier in processed_urls[protocol]): return processed_urls[protocol][identifier] return None diff --git a/swh/lister/phabricator/tasks.py b/swh/lister/phabricator/tasks.py index 3ebb981..ce37fa4 100644 --- a/swh/lister/phabricator/tasks.py +++ b/swh/lister/phabricator/tasks.py @@ -1,29 +1,29 @@ # Copyright (C) 2019 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from swh.scheduler.celery_backend.config import app from swh.lister.phabricator.lister import PhabricatorLister -def new_lister(forge_url='https://forge.softwareheritage.org', api_token='', - instance='swh', **kw): - return PhabricatorLister(forge_url=forge_url, api_token=api_token, - instance=instance, **kw) +def new_lister(forge_url='https://forge.softwareheritage.org', instance='swh', + api_token=None, **kw): + return PhabricatorLister( + forge_url=forge_url, instance=instance, api_token=api_token, **kw) @app.task(name=__name__ + '.IncrementalPhabricatorLister') def incremental_phabricator_lister(**lister_args): lister = new_lister(**lister_args) lister.run(min_bound=lister.db_last_index()) @app.task(name=__name__ + '.FullPhabricatorLister') def full_phabricator_lister(**lister_args): lister = new_lister(**lister_args) lister.run() @app.task(name=__name__ + '.ping') def ping(): return 'OK' diff --git a/swh/lister/phabricator/tests/test_tasks.py b/swh/lister/phabricator/tests/test_tasks.py index a97196f..0aa27d6 100644 --- a/swh/lister/phabricator/tests/test_tasks.py +++ b/swh/lister/phabricator/tests/test_tasks.py @@ -1,30 +1,30 @@ from unittest.mock import patch def test_ping(swh_app, celery_session_worker): res = swh_app.send_task( 'swh.lister.phabricator.tasks.ping') assert res res.wait() assert res.successful() assert res.result == 'OK' @patch('swh.lister.phabricator.tasks.PhabricatorLister') def test_incremental(lister, swh_app, celery_session_worker): # setup the mocked PhabricatorLister lister.return_value = lister lister.db_last_index.return_value = 42 lister.run.return_value = None res = swh_app.send_task( 'swh.lister.phabricator.tasks.IncrementalPhabricatorLister') assert res res.wait() assert res.successful() lister.assert_called_once_with( - api_token='', forge_url='https://forge.softwareheritage.org', + api_token=None, forge_url='https://forge.softwareheritage.org', instance='swh') lister.db_last_index.assert_called_once_with() lister.run.assert_called_once_with(min_bound=42) diff --git a/version.txt b/version.txt index b1c39c9..3a5f0dc 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -v0.0.25-0-g72e208a \ No newline at end of file +v0.0.26-0-g6d11705 \ No newline at end of file