Changeset View
Changeset View
Standalone View
Standalone View
swh/lister/phabricator/lister.py
# Copyright (C) 2019 the Software Heritage developers | # Copyright (C) 2019 the Software Heritage developers | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import logging | import logging | ||||
import random | import random | ||||
import urllib.parse | import urllib.parse | ||||
from collections import defaultdict | from collections import defaultdict | ||||
from sqlalchemy import func | from sqlalchemy import func | ||||
from swh.lister.core.indexing_lister import IndexingHttpLister | from swh.lister.core.indexing_lister import IndexingHttpLister | ||||
from swh.lister.phabricator.models import PhabricatorModel | from swh.lister.phabricator.models import PhabricatorModel | ||||
from typing import Any, Dict, List, Optional | |||||
from requests import Response | |||||
logger = logging.getLogger(__name__) | logger = logging.getLogger(__name__) | ||||
class PhabricatorLister(IndexingHttpLister): | class PhabricatorLister(IndexingHttpLister): | ||||
PATH_TEMPLATE = '?order=oldest&attachments[uris]=1&after=%s' | PATH_TEMPLATE = '?order=oldest&attachments[uris]=1&after=%s' | ||||
DEFAULT_URL = \ | DEFAULT_URL = \ | ||||
'https://forge.softwareheritage.org/api/diffusion.repository.search' | 'https://forge.softwareheritage.org/api/diffusion.repository.search' | ||||
MODEL = PhabricatorModel | MODEL = PhabricatorModel | ||||
LISTER_NAME = 'phabricator' | LISTER_NAME = 'phabricator' | ||||
def __init__(self, url=None, instance=None, override_config=None): | def __init__(self, url=None, instance=None, override_config=None): | ||||
super().__init__(url=url, override_config=override_config) | super().__init__(url=url, override_config=override_config) | ||||
if not instance: | if not instance: | ||||
instance = urllib.parse.urlparse(self.url).hostname | instance = urllib.parse.urlparse(self.url).hostname | ||||
self.instance = instance | self.instance = instance | ||||
def request_params(self, identifier): | def request_params(self, identifier: int) -> Dict[str, Any]: | ||||
"""Override the default params behavior to retrieve the api token | """Override the default params behavior to retrieve the api token | ||||
Credentials are stored as: | Credentials are stored as: | ||||
credentials: | credentials: | ||||
phabricator: | phabricator: | ||||
<instance>: | <instance>: | ||||
- username: <account> | - username: <account> | ||||
Show All 13 Lines | def request_headers(self): | ||||
""" | """ | ||||
(Override) Set requests headers to send when querying the | (Override) Set requests headers to send when querying the | ||||
Phabricator API | Phabricator API | ||||
""" | """ | ||||
headers = super().request_headers() | headers = super().request_headers() | ||||
headers['Accept'] = 'application/json' | headers['Accept'] = 'application/json' | ||||
return headers | return headers | ||||
def get_model_from_repo(self, repo): | def get_model_from_repo( | ||||
self, repo: Dict[str, Any]) -> Optional[Dict[str, Any]]: | |||||
url = get_repo_url(repo['attachments']['uris']['uris']) | url = get_repo_url(repo['attachments']['uris']['uris']) | ||||
if url is None: | if url is None: | ||||
return None | return None | ||||
return { | return { | ||||
'uid': url, | 'uid': url, | ||||
'indexable': repo['id'], | 'indexable': repo['id'], | ||||
'name': repo['fields']['shortName'], | 'name': repo['fields']['shortName'], | ||||
'full_name': repo['fields']['name'], | 'full_name': repo['fields']['name'], | ||||
'html_url': url, | 'html_url': url, | ||||
'origin_url': url, | 'origin_url': url, | ||||
'origin_type': repo['fields']['vcs'], | 'origin_type': repo['fields']['vcs'], | ||||
'instance': self.instance, | 'instance': self.instance, | ||||
} | } | ||||
def get_next_target_from_response(self, response): | def get_next_target_from_response( | ||||
self, response: Response) -> Optional[int]: | |||||
body = response.json()['result']['cursor'] | body = response.json()['result']['cursor'] | ||||
if body['after'] and body['after'] != 'null': | if body['after'] and body['after'] != 'null': | ||||
return int(body['after']) | return int(body['after']) | ||||
return None | |||||
def transport_response_simplified(self, response): | def transport_response_simplified( | ||||
self, response: Response) -> List[Optional[Dict[str, Any]]]: | |||||
repos = response.json() | repos = response.json() | ||||
if repos['result'] is None: | if repos['result'] is None: | ||||
raise ValueError( | raise ValueError( | ||||
'Problem during information fetch: %s' % repos['error_code']) | 'Problem during information fetch: %s' % repos['error_code']) | ||||
repos = repos['result']['data'] | repos = repos['result']['data'] | ||||
return [self.get_model_from_repo(repo) for repo in repos] | return [self.get_model_from_repo(repo) for repo in repos] | ||||
def filter_before_inject(self, models_list): | def filter_before_inject(self, models_list): | ||||
vlorentz: missing types | |||||
""" | """ | ||||
(Overrides) IndexingLister.filter_before_inject | (Overrides) IndexingLister.filter_before_inject | ||||
Bounds query results by this Lister's set max_index. | Bounds query results by this Lister's set max_index. | ||||
""" | """ | ||||
models_list = [m for m in models_list if m is not None] | models_list = [m for m in models_list if m is not None] | ||||
return super().filter_before_inject(models_list) | return super().filter_before_inject(models_list) | ||||
def disable_deleted_repo_tasks(self, index, next_index, keep_these): | def disable_deleted_repo_tasks( | ||||
self, index: int, next_index: int, keep_these: str): | |||||
""" | """ | ||||
(Overrides) Fix provided index value to avoid: | (Overrides) Fix provided index value to avoid: | ||||
- database query error | - database query error | ||||
- erroneously disabling some scheduler tasks | - erroneously disabling some scheduler tasks | ||||
""" | """ | ||||
# First call to the Phabricator API uses an empty 'after' parameter, | # First call to the Phabricator API uses an empty 'after' parameter, | ||||
# so set the index to 0 to avoid database query error | # so set the index to 0 to avoid database query error | ||||
if index == '': | if index == '': | ||||
index = 0 | index = 0 | ||||
# Next listed repository ids are strictly greater than the 'after' | # Next listed repository ids are strictly greater than the 'after' | ||||
# parameter, so increment the index to avoid disabling the latest | # parameter, so increment the index to avoid disabling the latest | ||||
# created task when processing a new repositories page returned by | # created task when processing a new repositories page returned by | ||||
# the Phabricator API | # the Phabricator API | ||||
else: | else: | ||||
index = index + 1 | index = index + 1 | ||||
return super().disable_deleted_repo_tasks(index, next_index, | return super().disable_deleted_repo_tasks(index, next_index, | ||||
keep_these) | keep_these) | ||||
def db_first_index(self): | def db_first_index(self) -> Optional[int]: | ||||
""" | """ | ||||
(Overrides) Filter results by Phabricator instance | (Overrides) Filter results by Phabricator instance | ||||
Returns: | Returns: | ||||
the smallest indexable value of all repos in the db | the smallest indexable value of all repos in the db | ||||
""" | """ | ||||
t = self.db_session.query(func.min(self.MODEL.indexable)) | t = self.db_session.query(func.min(self.MODEL.indexable)) | ||||
t = t.filter(self.MODEL.instance == self.instance).first() | t = t.filter(self.MODEL.instance == self.instance).first() | ||||
if t: | if t: | ||||
return t[0] | return t[0] | ||||
return None | |||||
def db_last_index(self): | def db_last_index(self): | ||||
""" | """ | ||||
(Overrides) Filter results by Phabricator instance | (Overrides) Filter results by Phabricator instance | ||||
Returns: | Returns: | ||||
the largest indexable value of all instance repos in the db | the largest indexable value of all instance repos in the db | ||||
""" | """ | ||||
t = self.db_session.query(func.max(self.MODEL.indexable)) | t = self.db_session.query(func.max(self.MODEL.indexable)) | ||||
t = t.filter(self.MODEL.instance == self.instance).first() | t = t.filter(self.MODEL.instance == self.instance).first() | ||||
if t: | if t: | ||||
return t[0] | return t[0] | ||||
def db_query_range(self, start, end): | def db_query_range(self, start: int, end: int): | ||||
Not Done Inline Actionsmissing return type vlorentz: missing return type | |||||
""" | """ | ||||
(Overrides) Filter the results by the Phabricator instance to | (Overrides) Filter the results by the Phabricator instance to | ||||
avoid disabling loading tasks for repositories hosted on a | avoid disabling loading tasks for repositories hosted on a | ||||
different instance. | different instance. | ||||
Returns: | Returns: | ||||
a list of sqlalchemy.ext.declarative.declarative_base objects | a list of sqlalchemy.ext.declarative.declarative_base objects | ||||
with indexable values within the given range for the instance | with indexable values within the given range for the instance | ||||
""" | """ | ||||
retlist = super().db_query_range(start, end) | retlist = super().db_query_range(start, end) | ||||
return retlist.filter(self.MODEL.instance == self.instance) | return retlist.filter(self.MODEL.instance == self.instance) | ||||
def get_repo_url(attachments): | def get_repo_url(attachments: List[Dict[str, Any]]) -> Optional[int]: | ||||
""" | """ | ||||
Return url for a hosted repository from its uris attachments according | Return url for a hosted repository from its uris attachments according | ||||
to the following priority lists: | to the following priority lists: | ||||
* protocol: https > http | * protocol: https > http | ||||
* identifier: shortname > callsign > id | * identifier: shortname > callsign > id | ||||
""" | """ | ||||
processed_urls = defaultdict(dict) | processed_urls = defaultdict(dict) # type: Dict[str, Any] | ||||
for uri in attachments: | for uri in attachments: | ||||
protocol = uri['fields']['builtin']['protocol'] | protocol = uri['fields']['builtin']['protocol'] | ||||
url = uri['fields']['uri']['effective'] | url = uri['fields']['uri']['effective'] | ||||
identifier = uri['fields']['builtin']['identifier'] | identifier = uri['fields']['builtin']['identifier'] | ||||
if protocol in ('http', 'https'): | if protocol in ('http', 'https'): | ||||
processed_urls[protocol][identifier] = url | processed_urls[protocol][identifier] = url | ||||
elif protocol is None: | elif protocol is None: | ||||
for protocol in ('https', 'http'): | for protocol in ('https', 'http'): | ||||
Show All 9 Lines |
missing types