Changeset View
Changeset View
Standalone View
Standalone View
swh/lister/bitbucket/lister.py
# Copyright (C) 2017-2019 The Software Heritage developers | # Copyright (C) 2017-2021 The Software Heritage developers | ||||||||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||||||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||||||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||||||||
from datetime import datetime, timezone | from dataclasses import asdict, dataclass | ||||||||||
from datetime import datetime | |||||||||||
import logging | import logging | ||||||||||
from typing import Any, Dict, List, Optional | from typing import Any, Dict, Iterator, List, Optional | ||||||||||
from urllib import parse | from urllib import parse | ||||||||||
import iso8601 | import iso8601 | ||||||||||
from requests import Response | import requests | ||||||||||
from tenacity.before_sleep import before_sleep_log | |||||||||||
from swh.lister.bitbucket.models import BitBucketModel | from swh.lister.utils import throttling_retry | ||||||||||
ardumont: Do we entirely drop the old bitbucket lister code?
If yes, you can also remove that module. | |||||||||||
from swh.lister.core.indexing_lister import IndexingHttpLister | from swh.scheduler.interface import SchedulerInterface | ||||||||||
from swh.scheduler.model import ListedOrigin | |||||||||||
from .. import USER_AGENT | |||||||||||
from ..pattern import CredentialsType, Lister | |||||||||||
logger = logging.getLogger(__name__) | logger = logging.getLogger(__name__) | ||||||||||
class BitBucketLister(IndexingHttpLister): | @dataclass | ||||||||||
PATH_TEMPLATE = "/repositories?after=%s" | class BitbucketListerState: | ||||||||||
MODEL = BitBucketModel | """State of Bitbucket lister""" | ||||||||||
last_repo_cdate: Optional[datetime] = None | |||||||||||
"""Creation date and time of the last listed repository during an | |||||||||||
incremental pass""" | |||||||||||
class BitbucketLister(Lister[BitbucketListerState, List[Dict[str, Any]]]): | |||||||||||
"""List origins from Bitbucket using its REST API. | |||||||||||
Bitbucket API has the following rate-limit configuration: | |||||||||||
* 60 requests per hour for anonymous users | |||||||||||
* 1000 requests per hour for authenticated users | |||||||||||
The lister is working in anonymous mode by default but Bitbucket account | |||||||||||
credentials can be provided to perform authenticated requests. | |||||||||||
""" | |||||||||||
LISTER_NAME = "bitbucket" | LISTER_NAME = "bitbucket" | ||||||||||
DEFAULT_URL = "https://api.bitbucket.org/2.0" | INSTANCE = "bitbucket" | ||||||||||
instance = "bitbucket" | |||||||||||
default_min_bound = datetime.fromtimestamp(0, timezone.utc) # type: Any | API_URL = "https://api.bitbucket.org/2.0/repositories" | ||||||||||
def __init__( | def __init__( | ||||||||||
self, url: str = None, override_config=None, per_page: int = 100 | self, | ||||||||||
) -> None: | scheduler: SchedulerInterface, | ||||||||||
super().__init__(url=url, override_config=override_config) | page_size: int = 1000, | ||||||||||
per_page = self.config.get("per_page", per_page) | incremental: bool = True, | ||||||||||
credentials: CredentialsType = None, | |||||||||||
self.PATH_TEMPLATE = "%s&pagelen=%s" % (self.PATH_TEMPLATE, per_page) | ): | ||||||||||
super().__init__( | |||||||||||
def get_model_from_repo(self, repo: Dict) -> Dict[str, Any]: | scheduler=scheduler, | ||||||||||
return { | credentials=credentials, | ||||||||||
"uid": repo["uuid"], | url=self.API_URL, | ||||||||||
"indexable": iso8601.parse_date(repo["created_on"]), | instance=self.INSTANCE, | ||||||||||
"name": repo["name"], | ) | ||||||||||
"full_name": repo["full_name"], | |||||||||||
"html_url": repo["links"]["html"]["href"], | self.incremental = incremental | ||||||||||
"origin_url": repo["links"]["clone"][0]["href"], | |||||||||||
"origin_type": repo["scm"], | self.url_params = { | ||||||||||
"pagelen": page_size, | |||||||||||
# only return needed JSON fields in bitbucket API responses | |||||||||||
# (also prevent errors 500 when listing) | |||||||||||
"fields": ( | |||||||||||
"next,values.links.clone.href,values.scm,values.updated_on," | |||||||||||
"values.created_on" | |||||||||||
), | |||||||||||
} | } | ||||||||||
def get_next_target_from_response(self, response: Response) -> Optional[datetime]: | self.session = requests.Session() | ||||||||||
"""This will read the 'next' link from the api response if any | self.session.headers.update( | ||||||||||
and return it as a datetime. | {"Accept": "application/json", "User-Agent": USER_AGENT} | ||||||||||
) | |||||||||||
Args: | |||||||||||
response (Response): requests' response from api call | if len(self.credentials) > 0: | ||||||||||
if len(self.credentials) > 1: | |||||||||||
logger.warning( | |||||||||||
"Bitbucket lister support only one username:password" | |||||||||||
Done Inline Actions — vlorentz: Maybe emit a warning if there is more than one set of credentials? | |||||||||||
" pair as of now. Will use the first one." | |||||||||||
) | |||||||||||
cred = self.credentials[0] | |||||||||||
self.set_credentials(cred["username"], cred["password"]) | |||||||||||
def state_from_dict(self, d: Dict[str, Any]) -> BitbucketListerState: | |||||||||||
last_repo_cdate = d.get("last_repo_cdate") | |||||||||||
if last_repo_cdate is not None: | |||||||||||
d["last_repo_cdate"] = iso8601.parse_date(last_repo_cdate) | |||||||||||
return BitbucketListerState(**d) | |||||||||||
def state_to_dict(self, state: BitbucketListerState) -> Dict[str, Any]: | |||||||||||
d = asdict(state) | |||||||||||
last_repo_cdate = d.get("last_repo_cdate") | |||||||||||
Done Inline Actions — vlorentz: Shouldn't these be (class)methods of `BitbucketListerState`? | |||||||||||
Done Inline Actions — tenma: The new lister API has them as instance methods of the lister, but indeed the API could be amended to have them as ListerState class methods. | |||||||||||
Done Inline Actions — vlorentz: Yeah, that makes sense; let's leave it like this then. | |||||||||||
if last_repo_cdate is not None: | |||||||||||
d["last_repo_cdate"] = last_repo_cdate.isoformat() | |||||||||||
Done Inline Actions
vlorentz: | |||||||||||
return d | |||||||||||
Done Inline Actions — anlambert: Can you rename the function to `set_credentials`? Shorter is better. Also, the parameters should be typed as `Optional[str]`. | |||||||||||
def set_credentials(self, username: Optional[str], password: Optional[str]) -> None: | |||||||||||
"""Set basic authentication headers with given credentials.""" | |||||||||||
if username is not None and password is not None: | |||||||||||
Done Inline Actions
vlorentz: | |||||||||||
self.session.auth = (username, password) | |||||||||||
@throttling_retry(before_sleep=before_sleep_log(logger, logging.DEBUG)) | |||||||||||
Done Inline Actions — anlambert: You can remove that parameter, as it is already the default value of the decorator. | |||||||||||
def page_request(self, last_repo_cdate: str) -> requests.Response: | |||||||||||
Done Inline Actions — vlorentz: And why does it need to be `Optional[bool]` instead of just `bool`? | |||||||||||
self.url_params["after"] = last_repo_cdate | |||||||||||
logger.debug("Fetching URL %s with params %s", self.url, self.url_params) | |||||||||||
response = self.session.get(self.url, params=self.url_params) | |||||||||||
if response.status_code != 200: | |||||||||||
logger.warning( | |||||||||||
"Unexpected HTTP status code %s on %s: %s", | |||||||||||
response.status_code, | |||||||||||
response.url, | |||||||||||
response.content, | |||||||||||
) | |||||||||||
response.raise_for_status() | |||||||||||
return response | |||||||||||
def get_pages(self) -> Iterator[List[Dict[str, Any]]]: | |||||||||||
last_repo_cdate: str = "1970-01-01" | |||||||||||
if ( | |||||||||||
self.incremental | |||||||||||
and self.state is not None | |||||||||||
and self.state.last_repo_cdate is not None | |||||||||||
): | |||||||||||
last_repo_cdate = self.state.last_repo_cdate.isoformat() | |||||||||||
while True: | |||||||||||
body = self.page_request(last_repo_cdate).json() | |||||||||||
yield body["values"] | |||||||||||
next_page_url = body.get("next") | |||||||||||
if next_page_url is not None: | |||||||||||
next_page_url = parse.urlparse(next_page_url) | |||||||||||
if not next_page_url.query: | |||||||||||
logger.warning("Failed to parse url %s", next_page_url) | |||||||||||
break | |||||||||||
last_repo_cdate = parse.parse_qs(next_page_url.query)["after"][0] | |||||||||||
else: | |||||||||||
# last page | |||||||||||
break | |||||||||||
Returns: | def get_origins_from_page( | ||||||||||
next date as a datetime | self, page: List[Dict[str, Any]] | ||||||||||
) -> Iterator[ListedOrigin]: | |||||||||||
"""Convert a page of Bitbucket repositories into a list of ListedOrigins. | |||||||||||
""" | """ | ||||||||||
body = response.json() | assert self.lister_obj.id is not None | ||||||||||
next_ = body.get("next") | |||||||||||
if next_ is not None: | for repo in page: | ||||||||||
next_ = parse.urlparse(next_) | last_update = iso8601.parse_date(repo["updated_on"]) | ||||||||||
return iso8601.parse_date(parse.parse_qs(next_.query)["after"][0]) | origin_url = repo["links"]["clone"][0]["href"] | ||||||||||
return None | origin_type = repo["scm"] | ||||||||||
def transport_response_simplified(self, response: Response) -> List[Dict[str, Any]]: | yield ListedOrigin( | ||||||||||
repos = response.json()["values"] | lister_id=self.lister_obj.id, | ||||||||||
return [self.get_model_from_repo(repo) for repo in repos] | url=origin_url, | ||||||||||
visit_type=origin_type, | |||||||||||
def request_uri(self, identifier: datetime) -> str: # type: ignore | last_update=last_update, | ||||||||||
identifier_str = parse.quote(identifier.isoformat()) | ) | ||||||||||
Done Inline Actions — vlorentz: IMO this `yield` should be moved before `if next_page_url is not None` for two reasons:
1. we… | |||||||||||
return super().request_uri(identifier_str or "1970-01-01") | |||||||||||
Not Done Inline Actions
vlorentz: | |||||||||||
def commit_page(self, page: List[Dict[str, Any]]) -> None: | |||||||||||
def is_within_bounds( | """Update the currently stored state using the latest listed page.""" | ||||||||||
self, inner: int, lower: Optional[int] = None, upper: Optional[int] = None | if self.incremental: | ||||||||||
) -> bool: | last_repo = page[-1] | ||||||||||
# values are expected to be datetimes | last_repo_cdate = iso8601.parse_date(last_repo["created_on"]) | ||||||||||
if lower is None and upper is None: | |||||||||||
ret = True | if ( | ||||||||||
elif lower is None: | self.state.last_repo_cdate is None | ||||||||||
ret = inner <= upper # type: ignore | or last_repo_cdate > self.state.last_repo_cdate | ||||||||||
elif upper is None: | ): | ||||||||||
ret = inner >= lower | self.state.last_repo_cdate = last_repo_cdate | ||||||||||
else: | |||||||||||
ret = lower <= inner <= upper | def finalize(self) -> None: | ||||||||||
return ret | if self.incremental: | ||||||||||
scheduler_state = self.get_state_from_scheduler() | |||||||||||
if self.state.last_repo_cdate is None: | |||||||||||
return | |||||||||||
# Update the lister state in the backend only if the last seen id of | |||||||||||
# the current run is higher than that stored in the database. | |||||||||||
if ( | |||||||||||
scheduler_state.last_repo_cdate is None | |||||||||||
or self.state.last_repo_cdate > scheduler_state.last_repo_cdate | |||||||||||
): | |||||||||||
self.updated = True |
Do we entirely drop the old bitbucket lister code?
If yes, you can also remove that module.