Changeset View
Changeset View
Standalone View
Standalone View
swh/lister/gitlab/lister.py
# Copyright (C) 2018-2022 The Software Heritage developers | # Copyright (C) 2018-2022 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
from dataclasses import asdict, dataclass | from dataclasses import asdict, dataclass | ||||
import logging | import logging | ||||
import random | import random | ||||
from typing import Any, Dict, Iterator, Optional, Tuple | from typing import Any, Dict, Iterator, List, Optional, Tuple | ||||
from urllib.parse import parse_qs, urlencode, urlparse | from urllib.parse import parse_qs, urlencode, urlparse | ||||
import iso8601 | import iso8601 | ||||
from requests.exceptions import HTTPError | from requests.exceptions import HTTPError | ||||
from requests.status_codes import codes | from requests.status_codes import codes | ||||
from tenacity.before_sleep import before_sleep_log | from tenacity.before_sleep import before_sleep_log | ||||
from swh.lister.pattern import CredentialsType, Lister | from swh.lister.pattern import CredentialsType, Lister | ||||
▲ Show 20 Lines • Show All 69 Lines • ▼ Show 20 Lines | class GitLabLister(Lister[GitLabListerState, PageResult]): | ||||
Args: | Args: | ||||
scheduler: a scheduler instance | scheduler: a scheduler instance | ||||
url: the api v4 url of the gitlab instance to visit (e.g. | url: the api v4 url of the gitlab instance to visit (e.g. | ||||
https://gitlab.com/api/v4/) | https://gitlab.com/api/v4/) | ||||
instance: a specific instance name (e.g. gitlab, tor, git-kernel, ...), | instance: a specific instance name (e.g. gitlab, tor, git-kernel, ...), | ||||
url network location will be used if not provided | url network location will be used if not provided | ||||
incremental: defines if incremental listing is activated or not | incremental: defines if incremental listing is activated or not | ||||
ignored_project_prefixes: List of prefixes of project paths to ignore | |||||
""" | """ | ||||
def __init__( | def __init__( | ||||
self, | self, | ||||
scheduler, | scheduler, | ||||
url: str, | url: str, | ||||
name: Optional[str] = "gitlab", | name: Optional[str] = "gitlab", | ||||
instance: Optional[str] = None, | instance: Optional[str] = None, | ||||
credentials: Optional[CredentialsType] = None, | credentials: Optional[CredentialsType] = None, | ||||
incremental: bool = False, | incremental: bool = False, | ||||
ignored_project_prefixes: Optional[List[str]] = None, | |||||
): | ): | ||||
if name is not None: | if name is not None: | ||||
self.LISTER_NAME = name | self.LISTER_NAME = name | ||||
super().__init__( | super().__init__( | ||||
scheduler=scheduler, | scheduler=scheduler, | ||||
url=url.rstrip("/"), | url=url.rstrip("/"), | ||||
instance=instance, | instance=instance, | ||||
credentials=credentials, | credentials=credentials, | ||||
) | ) | ||||
self.incremental = incremental | self.incremental = incremental | ||||
self.last_page: Optional[str] = None | self.last_page: Optional[str] = None | ||||
self.per_page = 100 | self.per_page = 100 | ||||
self.ignored_project_prefixes: Optional[Tuple[str, ...]] = None | |||||
if ignored_project_prefixes: | |||||
self.ignored_project_prefixes = tuple(ignored_project_prefixes) | |||||
self.session.headers.update({"Accept": "application/json"}) | self.session.headers.update({"Accept": "application/json"}) | ||||
if len(self.credentials) > 0: | if len(self.credentials) > 0: | ||||
cred = random.choice(self.credentials) | cred = random.choice(self.credentials) | ||||
logger.info( | logger.info( | ||||
"Using %s credentials from user %s", self.instance, cred["username"] | "Using %s credentials from user %s", self.instance, cred["username"] | ||||
) | ) | ||||
▲ Show 20 Lines • Show All 72 Lines • ▼ Show 20 Lines | def get_pages(self) -> Iterator[PageResult]: | ||||
yield page_result | yield page_result | ||||
next_page = page_result.next_page | next_page = page_result.next_page | ||||
def get_origins_from_page(self, page_result: PageResult) -> Iterator[ListedOrigin]: | def get_origins_from_page(self, page_result: PageResult) -> Iterator[ListedOrigin]: | ||||
assert self.lister_obj.id is not None | assert self.lister_obj.id is not None | ||||
repositories = page_result.repositories if page_result.repositories else [] | repositories = page_result.repositories if page_result.repositories else [] | ||||
for repo in repositories: | for repo in repositories: | ||||
if self.ignored_project_prefixes and repo["path_with_namespace"].startswith( | |||||
self.ignored_project_prefixes | |||||
): | |||||
continue | |||||
visit_type = repo.get("vcs_type", "git") | visit_type = repo.get("vcs_type", "git") | ||||
visit_type = VCS_MAPPING.get(visit_type, visit_type) | visit_type = VCS_MAPPING.get(visit_type, visit_type) | ||||
yield ListedOrigin( | yield ListedOrigin( | ||||
lister_id=self.lister_obj.id, | lister_id=self.lister_obj.id, | ||||
url=repo["http_url_to_repo"], | url=repo["http_url_to_repo"], | ||||
visit_type=visit_type, | visit_type=visit_type, | ||||
last_update=iso8601.parse_date(repo["last_activity_at"]), | last_update=iso8601.parse_date(repo["last_activity_at"]), | ||||
) | ) | ||||
▲ Show 20 Lines • Show All 47 Lines • Show Last 20 Lines |