Changeset View
Changeset View
Standalone View
Standalone View
swh/lister/launchpad/lister.py
# Copyright (C) 2020-2021 The Software Heritage developers | # Copyright (C) 2020-2022 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
from dataclasses import dataclass | from dataclasses import dataclass | ||||
from datetime import datetime | from datetime import datetime | ||||
import logging | import logging | ||||
from typing import Any, Dict, Iterator, Optional | from typing import Any, Dict, Iterator, Optional, Tuple | ||||
import iso8601 | import iso8601 | ||||
from launchpadlib.launchpad import Launchpad | from launchpadlib.launchpad import Launchpad | ||||
from lazr.restfulclient.resource import Collection | from lazr.restfulclient.resource import Collection | ||||
from swh.scheduler.interface import SchedulerInterface | from swh.scheduler.interface import SchedulerInterface | ||||
from swh.scheduler.model import ListedOrigin | from swh.scheduler.model import ListedOrigin | ||||
from ..pattern import CredentialsType, Lister | from ..pattern import CredentialsType, Lister | ||||
logger = logging.getLogger(__name__) | logger = logging.getLogger(__name__) | ||||
LaunchpadPageType = Iterator[Collection] | VcsType = str | ||||
LaunchpadPageType = Tuple[VcsType, Collection] | |||||
@dataclass | @dataclass | ||||
class LaunchpadListerState: | class LaunchpadListerState: | ||||
"""State of Launchpad lister""" | """State of Launchpad lister""" | ||||
date_last_modified: Optional[datetime] = None | git_date_last_modified: Optional[datetime] = None | ||||
ardumont: That means either I'll reset the state in the scheduling db or I'll alter the data when deploying this. | |||||
Not Done Inline Actions — I think altering the JSON data in the scheduler db should be a good move as we already listed plenty of git repos.
21:54 $ psql service=swh-scheduler
psql (12.10 (Debian 12.10-1.pgdg110+1), server 12.9 (Debian 12.9-1.pgdg110+1))
SSL connection (protocol: TLSv1.3, cipher: TLS_AES_256_GCM_SHA384, bits: 256, compression: off)
Type "help" for help.
softwareheritage-scheduler=> select current_state from listers where name = 'launchpad';
current_state
------------------------------------------------------------
{"date_last_modified": "2022-02-16T19:32:09.400561+00:00"}
(1 row)
anlambert: I think altering the JSON data in the scheduler db should be a good move as we already listed… | |||||
Done Inline Actions — Yes, I think so as well. ardumont: Yes, I think so as well. | |||||
"""modification date of last updated repository since last listing""" | """modification date of last updated git repository since last listing""" | ||||
bzr_date_last_modified: Optional[datetime] = None | |||||
"""modification date of last updated bzr repository since last listing""" | |||||
def origin(vcs_type: str, repo: Any) -> str: | |||||
"""Determine the origin url out of a repository with a given vcs_type""" | |||||
return repo.git_https_url if vcs_type == "git" else repo.web_link | |||||
class LaunchpadLister(Lister[LaunchpadListerState, LaunchpadPageType]): | class LaunchpadLister(Lister[LaunchpadListerState, LaunchpadPageType]): | ||||
""" | """ | ||||
List git repositories from Launchpad. | List repositories from Launchpad (git or bzr). | ||||
Args: | Args: | ||||
scheduler: instance of SchedulerInterface | scheduler: instance of SchedulerInterface | ||||
incremental: defines if incremental listing should be used, in that case | incremental: defines if incremental listing should be used, in that case | ||||
only modified or new repositories since last incremental listing operation | only modified or new repositories since last incremental listing operation | ||||
will be returned | will be returned | ||||
""" | """ | ||||
LISTER_NAME = "launchpad" | LISTER_NAME = "launchpad" | ||||
def __init__( | def __init__( | ||||
self, | self, | ||||
scheduler: SchedulerInterface, | scheduler: SchedulerInterface, | ||||
incremental: bool = False, | incremental: bool = False, | ||||
credentials: CredentialsType = None, | credentials: CredentialsType = None, | ||||
): | ): | ||||
super().__init__( | super().__init__( | ||||
scheduler=scheduler, | scheduler=scheduler, | ||||
url="https://launchpad.net/", | url="https://launchpad.net/", | ||||
instance="launchpad", | instance="launchpad", | ||||
credentials=credentials, | credentials=credentials, | ||||
) | ) | ||||
self.incremental = incremental | self.incremental = incremental | ||||
self.date_last_modified = None | self.date_last_modified: Dict[str, Optional[datetime]] = { | ||||
"git": None, | |||||
"bzr": None, | |||||
} | |||||
def state_from_dict(self, d: Dict[str, Any]) -> LaunchpadListerState: | def state_from_dict(self, d: Dict[str, Any]) -> LaunchpadListerState: | ||||
date_last_modified = d.get("date_last_modified") | for vcs_type in ["git", "bzr"]: | ||||
key = f"{vcs_type}_date_last_modified" | |||||
date_last_modified = d.get(key) | |||||
if date_last_modified is not None: | if date_last_modified is not None: | ||||
d["date_last_modified"] = iso8601.parse_date(date_last_modified) | d[key] = iso8601.parse_date(date_last_modified) | ||||
Done Inline Actions — Use a tuple instead of a list. anlambert: Use a tuple instead of a list. | |||||
Done Inline Actions — ardumont: D7196 | |||||
return LaunchpadListerState(**d) | return LaunchpadListerState(**d) | ||||
def state_to_dict(self, state: LaunchpadListerState) -> Dict[str, Any]: | def state_to_dict(self, state: LaunchpadListerState) -> Dict[str, Any]: | ||||
d: Dict[str, Optional[str]] = {"date_last_modified": None} | d: Dict[str, Optional[str]] = {} | ||||
Done Inline Actions — Same here. anlambert: Same here. | |||||
date_last_modified = state.date_last_modified | for vcs_type in ["git", "bzr"]: | ||||
attribute_name = f"{vcs_type}_date_last_modified" | |||||
d[attribute_name] = None | |||||
if hasattr(state, attribute_name): | |||||
date_last_modified = getattr(state, attribute_name) | |||||
if date_last_modified is not None: | if date_last_modified is not None: | ||||
d["date_last_modified"] = date_last_modified.isoformat() | d[attribute_name] = date_last_modified.isoformat() | ||||
return d | return d | ||||
def get_pages(self) -> Iterator[LaunchpadPageType]: | def get_pages(self) -> Iterator[LaunchpadPageType]: | ||||
""" | """ | ||||
Yields an iterator on all git repositories hosted on Launchpad sorted | Yields an iterator on all git/bzr repositories hosted on Launchpad sorted | ||||
by last modification date in ascending order. | by last modification date in ascending order. | ||||
""" | """ | ||||
launchpad = Launchpad.login_anonymously( | launchpad = Launchpad.login_anonymously( | ||||
"softwareheritage", "production", version="devel" | "softwareheritage", "production", version="devel" | ||||
) | ) | ||||
date_last_modified = None | |||||
if self.incremental: | if self.incremental: | ||||
date_last_modified = self.state.date_last_modified | self.date_last_modified = { | ||||
get_repos = launchpad.git_repositories.getRepositories | "git": self.state.git_date_last_modified, | ||||
yield get_repos( | "bzr": self.state.bzr_date_last_modified, | ||||
order_by="most neglected first", modified_since_date=date_last_modified | } | ||||
for vcs_type, get_vcs_fn in [ | |||||
("git", launchpad.git_repositories.getRepositories), | |||||
("bzr", launchpad.branches.getBranches), | |||||
]: | |||||
yield vcs_type, get_vcs_fn( | |||||
order_by="most neglected first", | |||||
modified_since_date=self.date_last_modified[vcs_type], | |||||
) | ) | ||||
def get_origins_from_page(self, page: LaunchpadPageType) -> Iterator[ListedOrigin]: | def get_origins_from_page(self, page: LaunchpadPageType) -> Iterator[ListedOrigin]: | ||||
""" | """ | ||||
Iterate on all git repositories and yield ListedOrigin instances. | Iterate on all git repositories and yield ListedOrigin instances. | ||||
""" | """ | ||||
assert self.lister_obj.id is not None | assert self.lister_obj.id is not None | ||||
prev_origin_url = None | prev_origin_url: Dict[str, Optional[str]] = {"git": None, "bzr": None} | ||||
vcs_type, repos = page | |||||
Not Done Inline Actions — We can now remove the previous origin check, as @vsellier has fixed the duplicated origin insertion in the scheduler db in rDSCH0a6aac583adff2c55069c9da676ad95670e35708. anlambert: We can now remove the previous origin check, as @vsellier has fixed the duplicated origin insertion… | |||||
Done Inline Actions — I've amended D7196 with another commit which drops this as well. ardumont: I've amended D7196 with another commit which drops this as well. | |||||
for repo in page: | assert vcs_type in {"git", "bzr"} | ||||
origin_url = repo.git_https_url | for repo in repos: | ||||
origin_url = origin(vcs_type, repo) | |||||
# filter out origins with invalid URL or origin previously listed | # filter out origins with invalid URL or origin previously listed | ||||
# (last modified repository will be listed twice by launchpadlib) | # (last modified repository will be listed twice by launchpadlib) | ||||
if not origin_url.startswith("https://") or origin_url == prev_origin_url: | if ( | ||||
not origin_url.startswith("https://") | |||||
or origin_url == prev_origin_url[vcs_type] | |||||
): | |||||
continue | continue | ||||
last_update = repo.date_last_modified | last_update = repo.date_last_modified | ||||
self.date_last_modified = last_update | self.date_last_modified[vcs_type] = last_update | ||||
logger.debug("Found origin %s last updated on %s", origin_url, last_update) | logger.debug( | ||||
"Found origin %s with type %s last updated on %s", | |||||
origin_url, | |||||
vcs_type, | |||||
last_update, | |||||
) | |||||
prev_origin_url = origin_url | prev_origin_url[vcs_type] = origin_url | ||||
yield ListedOrigin( | yield ListedOrigin( | ||||
lister_id=self.lister_obj.id, | lister_id=self.lister_obj.id, | ||||
visit_type="git", | visit_type=vcs_type, | ||||
url=origin_url, | url=origin_url, | ||||
last_update=last_update, | last_update=last_update, | ||||
) | ) | ||||
def finalize(self) -> None: | def finalize(self) -> None: | ||||
if self.date_last_modified is None: | git_date_last_modified = self.date_last_modified["git"] | ||||
bzr_date_last_modified = self.date_last_modified["bzr"] | |||||
if git_date_last_modified is None and bzr_date_last_modified is None: | |||||
return | return | ||||
if self.incremental and ( | if self.incremental and ( | ||||
self.state.date_last_modified is None | self.state.git_date_last_modified is None | ||||
or self.date_last_modified > self.state.date_last_modified | or ( | ||||
git_date_last_modified is not None | |||||
and git_date_last_modified > self.state.git_date_last_modified | |||||
) | |||||
): | |||||
self.state.git_date_last_modified = git_date_last_modified | |||||
if self.incremental and ( | |||||
self.state.bzr_date_last_modified is None | |||||
or ( | |||||
bzr_date_last_modified is not None | |||||
and bzr_date_last_modified > self.state.bzr_date_last_modified | |||||
) | |||||
): | ): | ||||
self.state.date_last_modified = self.date_last_modified | self.state.bzr_date_last_modified = self.date_last_modified["bzr"] | ||||
self.updated = True | self.updated = True |
That means either I'll reset the state in the scheduling db or I'll alter the data when deploying this.