Changeset View
Standalone View
swh/lister/cgit/lister.py
# Copyright (C) 2019-2021 The Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import logging
from typing import Iterator, List, Optional
from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup
import requests

from swh.lister import USER_AGENT
from swh.lister.pattern import StatelessLister
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
# Module-level logger, named after this module per the stdlib convention.
logger = logging.getLogger(__name__)

# A "page" for this lister: the list of repository page URLs found on one
# index page of the cgit web UI.
Repositories = List[str]
class CGitLister(StatelessLister[Repositories]):
    """Lister class for CGit repositories.

    This lister will retrieve the list of published git repositories by
    parsing the HTML page(s) of the index retrieved at `url`.

    For each found git repository, a query is made at the given url found
    in this index to gather published "Clone" URLs to be used as origin
    URL for that git repo.

    If several "Clone" urls are provided, prefer the http/https one, if
    any, otherwise fallback to the first one.
    """

    LISTER_NAME = "cgit"

    def __init__(
        self,
        scheduler: SchedulerInterface,
        url: Optional[str] = None,
        instance: Optional[str] = None,
    ):
        """Lister class for CGit repositories.

        Args:
            url: main URL of the CGit instance, i.e. url of the index
                of published git repositories on this instance.
            instance: Name of cgit instance. Defaults to url's hostname
                if unset.
        """
        if not instance:
            instance = urlparse(url).hostname
        super().__init__(
            scheduler=scheduler, credentials=None, url=url, instance=instance,
        )

        self.session = requests.Session()
        self.session.headers.update(
            {"Accept": "application/html", "User-Agent": USER_AGENT}
        )

    def _get_and_parse(self, url: str) -> BeautifulSoup:
        """Get the given url and parse the retrieved HTML using BeautifulSoup

        Raises:
            requests.HTTPError: if the server replies with an error status,
                so a site change or outage fails loudly instead of being
                silently parsed as an empty page.
        """
        response = self.session.get(url)
        response.raise_for_status()
        return BeautifulSoup(response.text, features="html.parser")

    def get_pages(self) -> Iterator[Repositories]:
        """Generate git 'project' URLs found on the current CGit server

        Yields one list of repository-page URLs per index page, following
        the cgit pager until no next page is found.
        """
        next_page: Optional[str] = self.url
        while next_page:
            bs_idx = self._get_and_parse(next_page)

            page_results = []
            # repository rows are the un-classed <tr> entries of the index table
            for tr in bs_idx.find("div", {"class": "content"}).find_all(
                "tr", {"class": ""}
            ):
                page_results.append(urljoin(self.url, tr.find("a")["href"]))

            yield page_results

            try:
                pager = bs_idx.find("ul", {"class": "pager"})
                current_page = pager.find("a", {"class": "current"})
                if current_page:
                    # the next page link is the one right after the current page
                    next_page = current_page.parent.next_sibling.a["href"]
                    next_page = urljoin(self.url, next_page)
                else:
                    # Bug fix: a pager with no "current" anchor previously left
                    # next_page unchanged, re-fetching the same page forever.
                    next_page = None
            except (AttributeError, KeyError):
                # no pager, or no next page
                next_page = None

    def get_origins_from_page(
        self, repositories: Repositories
    ) -> Iterator[ListedOrigin]:
        """Convert a page of cgit repositories into a list of ListedOrigins."""
        assert self.lister_obj.id is not None

        for repository_url in repositories:
            origin_url = self._get_origin_from_repository_url(repository_url)
            if not origin_url:
                # repository page exposed no clone URL; skip it
                continue

            yield ListedOrigin(
                lister_id=self.lister_obj.id,
                url=origin_url,
                visit_type="git",
                # TODO (T2988): compute last_update from the "Idle" column
                # of the cgit index page.
                last_update=None,
            )

    def _get_origin_from_repository_url(self, repository_url: str) -> Optional[str]:
        """Extract the git url from the repository page

        Returns:
            The clone URL advertised by the repository page, preferring an
            http/https one, or None if the page advertises no clone URL.
        """
        bs = self._get_and_parse(repository_url)

        # origin urls are listed on the repository page
        # TODO check if forcing https is better or not ?
        # <link rel='vcs-git' href='git://...' title='...'/>
        # <link rel='vcs-git' href='http://...' title='...'/>
        # <link rel='vcs-git' href='https://...' title='...'/>
        urls = [x["href"] for x in bs.find_all("a", {"rel": "vcs-git"})]
        if not urls:
            return None

        # look for the http/https url, if any, and use it as origin_url
        for url in urls:
            if urlparse(url).scheme in ("http", "https"):
                origin_url = url
                break
        else:
            # otherwise, choose the first one
            origin_url = urls[0]

        return origin_url
missing types for url and instance parameters