diff --git a/swh/lister/cgit/__init__.py b/swh/lister/cgit/__init__.py --- a/swh/lister/cgit/__init__.py +++ b/swh/lister/cgit/__init__.py @@ -1,14 +1,12 @@ -# Copyright (C) 2019 the Software Heritage developers +# Copyright (C) 2019-2021 The Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information def register(): from .lister import CGitLister - from .models import CGitModel return { - "models": [CGitModel], "lister": CGitLister, "task_modules": ["%s.tasks" % __name__], } diff --git a/swh/lister/cgit/lister.py b/swh/lister/cgit/lister.py --- a/swh/lister/cgit/lister.py +++ b/swh/lister/cgit/lister.py @@ -1,26 +1,25 @@ -# Copyright (C) 2019 the Software Heritage developers +# Copyright (C) 2019-2021 The Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import logging -import re -from typing import Any, Dict, Generator, Optional +from typing import Iterator, List, Optional from urllib.parse import urljoin, urlparse from bs4 import BeautifulSoup -from requests import Session -from requests.adapters import HTTPAdapter +import requests -from swh.core.utils import grouper from swh.lister import USER_AGENT -from swh.lister.core.lister_base import ListerBase - -from .models import CGitModel +from swh.lister.pattern import StatelessLister +from swh.scheduler.interface import SchedulerInterface +from swh.scheduler.model import ListedOrigin logger = logging.getLogger(__name__) +Repositories = List[str] + -class CGitLister(ListerBase): +class CGitLister(StatelessLister[Repositories]): """Lister class for CGit repositories. This lister will retrieve the list of published git repositories by @@ -31,31 +30,12 @@ URL for that git repo. If several "Clone" urls are provided, prefer the http/https one, if - any, otherwise fall bak to the first one. - - A loader task is created for each git repository:: - - Task: - Type: load-git - Policy: recurring - Args: - - - Example:: - - Task: - Type: load-git - Policy: recurring - Args: - 'https://git.savannah.gnu.org/git/elisp-es.git' + any, otherwise fallback to the first one. """ - MODEL = CGitModel - DEFAULT_URL = "https://git.savannah.gnu.org/cgit/" LISTER_NAME = "cgit" - url_prefix_present = True - def __init__(self, url=None, instance=None, override_config=None): + def __init__(self, scheduler: SchedulerInterface, url=None, instance=None): """Lister class for CGit repositories. Args: @@ -65,49 +45,43 @@ if unset. """ - super().__init__(override_config=override_config) - - if url is None: - url = self.config.get("url", self.DEFAULT_URL) - self.url = url - if not instance: instance = urlparse(url).hostname - self.instance = instance - self.session = Session() - self.session.mount(self.url, HTTPAdapter(max_retries=3)) - self.session.headers = { - "User-Agent": USER_AGENT, - } - - def run(self) -> Dict[str, str]: - status = "uneventful" - total = 0 - for repos in grouper(self.get_repos(), 10): - models = list(filter(None, (self.build_model(repo) for repo in repos))) - injected_repos = self.inject_repo_data_into_db(models) - self.schedule_missing_tasks(models, injected_repos) - self.db_session.commit() - total += len(injected_repos) - logger.debug("Scheduled %s tasks for %s", total, self.url) - status = "eventful" - - return {"status": status} - - def get_repos(self) -> Generator[str, None, None]: + + super().__init__( + scheduler=scheduler, credentials=None, url=url, instance=instance, + ) + + self.session = requests.Session() + self.session.headers.update( + {"Accept": "application/html", "User-Agent": USER_AGENT} + ) + + def _get_and_parse(self, url: str) -> BeautifulSoup: + """Get the given url and parse the retrieved HTML using BeautifulSoup""" + response = self.session.get(url) + response.raise_for_status() + return BeautifulSoup(response.text, features="html.parser") + + def get_pages(self) -> Iterator[Repositories]: """Generate git 'project' URLs found on the current CGit server """ - next_page = self.url + next_page: Optional[str] = self.url while next_page: - bs_idx = self.get_and_parse(next_page) + bs_idx = self._get_and_parse(next_page) + page_results = [] + for tr in bs_idx.find("div", {"class": "content"}).find_all( "tr", {"class": ""} ): - yield urljoin(self.url, tr.find("a")["href"]) + page_results.append(urljoin(self.url, tr.find("a")["href"])) + + yield page_results try: pager = bs_idx.find("ul", {"class": "pager"}) + current_page = pager.find("a", {"class": "current"}) if current_page: next_page = current_page.parent.next_sibling.a["href"] @@ -116,11 +90,33 @@ # no pager, or no next page next_page = None - def build_model(self, repo_url: str) -> Optional[Dict[str, Any]]: - """Given the URL of a git repo project page on a CGit server, - return the repo description (dict) suitable for insertion in the db. - """ - bs = self.get_and_parse(repo_url) + def get_origins_from_page( + self, repositories: Repositories + ) -> Iterator[ListedOrigin]: + """Convert a page of cgit repositories into a list of ListedOrigins.""" + assert self.lister_obj.id is not None + + for repository_url in repositories: + origin_url = self._get_origin_from_repository_url(repository_url) + if not origin_url: + continue + + yield ListedOrigin( + lister_id=self.lister_obj.id, + url=origin_url, + visit_type="git", + last_update=None, + ) + + def _get_origin_from_repository_url(self, repository_url: str) -> Optional[str]: + """Extract the git url from the repository page""" + bs = self._get_and_parse(repository_url) + + # origin urls are listed on the repository page + # TODO check if forcing https is better or not ? + # + # + # urls = [x["href"] for x in bs.find_all("a", {"rel": "vcs-git"})] if not urls: @@ -134,15 +130,4 @@ else: # otherwise, choose the first one origin_url = urls[0] - - return { - "uid": repo_url, - "name": bs.find("a", title=re.compile(".+"))["title"], - "origin_type": "git", - "instance": self.instance, - "origin_url": origin_url, - } - - def get_and_parse(self, url: str) -> BeautifulSoup: - "Get the given url and parse the retrieved HTML using BeautifulSoup" - return BeautifulSoup(self.session.get(url).text, features="html.parser") + return origin_url diff --git a/swh/lister/cgit/models.py b/swh/lister/cgit/models.py deleted file mode 100644 --- a/swh/lister/cgit/models.py +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright (C) 2019 the Software Heritage developers -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -from sqlalchemy import Column, String - -from ..core.models import ModelBase - - -class CGitModel(ModelBase): - """a CGit repository representation - - """ - - __tablename__ = "cgit_repo" - - uid = Column(String, primary_key=True) - instance = Column(String, index=True) diff --git a/swh/lister/cgit/tasks.py b/swh/lister/cgit/tasks.py --- a/swh/lister/cgit/tasks.py +++ b/swh/lister/cgit/tasks.py @@ -1,16 +1,19 @@ -# Copyright (C) 2019 the Software Heritage developers +# Copyright (C) 2019-2021 The Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +from typing import Dict, Optional + from celery import shared_task from .lister import CGitLister @shared_task(name=__name__ + ".CGitListerTask") -def list_cgit(**lister_args): +def list_cgit(url: str, instance: Optional[str] = None,) -> Dict[str, str]: """Lister task for CGit instances""" - return CGitLister(**lister_args).run() + lister = CGitLister.from_configfile(url=url, instance=instance) + return lister.run().dict() @shared_task(name=__name__ + ".ping") diff --git a/swh/lister/cgit/tests/conftest.py b/swh/lister/cgit/tests/conftest.py deleted file mode 100644 --- a/swh/lister/cgit/tests/conftest.py +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright (C) 2019-2020 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import pytest - - -@pytest.fixture -def lister_under_test(): - return "cgit" - - -@pytest.fixture -def lister_cgit(swh_lister): - for task_type in [ - { - "type": "load-git", - "description": "Load git repository", - "backend_name": "swh.loader.git.tasks.UpdateGitRepository", - "default_interval": "1 day", - }, - ]: - swh_lister.scheduler.create_task_type(task_type) - - return swh_lister diff --git a/swh/lister/cgit/tests/test_lister.py b/swh/lister/cgit/tests/test_lister.py --- a/swh/lister/cgit/tests/test_lister.py +++ b/swh/lister/cgit/tests/test_lister.py @@ -1,71 +1,65 @@ -# Copyright (C) 2019-2020 the Software Heritage developers +# Copyright (C) 2019-2021 The Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +from typing import List + from swh.lister import __version__ +from swh.lister.cgit.lister import CGitLister +from swh.lister.pattern import ListerStats -def test_lister_cgit_no_page(requests_mock_datadir, lister_cgit): - assert lister_cgit.url == "https://git.savannah.gnu.org/cgit/" +def test_lister_cgit_get_pages_one_page(requests_mock_datadir, swh_scheduler): + url = "https://git.savannah.gnu.org/cgit/" + lister_cgit = CGitLister(swh_scheduler, url=url) - repos = list(lister_cgit.get_repos()) - assert len(repos) == 977 + repos: List[List[str]] = list(lister_cgit.get_pages()) + flattened_repos = sum(repos, []) + assert len(flattened_repos) == 977 - assert repos[0] == "https://git.savannah.gnu.org/cgit/elisp-es.git/" + assert flattened_repos[0] == "https://git.savannah.gnu.org/cgit/elisp-es.git/" # note the url below is NOT a subpath of /cgit/ - assert repos[-1] == "https://git.savannah.gnu.org/path/to/yetris.git/" # noqa + assert ( + flattened_repos[-1] == "https://git.savannah.gnu.org/path/to/yetris.git/" + ) # noqa # note the url below is NOT on the same server - assert repos[-2] == "http://example.org/cgit/xstarcastle.git/" - - -def test_lister_cgit_model(requests_mock_datadir, lister_cgit): - repo = next(lister_cgit.get_repos()) + assert flattened_repos[-2] == "http://example.org/cgit/xstarcastle.git/" - model = lister_cgit.build_model(repo) - assert model == { - "uid": "https://git.savannah.gnu.org/cgit/elisp-es.git/", - "name": "elisp-es.git", - "origin_type": "git", - "instance": "git.savannah.gnu.org", - "origin_url": "https://git.savannah.gnu.org/git/elisp-es.git", - } +def test_lister_cgit_get_pages_with_pages(requests_mock_datadir, swh_scheduler): + url = "https://git.tizen/cgit/" + lister_cgit = CGitLister(swh_scheduler, url=url) -def test_lister_cgit_with_pages(requests_mock_datadir, lister_cgit): - lister_cgit.url = "https://git.tizen/cgit/" - - repos = list(lister_cgit.get_repos()) + repos: List[List[str]] = list(lister_cgit.get_pages()) + flattened_repos = sum(repos, []) # we should have 16 repos (listed on 3 pages) - assert len(repos) == 16 - + assert len(repos) == 3 + assert len(flattened_repos) == 16 -def test_lister_cgit_run(requests_mock_datadir, lister_cgit): - lister_cgit.url = "https://git.tizen/cgit/" - lister_cgit.run() - r = lister_cgit.scheduler.search_tasks(task_type="load-git") - assert len(r) == 16 +def test_lister_cgit_run(requests_mock_datadir, swh_scheduler): + """cgit lister supports pagination""" - for row in r: - assert row["type"] == "load-git" - # arguments check - args = row["arguments"]["args"] - assert len(args) == 0 + url = "https://git.tizen/cgit/" + lister_cgit = CGitLister(swh_scheduler, url=url) - # kwargs - kwargs = row["arguments"]["kwargs"] - assert len(kwargs) == 1 - url = kwargs["url"] - assert url.startswith("https://git.tizen") + stats = lister_cgit.run() - assert row["policy"] == "recurring" - assert row["priority"] is None + expected_nb_origins = 16 + assert stats == ListerStats(pages=3, origins=expected_nb_origins) + # test page parsing + scheduler_origins = swh_scheduler.get_listed_origins( + lister_cgit.lister_obj.id + ).origins + assert len(scheduler_origins) == expected_nb_origins -def test_lister_cgit_requests(requests_mock_datadir, lister_cgit): - lister_cgit.url = "https://git.tizen/cgit/" - lister_cgit.run() + # test listed repositories + for listed_origin in scheduler_origins: + assert listed_origin.visit_type == "git" + assert listed_origin.url.startswith("https://git.tizen") + # test user agent content assert len(requests_mock_datadir.request_history) != 0 for request in requests_mock_datadir.request_history: assert "User-Agent" in request.headers diff --git a/swh/lister/cgit/tests/test_tasks.py b/swh/lister/cgit/tests/test_tasks.py --- a/swh/lister/cgit/tests/test_tasks.py +++ b/swh/lister/cgit/tests/test_tasks.py @@ -1,12 +1,14 @@ -# Copyright (C) 2019-2020 The Software Heritage developers +# Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from unittest.mock import patch +from swh.lister.pattern import ListerStats -def test_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker): + +def test_cgit_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker): res = swh_scheduler_celery_app.send_task("swh.lister.cgit.tasks.ping") assert res res.wait() @@ -15,19 +17,21 @@ @patch("swh.lister.cgit.tasks.CGitLister") -def test_lister(lister, swh_scheduler_celery_app, swh_scheduler_celery_worker): +def test_cgit_lister_task( + lister, swh_scheduler_celery_app, swh_scheduler_celery_worker +): # setup the mocked CGitLister - lister.return_value = lister - lister.run.return_value = None + lister.from_configfile.return_value = lister + lister.run.return_value = ListerStats(pages=10, origins=500) + + kwargs = dict(url="https://git.kernel.org/", instance="kernel") res = swh_scheduler_celery_app.send_task( - "swh.lister.cgit.tasks.CGitListerTask", - kwargs=dict(url="https://git.kernel.org/", instance="kernel"), + "swh.lister.cgit.tasks.CGitListerTask", kwargs=kwargs, ) assert res res.wait() assert res.successful() - lister.assert_called_once_with(url="https://git.kernel.org/", instance="kernel") - lister.db_last_index.assert_not_called() + lister.from_configfile.assert_called_once_with(**kwargs) lister.run.assert_called_once_with() diff --git a/swh/lister/tests/test_cli.py b/swh/lister/tests/test_cli.py --- a/swh/lister/tests/test_cli.py +++ b/swh/lister/tests/test_cli.py @@ -39,35 +39,3 @@ **lister_args.get(lister_name, {}), ) assert hasattr(lst, "run") - - -def test_get_lister_override(): - """Overriding the lister configuration should populate its config - - """ - db_url = init_db().url() - - listers = { - "cgit": "https://some.where/cgit", - } - - # check the override ends up defined in the lister - for lister_name, url in listers.items(): - lst = get_lister( - lister_name, db_url, url=url, priority="high", policy="oneshot" - ) - - assert lst.url == url - assert lst.config["priority"] == "high" - assert lst.config["policy"] == "oneshot" - - # check the default urls are used and not the override (since it's not - # passed) - for lister_name, url in listers.items(): - lst = get_lister(lister_name, db_url) - - # no override so this does not end up in lister's configuration - assert "url" not in lst.config - assert "priority" not in lst.config - assert "oneshot" not in lst.config - assert lst.url == lst.DEFAULT_URL