diff --git a/swh/lister/cgit/__init__.py b/swh/lister/cgit/__init__.py index 50cc8b5..be99de4 100644 --- a/swh/lister/cgit/__init__.py +++ b/swh/lister/cgit/__init__.py @@ -1,14 +1,12 @@ -# Copyright (C) 2019 the Software Heritage developers +# Copyright (C) 2019-2021 The Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information def register(): from .lister import CGitLister - from .models import CGitModel return { - "models": [CGitModel], "lister": CGitLister, "task_modules": ["%s.tasks" % __name__], } diff --git a/swh/lister/cgit/lister.py b/swh/lister/cgit/lister.py index 6b8484d..8a92463 100644 --- a/swh/lister/cgit/lister.py +++ b/swh/lister/cgit/lister.py @@ -1,148 +1,133 @@ -# Copyright (C) 2019 the Software Heritage developers +# Copyright (C) 2019-2021 The Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import logging -import re -from typing import Any, Dict, Generator, Optional +from typing import Iterator, List, Optional from urllib.parse import urljoin, urlparse from bs4 import BeautifulSoup -from requests import Session -from requests.adapters import HTTPAdapter +import requests -from swh.core.utils import grouper from swh.lister import USER_AGENT -from swh.lister.core.lister_base import ListerBase - -from .models import CGitModel +from swh.lister.pattern import StatelessLister +from swh.scheduler.interface import SchedulerInterface +from swh.scheduler.model import ListedOrigin logger = logging.getLogger(__name__) +Repositories = List[str] + -class CGitLister(ListerBase): +class CGitLister(StatelessLister[Repositories]): """Lister class for CGit repositories. This lister will retrieve the list of published git repositories by parsing the HTML page(s) of the index retrieved at `url`. For each found git repository, a query is made at the given url found in this index to gather published "Clone" URLs to be used as origin URL for that git repo. If several "Clone" urls are provided, prefer the http/https one, if - any, otherwise fall bak to the first one. - - A loader task is created for each git repository:: - - Task: - Type: load-git - Policy: recurring - Args: - - - Example:: - - Task: - Type: load-git - Policy: recurring - Args: - 'https://git.savannah.gnu.org/git/elisp-es.git' + any, otherwise fallback to the first one. """ - MODEL = CGitModel - DEFAULT_URL = "https://git.savannah.gnu.org/cgit/" LISTER_NAME = "cgit" - url_prefix_present = True - def __init__(self, url=None, instance=None, override_config=None): + def __init__(self, scheduler: SchedulerInterface, url=None, instance=None): """Lister class for CGit repositories. Args: url (str): main URL of the CGit instance, i.e. url of the index of published git repositories on this instance. instance (str): Name of cgit instance. Defaults to url's hostname if unset. """ - super().__init__(override_config=override_config) - - if url is None: - url = self.config.get("url", self.DEFAULT_URL) - self.url = url - if not instance: instance = urlparse(url).hostname - self.instance = instance - self.session = Session() - self.session.mount(self.url, HTTPAdapter(max_retries=3)) - self.session.headers = { - "User-Agent": USER_AGENT, - } - - def run(self) -> Dict[str, str]: - status = "uneventful" - total = 0 - for repos in grouper(self.get_repos(), 10): - models = list(filter(None, (self.build_model(repo) for repo in repos))) - injected_repos = self.inject_repo_data_into_db(models) - self.schedule_missing_tasks(models, injected_repos) - self.db_session.commit() - total += len(injected_repos) - logger.debug("Scheduled %s tasks for %s", total, self.url) - status = "eventful" - - return {"status": status} - - def get_repos(self) -> Generator[str, None, None]: + + super().__init__( + scheduler=scheduler, credentials=None, url=url, instance=instance, + ) + + self.session = requests.Session() + self.session.headers.update( + {"Accept": "application/html", "User-Agent": USER_AGENT} + ) + + def _get_and_parse(self, url: str) -> BeautifulSoup: + """Get the given url and parse the retrieved HTML using BeautifulSoup""" + response = self.session.get(url) + response.raise_for_status() + return BeautifulSoup(response.text, features="html.parser") + + def get_pages(self) -> Iterator[Repositories]: """Generate git 'project' URLs found on the current CGit server """ - next_page = self.url + next_page: Optional[str] = self.url while next_page: - bs_idx = self.get_and_parse(next_page) + bs_idx = self._get_and_parse(next_page) + page_results = [] + for tr in bs_idx.find("div", {"class": "content"}).find_all( "tr", {"class": ""} ): - yield urljoin(self.url, tr.find("a")["href"]) + page_results.append(urljoin(self.url, tr.find("a")["href"])) + + yield page_results try: pager = bs_idx.find("ul", {"class": "pager"}) + current_page = pager.find("a", {"class": "current"}) if current_page: next_page = current_page.parent.next_sibling.a["href"] next_page = urljoin(self.url, next_page) except (AttributeError, KeyError): # no pager, or no next page next_page = None - def build_model(self, repo_url: str) -> Optional[Dict[str, Any]]: - """Given the URL of a git repo project page on a CGit server, - return the repo description (dict) suitable for insertion in the db. - """ - bs = self.get_and_parse(repo_url) + def get_origins_from_page( + self, repositories: Repositories + ) -> Iterator[ListedOrigin]: + """Convert a page of cgit repositories into a list of ListedOrigins.""" + assert self.lister_obj.id is not None + + for repository_url in repositories: + origin_url = self._get_origin_from_repository_url(repository_url) + if not origin_url: + continue + + yield ListedOrigin( + lister_id=self.lister_obj.id, + url=origin_url, + visit_type="git", + last_update=None, + ) + + def _get_origin_from_repository_url(self, repository_url: str) -> Optional[str]: + """Extract the git url from the repository page""" + bs = self._get_and_parse(repository_url) + + # origin urls are listed on the repository page + # TODO check if forcing https is better or not ? + # + # + # urls = [x["href"] for x in bs.find_all("a", {"rel": "vcs-git"})] if not urls: return None # look for the http/https url, if any, and use it as origin_url for url in urls: if urlparse(url).scheme in ("http", "https"): origin_url = url break else: # otherwise, choose the first one origin_url = urls[0] - - return { - "uid": repo_url, - "name": bs.find("a", title=re.compile(".+"))["title"], - "origin_type": "git", - "instance": self.instance, - "origin_url": origin_url, - } - - def get_and_parse(self, url: str) -> BeautifulSoup: - "Get the given url and parse the retrieved HTML using BeautifulSoup" - return BeautifulSoup(self.session.get(url).text, features="html.parser") + return origin_url diff --git a/swh/lister/cgit/models.py b/swh/lister/cgit/models.py deleted file mode 100644 index 61bc545..0000000 --- a/swh/lister/cgit/models.py +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright (C) 2019 the Software Heritage developers -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -from sqlalchemy import Column, String - -from ..core.models import ModelBase - - -class CGitModel(ModelBase): - """a CGit repository representation - - """ - - __tablename__ = "cgit_repo" - - uid = Column(String, primary_key=True) - instance = Column(String, index=True) diff --git a/swh/lister/cgit/tasks.py b/swh/lister/cgit/tasks.py index 2e41133..e6ef159 100644 --- a/swh/lister/cgit/tasks.py +++ b/swh/lister/cgit/tasks.py @@ -1,18 +1,21 @@ -# Copyright (C) 2019 the Software Heritage developers +# Copyright (C) 2019-2021 The Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +from typing import Dict, Optional + from celery import shared_task from .lister import CGitLister @shared_task(name=__name__ + ".CGitListerTask") -def list_cgit(**lister_args): +def list_cgit(url: str, instance: Optional[str] = None,) -> Dict[str, str]: """Lister task for CGit instances""" - return CGitLister(**lister_args).run() + lister = CGitLister.from_configfile(url=url, instance=instance) + return lister.run().dict() @shared_task(name=__name__ + ".ping") def _ping(): return "OK" diff --git a/swh/lister/cgit/tests/conftest.py b/swh/lister/cgit/tests/conftest.py deleted file mode 100644 index d299271..0000000 --- a/swh/lister/cgit/tests/conftest.py +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright (C) 2019-2020 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import pytest - - -@pytest.fixture -def lister_under_test(): - return "cgit" - - -@pytest.fixture -def lister_cgit(swh_lister): - for task_type in [ - { - "type": "load-git", - "description": "Load git repository", - "backend_name": "swh.loader.git.tasks.UpdateGitRepository", - "default_interval": "1 day", - }, - ]: - swh_lister.scheduler.create_task_type(task_type) - - return swh_lister diff --git a/swh/lister/cgit/tests/test_lister.py b/swh/lister/cgit/tests/test_lister.py index 197a1d7..83b393f 100644 --- a/swh/lister/cgit/tests/test_lister.py +++ b/swh/lister/cgit/tests/test_lister.py @@ -1,74 +1,68 @@ -# Copyright (C) 2019-2020 the Software Heritage developers +# Copyright (C) 2019-2021 The Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +from typing import List + from swh.lister import __version__ +from swh.lister.cgit.lister import CGitLister +from swh.lister.pattern import ListerStats -def test_lister_cgit_no_page(requests_mock_datadir, lister_cgit): - assert lister_cgit.url == "https://git.savannah.gnu.org/cgit/" +def test_lister_cgit_get_pages_one_page(requests_mock_datadir, swh_scheduler): + url = "https://git.savannah.gnu.org/cgit/" + lister_cgit = CGitLister(swh_scheduler, url=url) - repos = list(lister_cgit.get_repos()) - assert len(repos) == 977 + repos: List[List[str]] = list(lister_cgit.get_pages()) + flattened_repos = sum(repos, []) + assert len(flattened_repos) == 977 - assert repos[0] == "https://git.savannah.gnu.org/cgit/elisp-es.git/" + assert flattened_repos[0] == "https://git.savannah.gnu.org/cgit/elisp-es.git/" # note the url below is NOT a subpath of /cgit/ - assert repos[-1] == "https://git.savannah.gnu.org/path/to/yetris.git/" # noqa + assert ( + flattened_repos[-1] == "https://git.savannah.gnu.org/path/to/yetris.git/" + ) # noqa # note the url below is NOT on the same server - assert repos[-2] == "http://example.org/cgit/xstarcastle.git/" - - -def test_lister_cgit_model(requests_mock_datadir, lister_cgit): - repo = next(lister_cgit.get_repos()) + assert flattened_repos[-2] == "http://example.org/cgit/xstarcastle.git/" - model = lister_cgit.build_model(repo) - assert model == { - "uid": "https://git.savannah.gnu.org/cgit/elisp-es.git/", - "name": "elisp-es.git", - "origin_type": "git", - "instance": "git.savannah.gnu.org", - "origin_url": "https://git.savannah.gnu.org/git/elisp-es.git", - } +def test_lister_cgit_get_pages_with_pages(requests_mock_datadir, swh_scheduler): + url = "https://git.tizen/cgit/" + lister_cgit = CGitLister(swh_scheduler, url=url) -def test_lister_cgit_with_pages(requests_mock_datadir, lister_cgit): - lister_cgit.url = "https://git.tizen/cgit/" - - repos = list(lister_cgit.get_repos()) + repos: List[List[str]] = list(lister_cgit.get_pages()) + flattened_repos = sum(repos, []) # we should have 16 repos (listed on 3 pages) - assert len(repos) == 16 - + assert len(repos) == 3 + assert len(flattened_repos) == 16 -def test_lister_cgit_run(requests_mock_datadir, lister_cgit): - lister_cgit.url = "https://git.tizen/cgit/" - lister_cgit.run() - r = lister_cgit.scheduler.search_tasks(task_type="load-git") - assert len(r) == 16 +def test_lister_cgit_run(requests_mock_datadir, swh_scheduler): + """cgit lister supports pagination""" - for row in r: - assert row["type"] == "load-git" - # arguments check - args = row["arguments"]["args"] - assert len(args) == 0 + url = "https://git.tizen/cgit/" + lister_cgit = CGitLister(swh_scheduler, url=url) - # kwargs - kwargs = row["arguments"]["kwargs"] - assert len(kwargs) == 1 - url = kwargs["url"] - assert url.startswith("https://git.tizen") + stats = lister_cgit.run() - assert row["policy"] == "recurring" - assert row["priority"] is None + expected_nb_origins = 16 + assert stats == ListerStats(pages=3, origins=expected_nb_origins) + # test page parsing + scheduler_origins = swh_scheduler.get_listed_origins( + lister_cgit.lister_obj.id + ).origins + assert len(scheduler_origins) == expected_nb_origins -def test_lister_cgit_requests(requests_mock_datadir, lister_cgit): - lister_cgit.url = "https://git.tizen/cgit/" - lister_cgit.run() + # test listed repositories + for listed_origin in scheduler_origins: + assert listed_origin.visit_type == "git" + assert listed_origin.url.startswith("https://git.tizen") + # test user agent content assert len(requests_mock_datadir.request_history) != 0 for request in requests_mock_datadir.request_history: assert "User-Agent" in request.headers user_agent = request.headers["User-Agent"] assert "Software Heritage Lister" in user_agent assert __version__ in user_agent diff --git a/swh/lister/cgit/tests/test_tasks.py b/swh/lister/cgit/tests/test_tasks.py index c7d2fe3..17d34a9 100644 --- a/swh/lister/cgit/tests/test_tasks.py +++ b/swh/lister/cgit/tests/test_tasks.py @@ -1,33 +1,37 @@ -# Copyright (C) 2019-2020 The Software Heritage developers +# Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from unittest.mock import patch +from swh.lister.pattern import ListerStats -def test_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker): + +def test_cgit_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker): res = swh_scheduler_celery_app.send_task("swh.lister.cgit.tasks.ping") assert res res.wait() assert res.successful() assert res.result == "OK" @patch("swh.lister.cgit.tasks.CGitLister") -def test_lister(lister, swh_scheduler_celery_app, swh_scheduler_celery_worker): +def test_cgit_lister_task( + lister, swh_scheduler_celery_app, swh_scheduler_celery_worker +): # setup the mocked CGitLister - lister.return_value = lister - lister.run.return_value = None + lister.from_configfile.return_value = lister + lister.run.return_value = ListerStats(pages=10, origins=500) + + kwargs = dict(url="https://git.kernel.org/", instance="kernel") res = swh_scheduler_celery_app.send_task( - "swh.lister.cgit.tasks.CGitListerTask", - kwargs=dict(url="https://git.kernel.org/", instance="kernel"), + "swh.lister.cgit.tasks.CGitListerTask", kwargs=kwargs, ) assert res res.wait() assert res.successful() - lister.assert_called_once_with(url="https://git.kernel.org/", instance="kernel") - lister.db_last_index.assert_not_called() + lister.from_configfile.assert_called_once_with(**kwargs) lister.run.assert_called_once_with() diff --git a/swh/lister/tests/test_cli.py b/swh/lister/tests/test_cli.py index cf8de63..cc076bc 100644 --- a/swh/lister/tests/test_cli.py +++ b/swh/lister/tests/test_cli.py @@ -1,73 +1,41 @@ # Copyright (C) 2019-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import pytest from swh.lister.cli import SUPPORTED_LISTERS, get_lister from .test_utils import init_db lister_args = { "phabricator": { "instance": "softwareheritage", "url": "https://forge.softwareheritage.org/api/diffusion.repository.search", "api_token": "bogus", }, } def test_get_lister_wrong_input(): """Unsupported lister should raise""" with pytest.raises(ValueError) as e: get_lister("unknown", "db-url") assert "Invalid lister" in str(e.value) def test_get_lister(swh_scheduler_config): """Instantiating a supported lister should be ok """ db_url = init_db().url() for lister_name in SUPPORTED_LISTERS: lst = get_lister( lister_name, db_url, scheduler={"cls": "local", **swh_scheduler_config}, **lister_args.get(lister_name, {}), ) assert hasattr(lst, "run") - - -def test_get_lister_override(): - """Overriding the lister configuration should populate its config - - """ - db_url = init_db().url() - - listers = { - "cgit": "https://some.where/cgit", - } - - # check the override ends up defined in the lister - for lister_name, url in listers.items(): - lst = get_lister( - lister_name, db_url, url=url, priority="high", policy="oneshot" - ) - - assert lst.url == url - assert lst.config["priority"] == "high" - assert lst.config["policy"] == "oneshot" - - # check the default urls are used and not the override (since it's not - # passed) - for lister_name, url in listers.items(): - lst = get_lister(lister_name, db_url) - - # no override so this does not end up in lister's configuration - assert "url" not in lst.config - assert "priority" not in lst.config - assert "oneshot" not in lst.config - assert lst.url == lst.DEFAULT_URL