diff --git a/setup.py b/setup.py
--- a/setup.py
+++ b/setup.py
@@ -58,6 +58,7 @@
         lister.arch=swh.lister.arch:register
         lister.aur=swh.lister.aur:register
         lister.bitbucket=swh.lister.bitbucket:register
+        lister.bower=swh.lister.bower:register
         lister.cgit=swh.lister.cgit:register
         lister.cran=swh.lister.cran:register
         lister.crates=swh.lister.crates:register
diff --git a/swh/lister/bower/__init__.py b/swh/lister/bower/__init__.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/bower/__init__.py
@@ -0,0 +1,82 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+"""
+Bower lister
+============
+
+The `Bower`_ lister lists origins from its package registry `registry.bower.io`_.
+
+Bower is a tool to manage Javascript packages.
+
+The registry provides an `http api`_ from which the lister retrieves package names
+and urls.
+
+As of August 2022 `registry.bower.io`_ lists 71028 package names.
+
+Note that even if the project is still maintained (security fixes, no new features),
+it has been recommended since 2018 to stop using it and prefer Yarn as a replacement.
+
+Origins retrieving strategy
+---------------------------
+
+To get a list of all package names we call the `https://registry.bower.io/packages`
+endpoint. There is no other way for discovery (no archive index, no database dump,
+no dvcs repository).
+
+Page listing
+------------
+
+There is only one page, which lists all origin urls.
+
+Origins from page
+-----------------
+
+The lister yields all origin urls from that single page, which is a list of package
+names and urls. Origin urls correspond to Git repository urls.
+Bower is supposed to support Svn repositories too, but out of +/- 71000 urls only 35
+were found that may not be Git repositories.
+
+Running tests
+-------------
+
+Activate the virtualenv and run from within swh-lister directory::
+
+   pytest -s -vv --log-cli-level=DEBUG swh/lister/bower/tests
+
+Testing with Docker
+-------------------
+
+Change directory to swh/docker then launch the docker environment::
+
+   docker-compose up -d
+
+Then connect to the lister::
+
+   docker exec -it docker_swh-lister_1 bash
+
+And run the lister (The output of this listing results in “oneshot” tasks in the scheduler)::
+
+   swh lister run -l bower
+
+.. _Bower: https://bower.io
+.. _registry.bower.io: https://registry.bower.io
+.. _http api: https://registry.bower.io/packages
+"""
+
+
+def register():
+    """Entry-point hook referenced as ``lister.bower`` in setup.py.
+
+    Returns:
+        A dict with the lister class under ``"lister"`` and the names of its
+        task modules under ``"task_modules"``.
+    """
+    # The lister module is only imported once the entry point is called.
+    from .lister import BowerLister
+
+    task_modules = [f"{__name__}.tasks"]
+    return {"lister": BowerLister, "task_modules": task_modules}
diff --git a/swh/lister/bower/lister.py b/swh/lister/bower/lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/bower/lister.py
@@ -0,0 +1,91 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+import logging
+from typing import Any, Dict, Iterator, List, Optional
+
+import requests
+from tenacity.before_sleep import before_sleep_log
+
+from swh.lister.utils import throttling_retry
+from swh.scheduler.interface import SchedulerInterface
+from swh.scheduler.model import ListedOrigin
+
+from .. import USER_AGENT
+from ..pattern import CredentialsType, StatelessLister
+
+logger = logging.getLogger(__name__)
+
+# Aliasing the page results returned by `get_pages` method from the lister.
+BowerListerPage = List[Dict[str, str]]
+
+
+class BowerLister(StatelessLister[BowerListerPage]):
+    """List Bower (Javascript package manager) origins."""
+
+    LISTER_NAME = "bower"
+    # Registry entries point at Git repositories, so origins are listed as "git"
+    VISIT_TYPE = "git"
+    INSTANCE = "bower"
+
+    API_URL = "https://registry.bower.io/packages"
+    REQUEST_TIMEOUT = 60  # seconds, so a stalled registry cannot hang the listing
+
+    def __init__(
+        self,
+        scheduler: SchedulerInterface,
+        credentials: Optional[CredentialsType] = None,
+    ):
+        super().__init__(
+            scheduler=scheduler,
+            credentials=credentials,
+            instance=self.INSTANCE,
+            url=self.API_URL,
+        )
+        self.session = requests.Session()
+        self.session.headers.update(
+            {
+                "Accept": "application/json",
+                "User-Agent": USER_AGENT,
+            }
+        )
+
+    @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
+    def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response:
+        """Fetch ``url``; log any non-200 status and raise on HTTP error statuses."""
+        logger.info("Fetching URL %s with params %s", url, params)
+
+        response = self.session.get(url, params=params, timeout=self.REQUEST_TIMEOUT)
+        if response.status_code != 200:
+            logger.warning(
+                "Unexpected HTTP status code %s on %s: %s",
+                response.status_code,
+                response.url,
+                response.content,
+            )
+        response.raise_for_status()
+
+        return response
+
+    def get_pages(self) -> Iterator[BowerListerPage]:
+        """Yield the single page that lists every package of the registry.
+
+        It uses the api endpoint provided by `https://registry.bower.io/packages`
+        to get a list of package names with an origin url that corresponds to a
+        Git repository.
+        """
+        response = self.page_request(url=self.url, params={})
+        yield response.json()
+
+    def get_origins_from_page(self, page: BowerListerPage) -> Iterator[ListedOrigin]:
+        """Yield one ListedOrigin for each entry of ``page``, built from its url."""
+        assert self.lister_obj.id is not None
+
+        for entry in page:
+            yield ListedOrigin(
+                lister_id=self.lister_obj.id,
+                visit_type=self.VISIT_TYPE,
+                url=entry["url"],
+                last_update=None,
+            )
diff --git a/swh/lister/bower/tasks.py b/swh/lister/bower/tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/bower/tasks.py
@@ -0,0 +1,19 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from celery import shared_task
+
+from swh.lister.bower.lister import BowerLister
+
+
+@shared_task(name=__name__ + ".BowerListerTask")
+def list_bower(**lister_args):
+    """Lister task for Bower (Javascript package manager) registry"""
+    return BowerLister.from_configfile(**lister_args).run().dict()
+
+
+@shared_task(name=__name__ + ".ping")
+def _ping():
+    return "OK"
diff --git a/swh/lister/bower/tests/__init__.py b/swh/lister/bower/tests/__init__.py
new file mode 100644
diff --git a/swh/lister/bower/tests/data/https_registry.bower.io/packages b/swh/lister/bower/tests/data/https_registry.bower.io/packages
new file mode 100644
--- /dev/null
+++ b/swh/lister/bower/tests/data/https_registry.bower.io/packages
@@ -0,0 +1,14 @@
+[
+  {
+    "name": "font-awesome",
+    "url": "https://github.com/FortAwesome/Font-Awesome.git"
+  },
+  {
+    "name": "redux",
+    "url": "https://github.com/reactjs/redux.git"
+  },
+  {
+    "name": "vue",
+    "url": "https://github.com/vuejs/vue.git"
+  }
+]
diff --git a/swh/lister/bower/tests/test_lister.py b/swh/lister/bower/tests/test_lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/bower/tests/test_lister.py
@@ -0,0 +1,37 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+from swh.lister.bower.lister import BowerLister
+
+expected_origins = [
+    {"name": "font-awesome", "url": "https://github.com/FortAwesome/Font-Awesome.git"},
+    {"name": "redux", "url": "https://github.com/reactjs/redux.git"},
+    {"name": "vue", "url": "https://github.com/vuejs/vue.git"},
+]
+
+
+def test_bower_lister(datadir, requests_mock_datadir, swh_scheduler):
+    lister = BowerLister(scheduler=swh_scheduler)
+    res = lister.run()
+
+    assert res.pages == 1
+    assert res.origins == len(expected_origins)
+
+    scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
+
+    assert len(scheduler_origins) == len(expected_origins)
+
+    assert {
+        (
+            scheduled.visit_type,
+            scheduled.url,
+        )
+        for scheduled in scheduler_origins
+    } == {
+        (
+            "git",
+            expected["url"],
+        )
+        for expected in expected_origins
+    }
diff --git a/swh/lister/bower/tests/test_tasks.py b/swh/lister/bower/tests/test_tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/bower/tests/test_tasks.py
@@ -0,0 +1,31 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.lister.pattern import ListerStats
+
+
+def test_bower_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker):
+    res = swh_scheduler_celery_app.send_task("swh.lister.bower.tasks.ping")
+    assert res
+    res.wait()
+    assert res.successful()
+    assert res.result == "OK"
+
+
+def test_bower_lister(swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker):
+    # setup the mocked BowerLister
+    lister = mocker.patch("swh.lister.bower.tasks.BowerLister")
+    lister.from_configfile.return_value = lister
+    stats = ListerStats(pages=42, origins=42)
+    lister.run.return_value = stats
+
+    res = swh_scheduler_celery_app.send_task("swh.lister.bower.tasks.BowerListerTask")
+    assert res
+    res.wait()
+    assert res.successful()
+    assert res.result == stats.dict()
+
+    lister.from_configfile.assert_called_once_with()
+    lister.run.assert_called_once_with()