Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7123756
D8333.id30092.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
9 KB
Subscribers
None
D8333.id30092.diff
View Options
diff --git a/setup.py b/setup.py
--- a/setup.py
+++ b/setup.py
@@ -58,6 +58,7 @@
lister.arch=swh.lister.arch:register
lister.aur=swh.lister.aur:register
lister.bitbucket=swh.lister.bitbucket:register
+ lister.bower=swh.lister.bower:register
lister.cgit=swh.lister.cgit:register
lister.cran=swh.lister.cran:register
lister.crates=swh.lister.crates:register
diff --git a/swh/lister/bower/__init__.py b/swh/lister/bower/__init__.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/bower/__init__.py
@@ -0,0 +1,76 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+"""
+Bower lister
+============
+
+The `Bower`_ lister lists origins from its package registry `registry.bower.io`_.
+
+Bower is a tool to manage Javascript packages.
+
+The registry provides an `http api`_ from which the lister retrieves package names
+and urls.
+
+As of August 2022 `registry.bower.io`_ lists 71028 package names.
+
+Note that even though the project is still maintained (security fixes, no new features),
+it has been recommended since 2018 to stop using it and to prefer Yarn as a replacement.
+
+Origins retrieving strategy
+---------------------------
+
+To get a list of all package names we call the `https://registry.bower.io/packages` endpoint.
+There is no other way for discovery (no archive index, no database dump, no dvcs repository).
+
+Page listing
+------------
+
+There is only one page, which lists all origin urls.
+
+Origins from page
+-----------------
+
+The lister yields all origin urls from one page. The page is a list of package names and urls.
+Each origin url corresponds to a Git repository url.
+Bower is supposed to support Svn repositories too, but out of +/- 71000 urls I have only
+found 35 urls that may not be Git repositories.
+
+Running tests
+-------------
+
+Activate the virtualenv and run from within swh-lister directory::
+
+ pytest -s -vv --log-cli-level=DEBUG swh/lister/bower/tests
+
+Testing with Docker
+-------------------
+
+Change directory to swh/docker then launch the docker environment::
+
+ docker-compose up -d
+
+Then connect to the lister::
+
+ docker exec -it docker_swh-lister_1 bash
+
+And run the lister (The output of this listing results in “oneshot” tasks in the scheduler)::
+
+ swh lister run -l bower
+
+.. _Bower: https://bower.io
+.. _registry.bower.io: https://registry.bower.io
+.. _http api: https://registry.bower.io/packages
+"""
+
+
+def register():
+ from .lister import BowerLister
+
+ return {
+ "lister": BowerLister,
+ "task_modules": ["%s.tasks" % __name__],
+ }
diff --git a/swh/lister/bower/lister.py b/swh/lister/bower/lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/bower/lister.py
@@ -0,0 +1,91 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+import logging
+from typing import Any, Dict, Iterator, List, Optional
+
+import requests
+from tenacity.before_sleep import before_sleep_log
+
+from swh.lister.utils import throttling_retry
+from swh.scheduler.interface import SchedulerInterface
+from swh.scheduler.model import ListedOrigin
+
+from .. import USER_AGENT
+from ..pattern import CredentialsType, StatelessLister
+
+logger = logging.getLogger(__name__)
+
+# Type of one page yielded by the lister's `get_pages` method: a list of
+# dicts mapping string keys to string values.
+BowerListerPage = List[Dict[str, str]]
+
+
+class BowerLister(StatelessLister[BowerListerPage]):
+ """List Bower (Javascript package manager) origins."""
+
+ LISTER_NAME = "bower"
+ VISIT_TYPE = "bower"
+ INSTANCE = "bower"
+
+ API_URL = "https://registry.bower.io/packages"
+
+ def __init__(
+ self,
+ scheduler: SchedulerInterface,
+ credentials: Optional[CredentialsType] = None,
+ ):
+ super().__init__(
+ scheduler=scheduler,
+ credentials=credentials,
+ instance=self.INSTANCE,
+ url=self.API_URL,
+ )
+ self.session = requests.Session()
+ self.session.headers.update(
+ {
+ "Accept": "application/json",
+ "User-Agent": USER_AGENT,
+ }
+ )
+
+ @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
+ def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response:
+
+ logger.info("Fetching URL %s with params %s", url, params)
+
+ response = self.session.get(url, params=params)
+ if response.status_code != 200:
+ logger.warning(
+ "Unexpected HTTP status code %s on %s: %s",
+ response.status_code,
+ response.url,
+ response.content,
+ )
+ response.raise_for_status()
+
+ return response
+
+ def get_pages(self) -> Iterator[BowerListerPage]:
+ """Yield an iterator which returns 'page'
+
+ It uses the api endpoint provided by `https://registry.bower.io/packages`
+ to get a list of package names with an origin url that corresponds to Git
+ repository.
+
+ There is only one page that list all origins urls.
+ """
+ response = self.page_request(url=self.url, params={})
+ yield response.json()
+
+ def get_origins_from_page(self, page: BowerListerPage) -> Iterator[ListedOrigin]:
+ """Iterate on all pages and yield ListedOrigin instances."""
+ assert self.lister_obj.id is not None
+
+ for entry in page:
+ yield ListedOrigin(
+ lister_id=self.lister_obj.id,
+ visit_type=self.VISIT_TYPE,
+ url=entry["url"],
+ last_update=None,
+ )
diff --git a/swh/lister/bower/tasks.py b/swh/lister/bower/tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/bower/tasks.py
@@ -0,0 +1,19 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from celery import shared_task
+
+from swh.lister.bower.lister import BowerLister
+
+
+@shared_task(name=__name__ + ".BowerListerTask")
+def list_bower(**lister_args):
+ """Lister task for Bower (Javascript package manager) registry"""
+ return BowerLister.from_configfile(**lister_args).run().dict()
+
+
+@shared_task(name=__name__ + ".ping")
+def _ping():
+ """Trivial task that always returns ``"OK"``."""
+ return "OK"
diff --git a/swh/lister/bower/tests/__init__.py b/swh/lister/bower/tests/__init__.py
new file mode 100644
diff --git a/swh/lister/bower/tests/data/https_registry.bower.io/packages b/swh/lister/bower/tests/data/https_registry.bower.io/packages
new file mode 100644
--- /dev/null
+++ b/swh/lister/bower/tests/data/https_registry.bower.io/packages
@@ -0,0 +1,14 @@
+[
+ {
+ "name": "font-awesome",
+ "url": "https://github.com/FortAwesome/Font-Awesome.git"
+ },
+ {
+ "name": "redux",
+ "url": "https://github.com/reactjs/redux.git"
+ },
+ {
+ "name": "vue",
+ "url": "https://github.com/vuejs/vue.git"
+ }
+]
diff --git a/swh/lister/bower/tests/test_lister.py b/swh/lister/bower/tests/test_lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/bower/tests/test_lister.py
@@ -0,0 +1,37 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+from swh.lister.bower.lister import BowerLister
+
+# Same entries as the fixture file data/https_registry.bower.io/packages.
+expected_origins = [
+ {"name": "font-awesome", "url": "https://github.com/FortAwesome/Font-Awesome.git"},
+ {"name": "redux", "url": "https://github.com/reactjs/redux.git"},
+ {"name": "vue", "url": "https://github.com/vuejs/vue.git"},
+]
+
+
+def test_bower_lister(datadir, requests_mock_datadir, swh_scheduler):
+ lister = BowerLister(scheduler=swh_scheduler)
+ res = lister.run()
+
+ assert res.pages == 1
+ assert res.origins == 1 + 1 + 1
+
+ scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
+
+ assert len(scheduler_origins) == len(expected_origins)
+
+ assert {
+ (
+ scheduled.visit_type,
+ scheduled.url,
+ )
+ for scheduled in scheduler_origins
+ } == {
+ (
+ "bower",
+ expected["url"],
+ )
+ for expected in expected_origins
+ }
diff --git a/swh/lister/bower/tests/test_tasks.py b/swh/lister/bower/tests/test_tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/bower/tests/test_tasks.py
@@ -0,0 +1,31 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.lister.pattern import ListerStats
+
+
+def test_bower_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker):
+ res = swh_scheduler_celery_app.send_task("swh.lister.bower.tasks.ping")
+ assert res
+ res.wait()
+ assert res.successful()
+ assert res.result == "OK"
+
+
+def test_bower_lister(swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker):
+ # setup the mocked BowerLister
+ lister = mocker.patch("swh.lister.bower.tasks.BowerLister")
+ lister.from_configfile.return_value = lister
+ stats = ListerStats(pages=42, origins=42)
+ lister.run.return_value = stats
+
+ res = swh_scheduler_celery_app.send_task("swh.lister.bower.tasks.BowerListerTask")
+ assert res
+ res.wait()
+ assert res.successful()
+ assert res.result == stats.dict()
+
+ lister.from_configfile.assert_called_once_with()
+ lister.run.assert_called_once_with()
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Dec 19 2024, 11:59 PM (11 w, 3 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3217568
Attached To
D8333: Bower: List origins from registry.bower.io
Event Timeline
Log In to Comment