diff --git a/swh/lister/puppet/__init__.py b/swh/lister/puppet/__init__.py index e56cee6..3e5e28d 100644 --- a/swh/lister/puppet/__init__.py +++ b/swh/lister/puppet/__init__.py @@ -1,101 +1,108 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """ Puppet lister ============= The Puppet lister list origins from `Puppet Forge`_. Puppet Forge is a package manager for Puppet modules. As of September 2022 `Puppet Forge`_ list 6917 package names. Origins retrieving strategy --------------------------- To get a list of all package names we call an `http api endpoint`_ which have a `getModules`_ endpoint. It returns a paginated list of results and a `next` url. The api follow `OpenApi 3.0 specifications`. Page listing ------------ Each page returns a list of ``results`` which are raw data from api response. The results size is 100 as 100 is the maximum limit size allowed by the api. Origins from page ----------------- The lister yields one hundred origin url per page. Origin url is the html page corresponding to a package name on the forge, following this pattern:: "https://forge.puppet.com/modules/{owner}/{pkgname}" -For each origin `last_update`is set via the module "updated_at" value. +For each origin `last_update` is set via the module "updated_at" value. As the api also returns all existing versions for a package, we build an `artifacts` dict in `extra_loader_arguments` with the archive tarball corresponding to each existing versions. Example for ``file_concat`` module located at https://forge.puppet.com/modules/electrical/file_concat:: { - "artifacts": { - "1.0.0": { - "url": "https://forgeapi.puppet.com/v3/files/electrical-file_concat-1.0.0.tar.gz", # noqa: B950 - "version": "1.0.0", - "filename": "electrical-file_concat-1.0.0.tar.gz", - "last_update": "2015-04-09T12:03:13-07:00", - }, - "1.0.1": { + "artifacts": [ + { "url": "https://forgeapi.puppet.com/v3/files/electrical-file_concat-1.0.1.tar.gz", # noqa: B950 "version": "1.0.1", "filename": "electrical-file_concat-1.0.1.tar.gz", "last_update": "2015-04-17T01:03:46-07:00", + "checksums": { + "md5": "74901a89544134478c2dfde5efbb7f14", + "sha256": "15e973613ea038d8a4f60bafe2d678f88f53f3624c02df3157c0043f4a400de6", # noqa: B950 + }, + }, + { + "url": "https://forgeapi.puppet.com/v3/files/electrical-file_concat-1.0.0.tar.gz", # noqa: B950 + "version": "1.0.0", + "filename": "electrical-file_concat-1.0.0.tar.gz", + "last_update": "2015-04-09T12:03:13-07:00", + "checksums": { + "length": 13289, + }, }, - } + ], } Running tests ------------- Activate the virtualenv and run from within swh-lister directory:: pytest -s -vv --log-cli-level=DEBUG swh/lister/puppet/tests Testing with Docker ------------------- Change directory to swh/docker then launch the docker environment:: docker compose up -d Then schedule a Puppet listing task:: docker compose exec swh-scheduler swh scheduler task add -p oneshot list-puppet You can follow lister execution by displaying logs of swh-lister service:: docker compose logs -f swh-lister .. _Puppet Forge: https://forge.puppet.com/ .. _http api endpoint: https://forgeapi.puppet.com/ .. _getModules: https://forgeapi.puppet.com/#tag/Module-Operations/operation/getModules """ def register(): from .lister import PuppetLister return { "lister": PuppetLister, "task_modules": ["%s.tasks" % __name__], } diff --git a/swh/lister/puppet/lister.py b/swh/lister/puppet/lister.py index 4982e92..80ac3da 100644 --- a/swh/lister/puppet/lister.py +++ b/swh/lister/puppet/lister.py @@ -1,111 +1,113 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from datetime import datetime import logging from typing import Any, Dict, Iterator, List, Optional from urllib.parse import urljoin from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from ..pattern import CredentialsType, StatelessLister logger = logging.getLogger(__name__) # Aliasing the page results returned by `get_pages` method from the lister. PuppetListerPage = List[Dict[str, Any]] class PuppetLister(StatelessLister[PuppetListerPage]): """The Puppet lister list origins from 'Puppet Forge'""" LISTER_NAME = "puppet" VISIT_TYPE = "puppet" INSTANCE = "puppet" BASE_URL = "https://forgeapi.puppet.com/" def __init__( self, scheduler: SchedulerInterface, credentials: Optional[CredentialsType] = None, ): super().__init__( scheduler=scheduler, credentials=credentials, instance=self.INSTANCE, url=self.BASE_URL, ) def get_pages(self) -> Iterator[PuppetListerPage]: """Yield an iterator which returns 'page' It request the http api endpoint to get a paginated results of modules, and retrieve a `next` url. It ends when `next` json value is `null`. Open Api specification for getModules endpoint: https://forgeapi.puppet.com/#tag/Module-Operations/operation/getModules """ # limit = 100 is the max value for pagination limit: int = 100 response = self.http_request( f"{self.BASE_URL}v3/modules", params={"limit": limit} ) data: Dict[str, Any] = response.json() yield data["results"] while data["pagination"]["next"]: response = self.http_request( urljoin(self.BASE_URL, data["pagination"]["next"]) ) data = response.json() yield data["results"] def get_origins_from_page(self, page: PuppetListerPage) -> Iterator[ListedOrigin]: """Iterate on all pages and yield ListedOrigin instances.""" assert self.lister_obj.id is not None dt_parse_pattern = "%Y-%m-%d %H:%M:%S %z" for entry in page: last_update = datetime.strptime(entry["updated_at"], dt_parse_pattern) pkgname = entry["name"] owner = entry["owner"]["slug"] url = f"https://forge.puppet.com/modules/{owner}/{pkgname}" - artifacts = {} + artifacts = [] for release in entry["releases"]: # Build an artifact entry following original-artifacts-json specification # https://docs.softwareheritage.org/devel/swh-storage/extrinsic-metadata-specification.html#original-artifacts-json # noqa: B950 checksums = {} if release["version"] == entry["current_release"]["version"]: # checksums are only available for current release for checksum in ("md5", "sha256"): checksums[checksum] = entry["current_release"][ f"file_{checksum}" ] else: # use file length as basic content check instead checksums["length"] = release["file_size"] - artifacts[release["version"]] = { - "filename": release["file_uri"].split("/")[-1], - "url": urljoin(self.BASE_URL, release["file_uri"]), - "version": release["version"], - "last_update": datetime.strptime( - release["created_at"], dt_parse_pattern - ).isoformat(), - "checksums": checksums, - } + artifacts.append( + { + "filename": release["file_uri"].split("/")[-1], + "url": urljoin(self.BASE_URL, release["file_uri"]), + "version": release["version"], + "last_update": datetime.strptime( + release["created_at"], dt_parse_pattern + ).isoformat(), + "checksums": checksums, + } + ) yield ListedOrigin( lister_id=self.lister_obj.id, visit_type=self.VISIT_TYPE, url=url, last_update=last_update, extra_loader_arguments={"artifacts": artifacts}, ) diff --git a/swh/lister/puppet/tests/test_lister.py b/swh/lister/puppet/tests/test_lister.py index 5dbfd89..80e5a63 100644 --- a/swh/lister/puppet/tests/test_lister.py +++ b/swh/lister/puppet/tests/test_lister.py @@ -1,106 +1,120 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from swh.lister.puppet.lister import PuppetLister # flake8: noqa: B950 -expected_origins = { - "https://forge.puppet.com/modules/electrical/file_concat": { - "artifacts": { - "1.0.0": { - "url": "https://forgeapi.puppet.com/v3/files/electrical-file_concat-1.0.0.tar.gz", - "version": "1.0.0", - "filename": "electrical-file_concat-1.0.0.tar.gz", - "last_update": "2015-04-09T12:03:13-07:00", - "checksums": { - "length": 13289, - }, - }, - "1.0.1": { +expected_origins = [ + { + "url": "https://forge.puppet.com/modules/electrical/file_concat", + "artifacts": [ + { "url": "https://forgeapi.puppet.com/v3/files/electrical-file_concat-1.0.1.tar.gz", "version": "1.0.1", "filename": "electrical-file_concat-1.0.1.tar.gz", "last_update": "2015-04-17T01:03:46-07:00", "checksums": { "md5": "74901a89544134478c2dfde5efbb7f14", "sha256": "15e973613ea038d8a4f60bafe2d678f88f53f3624c02df3157c0043f4a400de6", }, }, - } - }, - "https://forge.puppet.com/modules/puppetlabs/puppetdb": { - "artifacts": { - "1.0.0": { - "url": "https://forgeapi.puppet.com/v3/files/puppetlabs-puppetdb-1.0.0.tar.gz", + { + "url": "https://forgeapi.puppet.com/v3/files/electrical-file_concat-1.0.0.tar.gz", "version": "1.0.0", - "filename": "puppetlabs-puppetdb-1.0.0.tar.gz", - "last_update": "2012-09-19T16:51:22-07:00", - "checksums": { - "length": 16336, - }, - }, - "7.9.0": { - "url": "https://forgeapi.puppet.com/v3/files/puppetlabs-puppetdb-7.9.0.tar.gz", - "version": "7.9.0", - "filename": "puppetlabs-puppetdb-7.9.0.tar.gz", - "last_update": "2021-06-24T07:48:54-07:00", + "filename": "electrical-file_concat-1.0.0.tar.gz", + "last_update": "2015-04-09T12:03:13-07:00", "checksums": { - "length": 42773, + "length": 13289, }, }, - "7.10.0": { + ], + }, + { + "url": "https://forge.puppet.com/modules/puppetlabs/puppetdb", + "artifacts": [ + { "url": "https://forgeapi.puppet.com/v3/files/puppetlabs-puppetdb-7.10.0.tar.gz", "version": "7.10.0", "filename": "puppetlabs-puppetdb-7.10.0.tar.gz", "last_update": "2021-12-16T14:57:46-08:00", "checksums": { "md5": "e91a2074ca8d94a8b3ff7f6c8bbf12bc", "sha256": "49b1a542fbd2a1378c16cb04809e0f88bf4f3e45979532294fb1f03f56c97fbb", }, }, - } - }, - "https://forge.puppet.com/modules/saz/memcached": { - "artifacts": { - "1.0.0": { - "url": "https://forgeapi.puppet.com/v3/files/saz-memcached-1.0.0.tar.gz", + { + "url": "https://forgeapi.puppet.com/v3/files/puppetlabs-puppetdb-7.9.0.tar.gz", + "version": "7.9.0", + "filename": "puppetlabs-puppetdb-7.9.0.tar.gz", + "last_update": "2021-06-24T07:48:54-07:00", + "checksums": { + "length": 42773, + }, + }, + { + "url": "https://forgeapi.puppet.com/v3/files/puppetlabs-puppetdb-1.0.0.tar.gz", "version": "1.0.0", - "filename": "saz-memcached-1.0.0.tar.gz", - "last_update": "2011-11-20T13:40:30-08:00", + "filename": "puppetlabs-puppetdb-1.0.0.tar.gz", + "last_update": "2012-09-19T16:51:22-07:00", "checksums": { - "length": 2472, + "length": 16336, }, }, - "8.1.0": { + ], + }, + { + "url": "https://forge.puppet.com/modules/saz/memcached", + "artifacts": [ + { "url": "https://forgeapi.puppet.com/v3/files/saz-memcached-8.1.0.tar.gz", "version": "8.1.0", "filename": "saz-memcached-8.1.0.tar.gz", "last_update": "2022-07-11T03:34:55-07:00", "checksums": { "md5": "aadf80fba5848909429eb002ee1927ea", "sha256": "883d6186e91c2c3fed13ae2009c3aa596657f6707b76f1f7efc6203c6e4ae986", }, }, - } + { + "url": "https://forgeapi.puppet.com/v3/files/saz-memcached-1.0.0.tar.gz", + "version": "1.0.0", + "filename": "saz-memcached-1.0.0.tar.gz", + "last_update": "2011-11-20T13:40:30-08:00", + "checksums": { + "length": 2472, + }, + }, + ], }, -} +] def test_puppet_lister(datadir, requests_mock_datadir, swh_scheduler): lister = PuppetLister(scheduler=swh_scheduler) res = lister.run() assert res.pages == 2 assert res.origins == 1 + 1 + 1 scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results assert len(scheduler_origins) == len(expected_origins) - for origin in scheduler_origins: - assert origin.visit_type == "puppet" - assert origin.url in expected_origins - assert origin.extra_loader_arguments == expected_origins[origin.url] + assert [ + ( + scheduled.visit_type, + scheduled.url, + scheduled.extra_loader_arguments["artifacts"], + ) + for scheduled in sorted(scheduler_origins, key=lambda scheduled: scheduled.url) + ] == [ + ( + "puppet", + expected["url"], + expected["artifacts"], + ) + for expected in sorted(expected_origins, key=lambda expected: expected["url"]) + ]