diff --git a/swh/lister/aur/lister.py b/swh/lister/aur/lister.py
index 2e20395..778a848 100644
--- a/swh/lister/aur/lister.py
+++ b/swh/lister/aur/lister.py
@@ -1,147 +1,156 @@
 # Copyright (C) 2022 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import datetime
 import logging
 from typing import Any, Dict, Iterator, List, Optional
 
 import requests
 
 from swh.scheduler.interface import SchedulerInterface
 from swh.scheduler.model import ListedOrigin
 
 from ..pattern import CredentialsType, StatelessLister
 
 logger = logging.getLogger(__name__)
 
 # Aliasing the page results returned by `get_pages` method from the lister.
 AurListerPage = Dict[str, Any]
 
 
 class AurLister(StatelessLister[AurListerPage]):
     """List Arch User Repository (AUR) origins.
 
     Given an url (used as a base url, default is 'https://aur.archlinux.org'),
     download a 'packages-meta-v1.json.gz' which contains a json file listing all
     existing packages definitions.
 
     Each entry describes the latest released version of a package. The origin url
     for a package is built using 'pkgname' and corresponds to a git repository.
 
     An rpc api exists but it is recommended to save bandwidth so it's not used. See
     https://lists.archlinux.org/pipermail/aur-general/2021-November/036659.html
     for more on this.
     """
 
     LISTER_NAME = "aur"
     VISIT_TYPE = "aur"
     INSTANCE = "aur"
 
     BASE_URL = "https://aur.archlinux.org"
     DEFAULT_PACKAGES_INDEX_URL = "{base_url}/packages-meta-v1.json.gz"
     PACKAGE_VCS_URL_PATTERN = "{base_url}/{pkgname}.git"
     PACKAGE_SNAPSHOT_URL_PATTERN = "{base_url}/cgit/aur.git/snapshot/{pkgname}.tar.gz"
     ORIGIN_URL_PATTERN = "{base_url}/packages/{pkgname}"
 
     def __init__(
         self,
         scheduler: SchedulerInterface,
         credentials: Optional[CredentialsType] = None,
     ):
         super().__init__(
             scheduler=scheduler,
             credentials=credentials,
             instance=self.INSTANCE,
             url=self.BASE_URL,
         )
 
     def download_packages_index(self) -> List[Dict[str, Any]]:
         """Build an url based on self.DEFAULT_PACKAGES_INDEX_URL format string,
         and download the archive to self.DESTINATION_PATH
 
         Returns:
             a directory Path where the archive has been downloaded to.
         """
         url = self.DEFAULT_PACKAGES_INDEX_URL.format(base_url=self.url)
         return requests.get(url).json()
 
     def get_pages(self) -> Iterator[AurListerPage]:
         """Yield an iterator which returns 'page'
 
         Each page corresponds to a package with a 'version', an 'url' for a Git
         repository, a 'project_url' which represents the upstream project url and
         a canonical 'snapshot_url' from which a tar.gz archive of the package can
         be downloaded.
         """
         packages = self.download_packages_index()
         logger.debug("Found %s AUR packages in aur_index", len(packages))
 
         for package in packages:
             # Exclude lines where Name differs from PackageBase as they represents
             # split package and they don't have resolvable snapshots url
             if package["Name"] == package["PackageBase"]:
                 logger.debug("Processing AUR package %s", package["Name"])
                 pkgname = package["PackageBase"]
                 version = package["Version"]
                 project_url = package["URL"]
                 last_modified = datetime.datetime.fromtimestamp(
                     float(package["LastModified"]), tz=datetime.timezone.utc
                 ).isoformat()
 
                 yield {
                     "pkgname": pkgname,
                     "version": version,
                     "url": self.ORIGIN_URL_PATTERN.format(
                         base_url=self.BASE_URL, pkgname=pkgname
                     ),
                     "git_url": self.PACKAGE_VCS_URL_PATTERN.format(
                         base_url=self.BASE_URL, pkgname=pkgname
                     ),
                     "snapshot_url": self.PACKAGE_SNAPSHOT_URL_PATTERN.format(
                         base_url=self.BASE_URL, pkgname=pkgname
                     ),
                     "project_url": project_url,
                     "last_modified": last_modified,
                 }
 
     def get_origins_from_page(self, origin: AurListerPage) -> Iterator[ListedOrigin]:
-        """Iterate on all pages and yield ListedOrigin instances.
-        It uses the vcs (Git) url as an origin and adds `artifacts` and `aur_metadata`
-        entries to 'extra_loader_arguments'.
+        """Yield two ListedOrigin instances for the given page.
+
+        The first one has the 'aur' visit type, uses the package page url as origin
+        and adds `artifacts` and `aur_metadata` entries to 'extra_loader_arguments'.
+        The second one has the 'git' visit type and uses the vcs (Git) url as origin.
 
         `artifacts` describe the file to download and `aur_metadata` store some
         metadata that can be useful for the loader.
         """
         assert self.lister_obj.id is not None
 
         last_update = datetime.datetime.fromisoformat(origin["last_modified"])
         filename = origin["snapshot_url"].split("/")[-1]
 
         artifacts = [
             {
                 "filename": filename,
                 "url": origin["snapshot_url"],
                 "version": origin["version"],
             }
         ]
         aur_metadata = [
             {
                 "version": origin["version"],
                 "project_url": origin["project_url"],
                 "last_update": origin["last_modified"],
                 "pkgname": origin["pkgname"],
             }
         ]
 
         yield ListedOrigin(
             lister_id=self.lister_obj.id,
             visit_type=self.VISIT_TYPE,
             url=origin["url"],
             last_update=last_update,
             extra_loader_arguments={
                 "artifacts": artifacts,
                 "aur_metadata": aur_metadata,
             },
         )
+
+        yield ListedOrigin(
+            lister_id=self.lister_obj.id,
+            visit_type="git",
+            url=origin["git_url"],
+            last_update=last_update,
+        )
diff --git a/swh/lister/aur/tests/test_lister.py b/swh/lister/aur/tests/test_lister.py
index be36337..7b67d4a 100644
--- a/swh/lister/aur/tests/test_lister.py
+++ b/swh/lister/aur/tests/test_lister.py
@@ -1,136 +1,143 @@
 # Copyright (C) 2022 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import gzip
 import json
 import os
 
 from swh.lister.aur.lister import AurLister
 
 expected_origins = [
     {
         "visit_type": "aur",
         "url": "https://aur.archlinux.org/packages/hg-evolve",
         "git_url": "https://aur.archlinux.org/hg-evolve.git",
         "extra_loader_arguments": {
             "artifacts": [
                 {
                     "filename": "hg-evolve.tar.gz",
                     "url": "https://aur.archlinux.org/cgit/aur.git/snapshot/hg-evolve.tar.gz",  # noqa: B950
                     "version": "10.5.1-1",
                 }
             ],
             "aur_metadata": [
                 {
                     "version": "10.5.1-1",
                     "project_url": "https://www.mercurial-scm.org/doc/evolution/",
                     "last_update": "2022-04-27T20:02:56+00:00",
                     "pkgname": "hg-evolve",
                 }
             ],
         },
     },
     {
         "visit_type": "aur",
         "url": "https://aur.archlinux.org/packages/ibus-git",
         "git_url": "https://aur.archlinux.org/ibus-git.git",
         "extra_loader_arguments": {
             "artifacts": [
                 {
                     "filename": "ibus-git.tar.gz",
                     "url": "https://aur.archlinux.org/cgit/aur.git/snapshot/ibus-git.tar.gz",  # noqa: B950
                     "version": "1.5.23+12+gef4c5c7e-1",
                 }
             ],
             "aur_metadata": [
                 {
                     "version": "1.5.23+12+gef4c5c7e-1",
                     "project_url": "https://github.com/ibus/ibus/wiki",
                     "last_update": "2021-02-08T06:12:11+00:00",
                     "pkgname": "ibus-git",
                 }
             ],
         },
     },
     {
         "visit_type": "aur",
         "url": "https://aur.archlinux.org/packages/libervia-web-hg",
         "git_url": "https://aur.archlinux.org/libervia-web-hg.git",
         "extra_loader_arguments": {
             "artifacts": [
                 {
                     "filename": "libervia-web-hg.tar.gz",
                     "url": "https://aur.archlinux.org/cgit/aur.git/snapshot/libervia-web-hg.tar.gz",  # noqa: B950
                     "version": "0.9.0.r1492.3a34d78f2717-1",
                 }
             ],
             "aur_metadata": [
                 {
                     "version": "0.9.0.r1492.3a34d78f2717-1",
                     "project_url": "http://salut-a-toi.org/",
                     "last_update": "2022-02-26T15:30:58+00:00",
                     "pkgname": "libervia-web-hg",
                 }
             ],
         },
     },
     {
         "visit_type": "aur",
         "url": "https://aur.archlinux.org/packages/tealdeer-git",
         "git_url": "https://aur.archlinux.org/tealdeer-git.git",
         "extra_loader_arguments": {
             "artifacts": [
                 {
                     "filename": "tealdeer-git.tar.gz",
                     "url": "https://aur.archlinux.org/cgit/aur.git/snapshot/tealdeer-git.tar.gz",  # noqa: B950
                     "version": "r255.30b7c5f-1",
                 }
             ],
             "aur_metadata": [
                 {
                     "version": "r255.30b7c5f-1",
                     "project_url": "https://github.com/dbrgn/tealdeer",
                     "last_update": "2020-09-04T20:36:52+00:00",
                     "pkgname": "tealdeer-git",
                 }
             ],
         },
     },
 ]
 
 
 def test_aur_lister(datadir, swh_scheduler, requests_mock):
     lister = AurLister(scheduler=swh_scheduler)
 
     packages_index_filename = "packages-meta-v1.json.gz"
 
     # simulate requests behavior: gzip and deflate transfer-encodings are automatically decoded
     with gzip.open(os.path.join(datadir, packages_index_filename), "rb") as f:
         requests_mock.get(
             f"{lister.BASE_URL}/{packages_index_filename}", json=json.loads(f.read())
         )
 
     res = lister.run()
 
     assert res.pages == 4
-    assert res.origins == 4
+    assert res.origins == 8
 
     scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
 
+    aur_origins = [origin for origin in scheduler_origins if origin.visit_type == "aur"]
+    git_origins = [origin for origin in scheduler_origins if origin.visit_type == "git"]
+
     assert [
         (
             scheduled.visit_type,
             scheduled.url,
             scheduled.extra_loader_arguments["artifacts"],
         )
-        for scheduled in sorted(scheduler_origins, key=lambda scheduled: scheduled.url)
+        for scheduled in sorted(aur_origins, key=lambda scheduled: scheduled.url)
     ] == [
         (
             "aur",
             expected["url"],
             expected["extra_loader_arguments"]["artifacts"],
         )
         for expected in sorted(expected_origins, key=lambda expected: expected["url"])
     ]
+
+    assert {origin.url for origin in git_origins} == {
+        origin["git_url"] for origin in expected_origins
+    }