diff --git a/setup.py b/setup.py index 5bb77b6..2340cab 100755 --- a/setup.py +++ b/setup.py @@ -1,92 +1,93 @@ #!/usr/bin/env python3 # Copyright (C) 2015-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from io import open from os import path from setuptools import find_packages, setup here = path.abspath(path.dirname(__file__)) # Get the long description from the README file with open(path.join(here, "README.md"), encoding="utf-8") as f: long_description = f.read() def parse_requirements(name=None): if name: reqf = "requirements-%s.txt" % name else: reqf = "requirements.txt" requirements = [] if not path.exists(reqf): return requirements with open(reqf) as f: for line in f.readlines(): line = line.strip() if not line or line.startswith("#"): continue requirements.append(line) return requirements setup( name="swh.lister", description="Software Heritage lister", long_description=long_description, long_description_content_type="text/markdown", python_requires=">=3.7", author="Software Heritage developers", author_email="swh-devel@inria.fr", url="https://forge.softwareheritage.org/diffusion/DLSGH/", packages=find_packages(), install_requires=parse_requirements() + parse_requirements("swh"), tests_require=parse_requirements("test"), setup_requires=["setuptools-scm"], extras_require={"testing": parse_requirements("test")}, use_scm_version=True, include_package_data=True, entry_points=""" [swh.cli.subcommands] lister=swh.lister.cli [swh.workers] lister.arch=swh.lister.arch:register + lister.aur=swh.lister.aur:register lister.bitbucket=swh.lister.bitbucket:register lister.cgit=swh.lister.cgit:register lister.cran=swh.lister.cran:register lister.crates=swh.lister.crates:register lister.debian=swh.lister.debian:register lister.gitea=swh.lister.gitea:register 
lister.github=swh.lister.github:register lister.gitlab=swh.lister.gitlab:register lister.gnu=swh.lister.gnu:register lister.launchpad=swh.lister.launchpad:register lister.npm=swh.lister.npm:register lister.opam=swh.lister.opam:register lister.packagist=swh.lister.packagist:register lister.phabricator=swh.lister.phabricator:register lister.pypi=swh.lister.pypi:register lister.sourceforge=swh.lister.sourceforge:register lister.tuleap=swh.lister.tuleap:register lister.maven=swh.lister.maven:register lister.gogs=swh.lister.gogs:register """, classifiers=[ "Programming Language :: Python :: 3", "Intended Audience :: Developers", "License :: OSI Approved :: GNU General Public License v3 (GPLv3)", "Operating System :: OS Independent", "Development Status :: 5 - Production/Stable", ], project_urls={ "Bug Reports": "https://forge.softwareheritage.org/maniphest", "Funding": "https://www.softwareheritage.org/donate", "Source": "https://forge.softwareheritage.org/source/swh-lister", "Documentation": "https://docs.softwareheritage.org/devel/swh-lister/", }, ) diff --git a/swh/lister/aur/__init__.py b/swh/lister/aur/__init__.py new file mode 100644 index 0000000..d6db8a2 --- /dev/null +++ b/swh/lister/aur/__init__.py @@ -0,0 +1,135 @@ +# Copyright (C) 2022 the Software Heritage developers +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +""" +AUR (Arch User Repository) lister +================================= + +The AUR lister lists origins from `aur.archlinux.org`_, the Arch User Repository. +For each package, there is a git repository; we use the git url as origin and the +snapshot url as the artifact for the loader to download. + +Each git repository consists of a directory (whose name corresponds to the package name), +and at least two files, .SRCINFO and PKGBUILD which are recipes for building the package. + +Each package has a version, the latest one. 
There aren't any archives of previous versions, +so the lister will always list one version per package. + +As of August 2022 `aur.archlinux.org`_ lists 84438 packages. Please note that this number +is the total of `regular`_ and `split`_ packages. +We will archive `regular` and `split` packages but only their `pkgbase` because that is +the only one that actually has source code. +The number of packages is 78554 after removing the split ones. + +Origins retrieving strategy +--------------------------- + +An rpc api exists but it is recommended to save bandwidth so it's not used. See +`New AUR Metadata Archives`_ for more on this topic. + +To get an index of all AUR existing packages we download a `packages-meta-v1.json.gz`_ +which contains a json file listing all existing packages definitions. + +Each entry describes the latest released version of a package. The origin url +for a package is built using `pkgbase` and corresponds to a git repository. + +Note that we list only standard packages (when pkgbase equals pkgname), not the ones +belonging to split packages. + +It takes only a couple of minutes to download the 7 MB index archive and parse its +content. + +Page listing +------------ + +Each page is related to one package. As it's not possible to get all previous +versions, it will always return one line. + +Each page corresponds to a package with a `version`, an `url` for a Git +repository, a `project_url` which represents the upstream project url and +a canonical `snapshot_url` from which a tar.gz archive of the package can +be downloaded. + +The data schema for each line is: + +* **pkgname**: Package name +* **version**: Package version +* **url**: Git repository url for a package +* **snapshot_url**: Package download url +* **project_url**: Upstream project url if any +* **last_modified**: ISO 8601 last update date + +Origins from page +----------------- + +The lister yields one origin per page. 
+The origin url corresponds to the git url of a package, for example ``https://aur.archlinux.org/{package}.git``. + +Additionally we add some data to "extra_loader_arguments": + +* **artifacts**: Represent data about the AUR package snapshot to download, + following :ref:`original-artifacts-json specification <original-artifacts-json>` +* **aur_metadata**: To store all other interesting attributes that do not belong to artifacts. + +Origin data example:: + + { + "visit_type": "aur", + "url": "https://aur.archlinux.org/hg-evolve.git", + "extra_loader_arguments": { + "artifacts": [ + { + "filename": "hg-evolve.tar.gz", + "url": "https://aur.archlinux.org/cgit/aur.git/snapshot/hg-evolve.tar.gz", # noqa: B950 + "version": "10.5.1-1", + } + ], + "aur_metadata": [ + { + "version": "10.5.1-1", + "project_url": "https://www.mercurial-scm.org/doc/evolution/", + "last_update": "2022-04-27T20:02:56+00:00", + "pkgname": "hg-evolve", + } + ], + }, + +Running tests +------------- + +Activate the virtualenv and run from within swh-lister directory:: + + pytest -s -vv --log-cli-level=DEBUG swh/lister/aur/tests + +Testing with Docker +------------------- + +Change directory to swh/docker then launch the docker environment:: + + docker-compose up -d + +Then connect to the lister:: + + docker exec -it docker_swh-lister_1 bash + +And run the lister (The output of this listing results in “oneshot” tasks in the scheduler):: + + swh lister run -l aur + +.. _aur.archlinux.org: https://aur.archlinux.org +.. _New AUR Metadata Archives: https://lists.archlinux.org/pipermail/aur-general/2021-November/036659.html +.. _packages-meta-v1.json.gz: https://aur.archlinux.org/packages-meta-v1.json.gz +.. _regular: https://wiki.archlinux.org/title/PKGBUILD#Package_name +.. 
_split: https://man.archlinux.org/man/PKGBUILD.5#PACKAGE_SPLITTING +""" + + +def register(): + from .lister import AurLister + + return { + "lister": AurLister, + "task_modules": ["%s.tasks" % __name__], + } diff --git a/swh/lister/aur/lister.py b/swh/lister/aur/lister.py new file mode 100644 index 0000000..47586ce --- /dev/null +++ b/swh/lister/aur/lister.py @@ -0,0 +1,174 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information +import datetime +import gzip +import json +import logging +from pathlib import Path +import shutil +from typing import Any, Dict, Iterator, Optional + +import requests + +from swh.scheduler.interface import SchedulerInterface +from swh.scheduler.model import ListedOrigin + +from ..pattern import CredentialsType, StatelessLister + +logger = logging.getLogger(__name__) + +# Aliasing the page results returned by `get_pages` method from the lister. +AurListerPage = Dict[str, Any] + + +class AurLister(StatelessLister[AurListerPage]): + """List Arch User Repository (AUR) origins. + + Given an url (used as a base url, default is 'https://aur.archlinux.org'), + download a 'packages-meta-v1.json.gz' which contains a json file listing all + existing packages definitions. + + Each entry describes the latest released version of a package. The origin url + for a package is built using 'pkgname' and corresponds to a git repository. + + An rpc api exists but it is recommended to save bandwidth so it's not used. See + https://lists.archlinux.org/pipermail/aur-general/2021-November/036659.html + for more on this. 
+ """ + + LISTER_NAME = "aur" + VISIT_TYPE = "aur" + INSTANCE = "aur" + + BASE_URL = "https://aur.archlinux.org" + DEFAULT_PACKAGES_INDEX_URL = "{base_url}/packages-meta-v1.json.gz" + PACKAGE_VCS_URL_PATTERN = "{base_url}/{pkgname}.git" + PACKAGE_SNAPSHOT_URL_PATTERN = "{base_url}/cgit/aur.git/snapshot/{pkgname}.tar.gz" + + DESTINATION_PATH = Path("/tmp/aur_archive") + + def __init__( + self, + scheduler: SchedulerInterface, + credentials: Optional[CredentialsType] = None, + ): + super().__init__( + scheduler=scheduler, + credentials=credentials, + instance=self.INSTANCE, + url=self.BASE_URL, + ) + + def download_index_archive(self) -> Path: + """Build an url based on self.DEFAULT_PACKAGES_INDEX_URL format string, + and download the archive to self.DESTINATION_PATH + + Returns: + a directory Path where the archive has been downloaded to. + """ + url = self.DEFAULT_PACKAGES_INDEX_URL.format(base_url=self.url) + filename = url.split("/")[-1] + destination = Path(self.DESTINATION_PATH, filename) + + if not Path(self.DESTINATION_PATH).exists(): + Path(self.DESTINATION_PATH).mkdir() + + response = requests.get(url, stream=True) + destination.write_bytes(response.raw.read()) + assert destination.exists() + + return destination + + def get_pages(self) -> Iterator[AurListerPage]: + """Yield an iterator which returns 'page' + + Each page corresponds to a package with a 'version', an 'url' for a Git + repository, a 'project_url' which represents the upstream project url and + a canonical 'snapshot_url' from which a tar.gz archive of the package can + be downloaded. 
+ """ + index = self.download_index_archive() + + with gzip.open(index, "rb") as f: + assert f.readable() + file_content = f.read() + packages = json.loads(file_content) + + assert packages + + counter: int = 0 + for package in packages: + # Exclude lines where Name differs from PackageBase as they represents + # split package and they don't have resolvable snapshots url + if package["Name"] == package["PackageBase"]: + pkgname = package["PackageBase"] + version = package["Version"] + project_url = package["URL"] + last_modified = datetime.datetime.fromtimestamp( + float(package["LastModified"]), tz=datetime.timezone.utc + ).isoformat() + counter += 1 + yield { + "pkgname": pkgname, + "version": version, + "url": self.PACKAGE_VCS_URL_PATTERN.format( + base_url=self.BASE_URL, pkgname=pkgname + ), + "snapshot_url": self.PACKAGE_SNAPSHOT_URL_PATTERN.format( + base_url=self.BASE_URL, pkgname=pkgname + ), + "project_url": project_url, + "last_modified": last_modified, + } + logger.debug("Found %s AUR packages in aur_index", counter) + + def get_origins_from_page(self, origin: AurListerPage) -> Iterator[ListedOrigin]: + """Iterate on all pages and yield ListedOrigin instances. + It uses the vcs (Git) url as an origin and adds `artifacts` and `aur_metadata` + entries to 'extra_loader_arguments'. + + `artifacts` describe the file to download and `aur_metadata` store some + metadata that can be useful for the loader. 
+ """ + assert self.lister_obj.id is not None + + url = origin["url"] + last_update = datetime.datetime.fromisoformat(origin["last_modified"]) + filename = origin["snapshot_url"].split("/")[-1] + + artifacts = [ + { + "filename": filename, + "url": origin["snapshot_url"], + "version": origin["version"], + } + ] + aur_metadata = [ + { + "version": origin["version"], + "project_url": origin["project_url"], + "last_update": origin["last_modified"], + "pkgname": origin["pkgname"], + } + ] + + yield ListedOrigin( + lister_id=self.lister_obj.id, + visit_type=self.VISIT_TYPE, + url=url, + last_update=last_update, + extra_loader_arguments={ + "artifacts": artifacts, + "aur_metadata": aur_metadata, + }, + ) + + def finalize(self) -> None: + # Cleanup by removing the repository directory + if self.DESTINATION_PATH.exists(): + shutil.rmtree(self.DESTINATION_PATH) + logger.debug( + "Successfully removed %s directory", str(self.DESTINATION_PATH) + ) diff --git a/swh/lister/aur/tasks.py b/swh/lister/aur/tasks.py new file mode 100644 index 0000000..52de9db --- /dev/null +++ b/swh/lister/aur/tasks.py @@ -0,0 +1,19 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from celery import shared_task + +from swh.lister.aur.lister import AurLister + + +@shared_task(name=__name__ + ".AurListerTask") +def list_aur(**lister_args): + """Lister task for Arch User Repository (AUR)""" + return AurLister.from_configfile(**lister_args).run().dict() + + +@shared_task(name=__name__ + ".ping") +def _ping(): + return "OK" diff --git a/swh/lister/aur/tests/__init__.py b/swh/lister/aur/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/swh/lister/aur/tests/data/fake_aur_packages.sh b/swh/lister/aur/tests/data/fake_aur_packages.sh new file mode 100755 index 0000000..26ad1e3 --- 
/dev/null +++ b/swh/lister/aur/tests/data/fake_aur_packages.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash + +# Script to generate packages-meta-v1.json.gz +# files and fake http responses for https_aur.archlinux.org +# For tests purposes only + +set -euo pipefail + +# files and directories +mkdir https_aur.archlinux.org + +mkdir -p tmp_dir/archives/ +cd tmp_dir/archives/ + +echo -e '''[ +{"ID":787300,"Name":"tealdeer-git","PackageBaseID":110159,"PackageBase":"tealdeer-git","Version":"r255.30b7c5f-1","Description":"A fast tldr client in Rust.","URL":"https://github.com/dbrgn/tealdeer","NumVotes":11,"Popularity":0.009683,"OutOfDate":null,"Maintainer":"dbrgn","FirstSubmitted":1460795753,"LastModified":1599251812,"URLPath":"/cgit/aur.git/snapshot/tealdeer-git.tar.gz"}, +{"ID":860370,"Name":"ibus-git","PackageBaseID":163059,"PackageBase":"ibus-git","Version":"1.5.23+12+gef4c5c7e-1","Description":"Next Generation Input Bus for Linux","URL":"https://github.com/ibus/ibus/wiki","NumVotes":1,"Popularity":0.989573,"OutOfDate":null,"Maintainer":"tallero","FirstSubmitted":1612764731,"LastModified":1612764731,"URLPath":"/cgit/aur.git/snapshot/ibus-git.tar.gz"}, +{"ID":1043337,"Name":"libervia-web-hg","PackageBaseID":170485,"PackageBase":"libervia-web-hg","Version":"0.9.0.r1492.3a34d78f2717-1","Description":"Salut à Toi, multi-frontends multi-purposes XMPP client (Web interface)","URL":"http://salut-a-toi.org/","NumVotes":0,"Popularity":0.0,"OutOfDate":null,"Maintainer":"jnanar","FirstSubmitted":1630224837,"LastModified":1645889458,"URLPath":"/cgit/aur.git/snapshot/libervia-web-hg.tar.gz"}, +{"ID":1072642,"Name":"hg-evolve","PackageBaseID":135047,"PackageBase":"hg-evolve","Version":"10.5.1-1","Description":"Flexible evolution of Mercurial history","URL":"https://www.mercurial-scm.org/doc/evolution/","NumVotes":6,"Popularity":0.003887,"OutOfDate":null,"Maintainer":"damien-43","FirstSubmitted":1534190432,"LastModified":1651089776,"URLPath":"/cgit/aur.git/snapshot/hg-evolve.tar.gz"} 
+]''' > packages-meta-v1.json + +# Gzip archive +gzip -c packages-meta-v1.json > ../../https_aur.archlinux.org/packages-meta-v1.json.gz + +# Clean up removing tmp_dir +cd ../../ +rm -rf tmp_dir/ diff --git a/swh/lister/aur/tests/data/https_aur.archlinux.org/packages-meta-v1.json.gz b/swh/lister/aur/tests/data/https_aur.archlinux.org/packages-meta-v1.json.gz new file mode 100644 index 0000000..56b8241 Binary files /dev/null and b/swh/lister/aur/tests/data/https_aur.archlinux.org/packages-meta-v1.json.gz differ diff --git a/swh/lister/aur/tests/test_lister.py b/swh/lister/aur/tests/test_lister.py new file mode 100644 index 0000000..c403dad --- /dev/null +++ b/swh/lister/aur/tests/test_lister.py @@ -0,0 +1,131 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information +from swh.lister.aur.lister import AurLister + +expected_origins = [ + { + "visit_type": "aur", + "url": "https://aur.archlinux.org/hg-evolve.git", + "extra_loader_arguments": { + "artifacts": [ + { + "filename": "hg-evolve.tar.gz", + "url": "https://aur.archlinux.org/cgit/aur.git/snapshot/hg-evolve.tar.gz", # noqa: B950 + "version": "10.5.1-1", + } + ], + "aur_metadata": [ + { + "version": "10.5.1-1", + "project_url": "https://www.mercurial-scm.org/doc/evolution/", + "last_update": "2022-04-27T20:02:56+00:00", + "pkgname": "hg-evolve", + } + ], + }, + }, + { + "visit_type": "aur", + "url": "https://aur.archlinux.org/ibus-git.git", + "extra_loader_arguments": { + "artifacts": [ + { + "filename": "ibus-git.tar.gz", + "url": "https://aur.archlinux.org/cgit/aur.git/snapshot/ibus-git.tar.gz", # noqa: B950 + "version": "1.5.23+12+gef4c5c7e-1", + } + ], + "aur_metadata": [ + { + "version": "1.5.23+12+gef4c5c7e-1", + "project_url": "https://github.com/ibus/ibus/wiki", + "last_update": "2021-02-08T06:12:11+00:00", + 
"pkgname": "ibus-git", + } + ], + }, + }, + { + "visit_type": "aur", + "url": "https://aur.archlinux.org/libervia-web-hg.git", + "extra_loader_arguments": { + "artifacts": [ + { + "filename": "libervia-web-hg.tar.gz", + "url": "https://aur.archlinux.org/cgit/aur.git/snapshot/libervia-web-hg.tar.gz", # noqa: B950 + "version": "0.9.0.r1492.3a34d78f2717-1", + } + ], + "aur_metadata": [ + { + "version": "0.9.0.r1492.3a34d78f2717-1", + "project_url": "http://salut-a-toi.org/", + "last_update": "2022-02-26T15:30:58+00:00", + "pkgname": "libervia-web-hg", + } + ], + }, + }, + { + "visit_type": "aur", + "url": "https://aur.archlinux.org/tealdeer-git.git", + "extra_loader_arguments": { + "artifacts": [ + { + "filename": "tealdeer-git.tar.gz", + "url": "https://aur.archlinux.org/cgit/aur.git/snapshot/tealdeer-git.tar.gz", # noqa: B950 + "version": "r255.30b7c5f-1", + } + ], + "aur_metadata": [ + { + "version": "r255.30b7c5f-1", + "project_url": "https://github.com/dbrgn/tealdeer", + "last_update": "2020-09-04T20:36:52+00:00", + "pkgname": "tealdeer-git", + } + ], + }, + }, +] + + +def test_aur_lister(datadir, requests_mock_datadir, swh_scheduler): + lister = AurLister(scheduler=swh_scheduler) + res = lister.run() + + assert res.pages == 4 + assert res.origins == 4 + + scheduler_origins_sorted = sorted( + swh_scheduler.get_listed_origins(lister.lister_obj.id).results, + key=lambda x: x.url, + ) + expected_origins_sorted = sorted(expected_origins, key=lambda x: x.get("url")) + + assert len(scheduler_origins_sorted) == len(expected_origins_sorted) + + assert [ + ( + scheduled.visit_type, + scheduled.url, + scheduled.extra_loader_arguments.get("artifacts"), + ) + for scheduled in scheduler_origins_sorted + ] == [ + ( + "aur", + expected.get("url"), + expected.get("extra_loader_arguments").get("artifacts"), + ) + for expected in expected_origins_sorted + ] + + +def test_aur_lister_directory_cleanup(datadir, requests_mock_datadir, swh_scheduler): + lister = 
AurLister(scheduler=swh_scheduler) + lister.run() + # Repository directory should not exists after the lister runs + assert not lister.DESTINATION_PATH.exists() diff --git a/swh/lister/aur/tests/test_tasks.py b/swh/lister/aur/tests/test_tasks.py new file mode 100644 index 0000000..44e72d1 --- /dev/null +++ b/swh/lister/aur/tests/test_tasks.py @@ -0,0 +1,31 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from swh.lister.pattern import ListerStats + + +def test_aur_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker): + res = swh_scheduler_celery_app.send_task("swh.lister.aur.tasks.ping") + assert res + res.wait() + assert res.successful() + assert res.result == "OK" + + +def test_aur_lister(swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker): + # setup the mocked AurLister + lister = mocker.patch("swh.lister.aur.tasks.AurLister") + lister.from_configfile.return_value = lister + stats = ListerStats(pages=42, origins=42) + lister.run.return_value = stats + + res = swh_scheduler_celery_app.send_task("swh.lister.aur.tasks.AurListerTask") + assert res + res.wait() + assert res.successful() + assert res.result == stats.dict() + + lister.from_configfile.assert_called_once_with() + lister.run.assert_called_once_with()