diff --git a/setup.py b/setup.py --- a/setup.py +++ b/setup.py @@ -56,6 +56,7 @@ lister=swh.lister.cli [swh.workers] lister.arch=swh.lister.arch:register + lister.aur=swh.lister.aur:register lister.bitbucket=swh.lister.bitbucket:register lister.cgit=swh.lister.cgit:register lister.cran=swh.lister.cran:register diff --git a/swh/lister/aur/__init__.py b/swh/lister/aur/__init__.py new file mode 100644 --- /dev/null +++ b/swh/lister/aur/__init__.py @@ -0,0 +1,135 @@ +# Copyright (C) 2022 the Software Heritage developers +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +""" +AUR (Arch User Repository) lister +================================= + +The AUR lister lists origins from `aur.archlinux.org`_, the Arch User Repository. +For each package, there is a git repository; we use the git url as origin and the +snapshot url as the artifact for the loader to download. + +Each git repository consists of a directory (whose name corresponds to the package name), +and at least two files, .SRCINFO and PKGBUILD, which are recipes for building the package. + +Each package has a version, the latest one. There aren't any archives of previous versions, +so the lister will always list one version per package. + +As of August 2022 `aur.archlinux.org`_ lists 84438 packages. Please note that this number +is the total of `regular`_ and `split`_ packages. +We will archive `regular` and `split` packages but only their `pkgbase` because that is +the only one that actually has source code. +The number of packages is 78554 after removing the split ones. + +Origins retrieving strategy +--------------------------- + +An rpc api exists but it is recommended to save bandwidth so it's not used. See +`New AUR Metadata Archives`_ for more on this topic. + +To get an index of all existing AUR packages we download a `packages-meta-v1.json.gz`_ +which contains a json file listing all existing package definitions. 
+ +Each entry describes the latest released version of a package. The origin url +for a package is built using `pkgbase` and corresponds to a git repository. + +Note that we list only standard packages (when pkgbase equals pkgname), not the ones +belonging to split packages. + +It takes only a couple of minutes to download the 7 MB index archive and parse its +content. + +Page listing +------------ + +Each page is related to one package. As it's not possible to get all previous +versions, it will always return one line. + +Each page corresponds to a package with a `version`, a `url` for a Git +repository, a `project_url` which represents the upstream project url and +a canonical `snapshot_url` from which a tar.gz archive of the package can +be downloaded. + +The data schema for each line is: + +* **pkgname**: Package name +* **version**: Package version +* **url**: Git repository url for a package +* **snapshot_url**: Package download url +* **project_url**: Upstream project url if any +* **last_modified**: ISO 8601 last update date + +Origins from page +----------------- + +The lister yields one origin per page. +The origin url corresponds to the git url of a package, for example ``https://aur.archlinux.org/{package}.git``. + +Additionally, we add some data to "extra_loader_arguments": + +* **artifacts**: Represents data about the AUR package snapshot to download, + following :ref:`original-artifacts-json specification <original-artifacts-json>` +* **aur_metadata**: To store all other interesting attributes that do not belong to artifacts. 
+ +Origin data example:: + + { + "visit_type": "aur", + "url": "https://aur.archlinux.org/hg-evolve.git", + "extra_loader_arguments": { + "artifacts": [ + { + "filename": "hg-evolve.tar.gz", + "url": "https://aur.archlinux.org/cgit/aur.git/snapshot/hg-evolve.tar.gz", # noqa: B950 + "version": "10.5.1-1", + } + ], + "aur_metadata": [ + { + "version": "10.5.1-1", + "project_url": "https://www.mercurial-scm.org/doc/evolution/", + "last_update": "2022-04-27T20:02:56+00:00", + "pkgname": "hg-evolve", + } + ], + }, + } + +Running tests +------------- + +Activate the virtualenv and run from within the swh-lister directory:: + + pytest -s -vv --log-cli-level=DEBUG swh/lister/aur/tests + +Testing with Docker +------------------- + +Change directory to swh/docker then launch the docker environment:: + + docker-compose up -d + +Then connect to the lister:: + + docker exec -it docker_swh-lister_1 bash + +And run the lister (the output of this listing results in “oneshot” tasks in the scheduler):: + + swh lister run -l aur + +.. _aur.archlinux.org: https://aur.archlinux.org +.. _New AUR Metadata Archives: https://lists.archlinux.org/pipermail/aur-general/2021-November/036659.html +.. _packages-meta-v1.json.gz: https://aur.archlinux.org/packages-meta-v1.json.gz +.. _regular: https://wiki.archlinux.org/title/PKGBUILD#Package_name +.. 
# ---- swh/lister/aur/__init__.py (end of module) ----
# The module docstring of this file ends with the reST link target:
#   .. _split: https://man.archlinux.org/man/PKGBUILD.5#PACKAGE_SPLITTING


def register():
    """Describe the AUR lister for the ``swh.workers`` entry point.

    Returns:
        a dict holding the lister class under "lister" and the dotted names of
        the task modules under "task_modules".
    """
    # Local import: the lister module is only loaded when register() is called.
    from .lister import AurLister

    return {
        "lister": AurLister,
        "task_modules": ["%s.tasks" % __name__],
    }


# ---- swh/lister/aur/lister.py ----
# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import datetime
import gzip
import json
import logging
from pathlib import Path
import shutil
from typing import Any, Dict, Iterator, Optional

import requests

from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin

from ..pattern import CredentialsType, StatelessLister

logger = logging.getLogger(__name__)

# Aliasing the page results returned by `get_pages` method from the lister.
AurListerPage = Dict[str, Any]


class AurLister(StatelessLister[AurListerPage]):
    """List Arch User Repository (AUR) origins.

    Given a base url (default is 'https://aur.archlinux.org'), download
    'packages-meta-v1.json.gz', a gzipped json file listing all existing
    package definitions.

    Each entry describes the latest released version of a package. The origin url
    for a package is built using its 'PackageBase' name and corresponds to a git
    repository. Entries whose 'Name' differs from 'PackageBase' (split packages)
    are skipped.

    An rpc api exists but it is recommended to save bandwidth so it's not used. See
    https://lists.archlinux.org/pipermail/aur-general/2021-November/036659.html
    for more on this.
    """

    LISTER_NAME = "aur"
    VISIT_TYPE = "aur"
    INSTANCE = "aur"

    BASE_URL = "https://aur.archlinux.org"
    DEFAULT_PACKAGES_INDEX_URL = "{base_url}/packages-meta-v1.json.gz"
    PACKAGE_VCS_URL_PATTERN = "{base_url}/{pkgname}.git"
    PACKAGE_SNAPSHOT_URL_PATTERN = "{base_url}/cgit/aur.git/snapshot/{pkgname}.tar.gz"

    # NOTE(review): fixed, predictable location used by every instance and removed
    # as a whole by finalize(); two listers running at the same time on one host
    # would presumably step on each other. Left unchanged because it is a public
    # class attribute.
    DESTINATION_PATH = Path("/tmp/aur_archive")

    def __init__(
        self,
        scheduler: SchedulerInterface,
        credentials: Optional[CredentialsType] = None,
    ):
        """Set up the lister for the single AUR instance at self.BASE_URL.

        Args:
            scheduler: scheduler interface forwarded to the base lister
            credentials: optional credentials forwarded to the base lister
        """
        super().__init__(
            scheduler=scheduler,
            credentials=credentials,
            instance=self.INSTANCE,
            url=self.BASE_URL,
        )

    def download_index_archive(self) -> Path:
        """Download the packages index archive into self.DESTINATION_PATH.

        The url is built from the self.DEFAULT_PACKAGES_INDEX_URL format string
        and self.url. The archive is stored as-is (still gzipped) under the last
        path component of that url.

        Returns:
            the Path of the downloaded archive file.

        Raises:
            requests.HTTPError: if the server answers with an error status, so
                that an error page is never mistaken for the index.
        """
        url = self.DEFAULT_PACKAGES_INDEX_URL.format(base_url=self.url)
        filename = url.split("/")[-1]
        destination = Path(self.DESTINATION_PATH, filename)

        # Create the directory unconditionally: no check-then-create race, and
        # missing parents are not an error.
        Path(self.DESTINATION_PATH).mkdir(parents=True, exist_ok=True)

        # The context manager releases the streamed connection even on failure.
        with requests.get(url, stream=True) as response:
            response.raise_for_status()
            # Copy the raw (undecoded) byte stream chunk by chunk instead of
            # loading the whole archive in memory.
            with destination.open("wb") as archive:
                shutil.copyfileobj(response.raw, archive)

        return destination

    def get_pages(self) -> Iterator[AurListerPage]:
        """Yield one 'page' per regular (non split) package found in the index.

        Each page is a dict with the keys 'pkgname', 'version', 'url' (Git
        repository url), 'snapshot_url' (url of a tar.gz archive of the package),
        'project_url' (upstream project url, None when the index entry has no
        'URL' key) and 'last_modified' (ISO 8601 date, UTC).

        Raises:
            ValueError: if the downloaded index does not list any package.
        """
        index = self.download_index_archive()

        with gzip.open(index, "rb") as f:
            packages = json.load(f)

        # Explicit check rather than an assert, which is stripped under `-O`.
        if not packages:
            raise ValueError(f"No package found in AUR index {index}")

        counter: int = 0
        for package in packages:
            # Exclude lines where Name differs from PackageBase as they represent
            # split packages and they don't have resolvable snapshot urls
            if package["Name"] != package["PackageBase"]:
                continue

            pkgname = package["PackageBase"]
            last_modified = datetime.datetime.fromtimestamp(
                float(package["LastModified"]), tz=datetime.timezone.utc
            ).isoformat()
            counter += 1
            yield {
                "pkgname": pkgname,
                "version": package["Version"],
                # Same base url as the one the index was downloaded from
                # (presumably identical to BASE_URL, which __init__ passes as
                # `url` to the base class).
                "url": self.PACKAGE_VCS_URL_PATTERN.format(
                    base_url=self.url, pkgname=pkgname
                ),
                "snapshot_url": self.PACKAGE_SNAPSHOT_URL_PATTERN.format(
                    base_url=self.url, pkgname=pkgname
                ),
                "project_url": package.get("URL"),
                "last_modified": last_modified,
            }
        logger.debug("Found %s AUR packages in aur_index", counter)

    def get_origins_from_page(self, origin: AurListerPage) -> Iterator[ListedOrigin]:
        """Yield the single ListedOrigin described by one page.

        It uses the vcs (Git) url as an origin and adds `artifacts` and
        `aur_metadata` entries to 'extra_loader_arguments'.

        `artifacts` describes the file to download (filename, url, version) and
        `aur_metadata` stores version, project url, last update date and package
        name.

        Args:
            origin: a page as yielded by get_pages()
        """
        assert self.lister_obj.id is not None

        snapshot_url = origin["snapshot_url"]
        version = origin["version"]

        artifacts = [
            {
                # Last path component of the snapshot url, e.g. "<pkgname>.tar.gz"
                "filename": snapshot_url.split("/")[-1],
                "url": snapshot_url,
                "version": version,
            }
        ]
        aur_metadata = [
            {
                "version": version,
                "project_url": origin["project_url"],
                "last_update": origin["last_modified"],
                "pkgname": origin["pkgname"],
            }
        ]

        yield ListedOrigin(
            lister_id=self.lister_obj.id,
            visit_type=self.VISIT_TYPE,
            url=origin["url"],
            last_update=datetime.datetime.fromisoformat(origin["last_modified"]),
            extra_loader_arguments={
                "artifacts": artifacts,
                "aur_metadata": aur_metadata,
            },
        )

    def finalize(self) -> None:
        """Remove the download directory (self.DESTINATION_PATH) if it exists."""
        if self.DESTINATION_PATH.exists():
            shutil.rmtree(self.DESTINATION_PATH)
            logger.debug(
                "Successfully removed %s directory", str(self.DESTINATION_PATH)
            )


# ---- swh/lister/aur/tasks.py ----
# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

from celery import shared_task

from swh.lister.aur.lister import AurLister


@shared_task(name=__name__ + ".AurListerTask")
def list_aur(**lister_args):
    """Lister task for Arch User Repository (AUR)"""
    return AurLister.from_configfile(**lister_args).run().dict()


@shared_task(name=__name__ + ".ping")
def _ping():
    """Trivial task that always returns "OK"."""
    return "OK"
#!/usr/bin/env bash

# Script to generate packages-meta-v1.json.gz
# files and fake http responses for https_aur.archlinux.org
# For tests purposes only
#
# Run it from the directory that holds (or should hold) the
# https_aur.archlinux.org/ fixture directory: every path below is relative
# to the current working directory.

set -euo pipefail

# files and directories
# -p: the fixture directory already exists once the fixture has been
# generated; a plain mkdir would abort the script (set -e) on every re-run.
mkdir -p https_aur.archlinux.org

mkdir -p tmp_dir/archives/
cd tmp_dir/archives/

# Quoted heredoc: the JSON is written verbatim, no backslash escape or
# variable expansion can alter it (unlike `echo -e`).
cat > packages-meta-v1.json <<'EOF'
[
{"ID":787300,"Name":"tealdeer-git","PackageBaseID":110159,"PackageBase":"tealdeer-git","Version":"r255.30b7c5f-1","Description":"A fast tldr client in Rust.","URL":"https://github.com/dbrgn/tealdeer","NumVotes":11,"Popularity":0.009683,"OutOfDate":null,"Maintainer":"dbrgn","FirstSubmitted":1460795753,"LastModified":1599251812,"URLPath":"/cgit/aur.git/snapshot/tealdeer-git.tar.gz"},
{"ID":860370,"Name":"ibus-git","PackageBaseID":163059,"PackageBase":"ibus-git","Version":"1.5.23+12+gef4c5c7e-1","Description":"Next Generation Input Bus for Linux","URL":"https://github.com/ibus/ibus/wiki","NumVotes":1,"Popularity":0.989573,"OutOfDate":null,"Maintainer":"tallero","FirstSubmitted":1612764731,"LastModified":1612764731,"URLPath":"/cgit/aur.git/snapshot/ibus-git.tar.gz"},
{"ID":1043337,"Name":"libervia-web-hg","PackageBaseID":170485,"PackageBase":"libervia-web-hg","Version":"0.9.0.r1492.3a34d78f2717-1","Description":"Salut à Toi, multi-frontends multi-purposes XMPP client (Web interface)","URL":"http://salut-a-toi.org/","NumVotes":0,"Popularity":0.0,"OutOfDate":null,"Maintainer":"jnanar","FirstSubmitted":1630224837,"LastModified":1645889458,"URLPath":"/cgit/aur.git/snapshot/libervia-web-hg.tar.gz"},
{"ID":1072642,"Name":"hg-evolve","PackageBaseID":135047,"PackageBase":"hg-evolve","Version":"10.5.1-1","Description":"Flexible evolution of Mercurial history","URL":"https://www.mercurial-scm.org/doc/evolution/","NumVotes":6,"Popularity":0.003887,"OutOfDate":null,"Maintainer":"damien-43","FirstSubmitted":1534190432,"LastModified":1651089776,"URLPath":"/cgit/aur.git/snapshot/hg-evolve.tar.gz"}
]
EOF

# Gzip archive
gzip -c packages-meta-v1.json > ../../https_aur.archlinux.org/packages-meta-v1.json.gz

# Clean up removing tmp_dir
cd ../../
rm -rf tmp_dir/