Page MenuHomeSoftware Heritage

D8033.id29849.diff
No OneTemporary

D8033.id29849.diff

diff --git a/setup.py b/setup.py
--- a/setup.py
+++ b/setup.py
@@ -56,6 +56,7 @@
lister=swh.lister.cli
[swh.workers]
lister.arch=swh.lister.arch:register
+ lister.aur=swh.lister.aur:register
lister.bitbucket=swh.lister.bitbucket:register
lister.cgit=swh.lister.cgit:register
lister.cran=swh.lister.cran:register
diff --git a/swh/lister/aur/__init__.py b/swh/lister/aur/__init__.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/aur/__init__.py
@@ -0,0 +1,135 @@
+# Copyright (C) 2022 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+"""
+AUR (Arch User Repository) lister
+=================================
+
+The AUR lister lists origins from `aur.archlinux.org`_, the Arch User Repository.
+For each package there is a git repository; we use the git url as origin and the
+snapshot url as the artifact for the loader to download.
+
+Each git repository consists of a directory (whose name corresponds to the package name),
+and at least two files, .SRCINFO and PKGBUILD, which are recipes for building the package.
+
+Each package has a single version, the latest one. There are no archives of previous versions,
+so the lister will always list one version per package.
+
+As of August 2022 `aur.archlinux.org`_ lists 84438 packages. Please note that this number
+is the total of `regular`_ and `split`_ packages.
+We will archive `regular` and `split` packages but only their `pkgbase` because that is
+the only one that actually has source code.
+The number of packages is 78554 after removing the split ones.
+
+Origins retrieving strategy
+---------------------------
+
+An RPC API exists, but it is recommended to save bandwidth, so it is not used. See
+`New AUR Metadata Archives`_ for more on this topic.
+
+To get an index of all existing AUR packages we download `packages-meta-v1.json.gz`_,
+which contains a JSON file listing all existing package definitions.
+
+Each entry describes the latest released version of a package. The origin url
+for a package is built using `pkgbase` and corresponds to a git repository.
+
+Note that we list only standard packages (when pkgbase equals pkgname), not the ones
+belonging to split packages.
+
+It takes only a couple of minutes to download the 7 MB index archive and parse its
+content.
+
+Page listing
+------------
+
+Each page is related to one package. As it's not possible to get all previous
+versions, it will always return one line.
+
+Each page corresponds to a package with a `version`, an `url` for a Git
+repository, a `project_url` which represents the upstream project url and
+a canonical `snapshot_url` from which a tar.gz archive of the package can
+be downloaded.
+
+The data schema for each line is:
+
+* **pkgname**: Package name
+* **version**: Package version
+* **url**: Git repository url for a package
+* **snapshot_url**: Package download url
+* **project_url**: Upstream project url if any
+* **last_modified**: ISO 8601 last update date
+
+Origins from page
+-----------------
+
+The lister yields one origin per page.
+The origin url corresponds to the git url of a package, for example ``https://aur.archlinux.org/{package}.git``.
+
+Additionally we add some data set to "extra_loader_arguments":
+
+* **artifacts**: Represents data about the AUR package snapshot to download,
+ following :ref:`original-artifacts-json specification <original-artifacts-json>`
+* **aur_metadata**: Stores all other interesting attributes that do not belong to artifacts.
+
+Origin data example::
+
+ {
+ "visit_type": "aur",
+ "url": "https://aur.archlinux.org/hg-evolve.git",
+ "extra_loader_arguments": {
+ "artifacts": [
+ {
+ "filename": "hg-evolve.tar.gz",
+ "url": "https://aur.archlinux.org/cgit/aur.git/snapshot/hg-evolve.tar.gz", # noqa: B950
+ "version": "10.5.1-1",
+ }
+ ],
+ "aur_metadata": [
+ {
+ "version": "10.5.1-1",
+ "project_url": "https://www.mercurial-scm.org/doc/evolution/",
+ "last_update": "2022-04-27T20:02:56+00:00",
+ "pkgname": "hg-evolve",
+ }
+ ],
+ },
+
+Running tests
+-------------
+
+Activate the virtualenv and run from within swh-lister directory::
+
+ pytest -s -vv --log-cli-level=DEBUG swh/lister/aur/tests
+
+Testing with Docker
+-------------------
+
+Change directory to swh/docker then launch the docker environment::
+
+ docker-compose up -d
+
+Then connect to the lister::
+
+ docker exec -it docker_swh-lister_1 bash
+
+And run the lister (The output of this listing results in “oneshot” tasks in the scheduler)::
+
+ swh lister run -l aur
+
+.. _aur.archlinux.org: https://aur.archlinux.org
+.. _New AUR Metadata Archives: https://lists.archlinux.org/pipermail/aur-general/2021-November/036659.html
+.. _packages-meta-v1.json.gz: https://aur.archlinux.org/packages-meta-v1.json.gz
+.. _regular: https://wiki.archlinux.org/title/PKGBUILD#Package_name
+.. _split: https://man.archlinux.org/man/PKGBUILD.5#PACKAGE_SPLITTING
+"""
+
+
+def register():
+ from .lister import AurLister
+
+ return {
+ "lister": AurLister,
+ "task_modules": ["%s.tasks" % __name__],
+ }
diff --git a/swh/lister/aur/lister.py b/swh/lister/aur/lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/aur/lister.py
@@ -0,0 +1,174 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+import datetime
+import gzip
+import json
+import logging
+from pathlib import Path
+import shutil
+from typing import Any, Dict, Iterator, Optional
+
+import requests
+
+from swh.scheduler.interface import SchedulerInterface
+from swh.scheduler.model import ListedOrigin
+
+from ..pattern import CredentialsType, StatelessLister
+
+logger = logging.getLogger(__name__)
+
+# Aliasing the page results returned by `get_pages` method from the lister.
+AurListerPage = Dict[str, Any]
+
+
+class AurLister(StatelessLister[AurListerPage]):
+ """List Arch User Repository (AUR) origins.
+
+ Given an url (used as a base url, default is 'https://aur.archlinux.org'),
+ download a 'packages-meta-v1.json.gz' which contains a json file listing all
+ existing packages definitions.
+
+ Each entry describes the latest released version of a package. The origin url
+ for a package is built using 'pkgbase' and corresponds to a git repository.
+
+ An rpc api exists but it is recommended to save bandwidth so it's not used. See
+ https://lists.archlinux.org/pipermail/aur-general/2021-November/036659.html
+ for more on this.
+ """
+
+ LISTER_NAME = "aur"
+ VISIT_TYPE = "aur"
+ INSTANCE = "aur"
+
+ BASE_URL = "https://aur.archlinux.org"
+ DEFAULT_PACKAGES_INDEX_URL = "{base_url}/packages-meta-v1.json.gz"
+ PACKAGE_VCS_URL_PATTERN = "{base_url}/{pkgname}.git"
+ PACKAGE_SNAPSHOT_URL_PATTERN = "{base_url}/cgit/aur.git/snapshot/{pkgname}.tar.gz"
+
+ DESTINATION_PATH = Path("/tmp/aur_archive")
+
+ def __init__(
+ self,
+ scheduler: SchedulerInterface,
+ credentials: Optional[CredentialsType] = None,
+ ):
+ super().__init__(
+ scheduler=scheduler,
+ credentials=credentials,
+ instance=self.INSTANCE,
+ url=self.BASE_URL,
+ )
+
+ def download_index_archive(self) -> Path:
+ """Build an url based on self.DEFAULT_PACKAGES_INDEX_URL format string,
+ and download the archive to self.DESTINATION_PATH
+
+ Returns:
+ the Path of the downloaded archive file.
+ """
+ url = self.DEFAULT_PACKAGES_INDEX_URL.format(base_url=self.url)
+ filename = url.split("/")[-1]
+ destination = Path(self.DESTINATION_PATH, filename)
+
+ if not Path(self.DESTINATION_PATH).exists():
+ Path(self.DESTINATION_PATH).mkdir()
+
+ response = requests.get(url, stream=True)
+ destination.write_bytes(response.raw.read())
+ assert destination.exists()
+
+ return destination
+
+ def get_pages(self) -> Iterator[AurListerPage]:
+ """Yield an iterator which returns 'page'
+
+ Each page corresponds to a package with a 'version', an 'url' for a Git
+ repository, a 'project_url' which represents the upstream project url and
+ a canonical 'snapshot_url' from which a tar.gz archive of the package can
+ be downloaded.
+ """
+ index = self.download_index_archive()
+
+ with gzip.open(index, "rb") as f:
+ assert f.readable()
+ file_content = f.read()
+ packages = json.loads(file_content)
+
+ assert packages
+
+ counter: int = 0
+ for package in packages:
+ # Exclude lines where Name differs from PackageBase as they represent
+ # split packages, which don't have a resolvable snapshot url
+ if package["Name"] == package["PackageBase"]:
+ pkgname = package["PackageBase"]
+ version = package["Version"]
+ project_url = package["URL"]
+ last_modified = datetime.datetime.fromtimestamp(
+ float(package["LastModified"]), tz=datetime.timezone.utc
+ ).isoformat()
+ counter += 1
+ yield {
+ "pkgname": pkgname,
+ "version": version,
+ "url": self.PACKAGE_VCS_URL_PATTERN.format(
+ base_url=self.BASE_URL, pkgname=pkgname
+ ),
+ "snapshot_url": self.PACKAGE_SNAPSHOT_URL_PATTERN.format(
+ base_url=self.BASE_URL, pkgname=pkgname
+ ),
+ "project_url": project_url,
+ "last_modified": last_modified,
+ }
+ logger.debug("Found %s AUR packages in aur_index", counter)
+
+ def get_origins_from_page(self, origin: AurListerPage) -> Iterator[ListedOrigin]:
+ """Yield the ListedOrigin instance built from a single page.
+ It uses the vcs (Git) url as an origin and adds `artifacts` and `aur_metadata`
+ entries to 'extra_loader_arguments'.
+
+ `artifacts` describes the file to download and `aur_metadata` stores some
+ metadata that can be useful for the loader.
+ """
+ assert self.lister_obj.id is not None
+
+ url = origin["url"]
+ last_update = datetime.datetime.fromisoformat(origin["last_modified"])
+ filename = origin["snapshot_url"].split("/")[-1]
+
+ artifacts = [
+ {
+ "filename": filename,
+ "url": origin["snapshot_url"],
+ "version": origin["version"],
+ }
+ ]
+ aur_metadata = [
+ {
+ "version": origin["version"],
+ "project_url": origin["project_url"],
+ "last_update": origin["last_modified"],
+ "pkgname": origin["pkgname"],
+ }
+ ]
+
+ yield ListedOrigin(
+ lister_id=self.lister_obj.id,
+ visit_type=self.VISIT_TYPE,
+ url=url,
+ last_update=last_update,
+ extra_loader_arguments={
+ "artifacts": artifacts,
+ "aur_metadata": aur_metadata,
+ },
+ )
+
+ def finalize(self) -> None:
+ # Clean up by removing the directory the index archive was downloaded to
+ if self.DESTINATION_PATH.exists():
+ shutil.rmtree(self.DESTINATION_PATH)
+ logger.debug(
+ "Successfully removed %s directory", str(self.DESTINATION_PATH)
+ )
diff --git a/swh/lister/aur/tasks.py b/swh/lister/aur/tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/aur/tasks.py
@@ -0,0 +1,19 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from celery import shared_task
+
+from swh.lister.aur.lister import AurLister
+
+
+@shared_task(name=__name__ + ".AurListerTask")
+def list_aur(**lister_args):
+ """Lister task for Arch User Repository (AUR)"""
+ return AurLister.from_configfile(**lister_args).run().dict()
+
+
+@shared_task(name=__name__ + ".ping")
+def _ping():
+ return "OK"
diff --git a/swh/lister/aur/tests/__init__.py b/swh/lister/aur/tests/__init__.py
new file mode 100644
diff --git a/swh/lister/aur/tests/data/fake_aur_packages.sh b/swh/lister/aur/tests/data/fake_aur_packages.sh
new file mode 100755
--- /dev/null
+++ b/swh/lister/aur/tests/data/fake_aur_packages.sh
@@ -0,0 +1,27 @@
+#!/usr/bin/env bash
+
+# Script to generate packages-meta-v1.json.gz
+# files and fake http responses for https_aur.archlinux.org
+# For test purposes only
+
+set -euo pipefail
+
+# files and directories
+mkdir https_aur.archlinux.org
+
+mkdir -p tmp_dir/archives/
+cd tmp_dir/archives/
+
+echo -e '''[
+{"ID":787300,"Name":"tealdeer-git","PackageBaseID":110159,"PackageBase":"tealdeer-git","Version":"r255.30b7c5f-1","Description":"A fast tldr client in Rust.","URL":"https://github.com/dbrgn/tealdeer","NumVotes":11,"Popularity":0.009683,"OutOfDate":null,"Maintainer":"dbrgn","FirstSubmitted":1460795753,"LastModified":1599251812,"URLPath":"/cgit/aur.git/snapshot/tealdeer-git.tar.gz"},
+{"ID":860370,"Name":"ibus-git","PackageBaseID":163059,"PackageBase":"ibus-git","Version":"1.5.23+12+gef4c5c7e-1","Description":"Next Generation Input Bus for Linux","URL":"https://github.com/ibus/ibus/wiki","NumVotes":1,"Popularity":0.989573,"OutOfDate":null,"Maintainer":"tallero","FirstSubmitted":1612764731,"LastModified":1612764731,"URLPath":"/cgit/aur.git/snapshot/ibus-git.tar.gz"},
+{"ID":1043337,"Name":"libervia-web-hg","PackageBaseID":170485,"PackageBase":"libervia-web-hg","Version":"0.9.0.r1492.3a34d78f2717-1","Description":"Salut à Toi, multi-frontends multi-purposes XMPP client (Web interface)","URL":"http://salut-a-toi.org/","NumVotes":0,"Popularity":0.0,"OutOfDate":null,"Maintainer":"jnanar","FirstSubmitted":1630224837,"LastModified":1645889458,"URLPath":"/cgit/aur.git/snapshot/libervia-web-hg.tar.gz"},
+{"ID":1072642,"Name":"hg-evolve","PackageBaseID":135047,"PackageBase":"hg-evolve","Version":"10.5.1-1","Description":"Flexible evolution of Mercurial history","URL":"https://www.mercurial-scm.org/doc/evolution/","NumVotes":6,"Popularity":0.003887,"OutOfDate":null,"Maintainer":"damien-43","FirstSubmitted":1534190432,"LastModified":1651089776,"URLPath":"/cgit/aur.git/snapshot/hg-evolve.tar.gz"}
+]''' > packages-meta-v1.json
+
+# Gzip archive
+gzip -c packages-meta-v1.json > ../../https_aur.archlinux.org/packages-meta-v1.json.gz
+
+# Clean up removing tmp_dir
+cd ../../
+rm -rf tmp_dir/
diff --git a/swh/lister/aur/tests/data/https_aur.archlinux.org/packages-meta-v1.json.gz b/swh/lister/aur/tests/data/https_aur.archlinux.org/packages-meta-v1.json.gz
new file mode 100644
index 0000000000000000000000000000000000000000..0000000000000000000000000000000000000000
GIT binary patch
literal 0
Hc$@<O00001
literal 0
Hc$@<O00001
diff --git a/swh/lister/aur/tests/test_lister.py b/swh/lister/aur/tests/test_lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/aur/tests/test_lister.py
@@ -0,0 +1,131 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+from swh.lister.aur.lister import AurLister
+
+expected_origins = [
+ {
+ "visit_type": "aur",
+ "url": "https://aur.archlinux.org/hg-evolve.git",
+ "extra_loader_arguments": {
+ "artifacts": [
+ {
+ "filename": "hg-evolve.tar.gz",
+ "url": "https://aur.archlinux.org/cgit/aur.git/snapshot/hg-evolve.tar.gz", # noqa: B950
+ "version": "10.5.1-1",
+ }
+ ],
+ "aur_metadata": [
+ {
+ "version": "10.5.1-1",
+ "project_url": "https://www.mercurial-scm.org/doc/evolution/",
+ "last_update": "2022-04-27T20:02:56+00:00",
+ "pkgname": "hg-evolve",
+ }
+ ],
+ },
+ },
+ {
+ "visit_type": "aur",
+ "url": "https://aur.archlinux.org/ibus-git.git",
+ "extra_loader_arguments": {
+ "artifacts": [
+ {
+ "filename": "ibus-git.tar.gz",
+ "url": "https://aur.archlinux.org/cgit/aur.git/snapshot/ibus-git.tar.gz", # noqa: B950
+ "version": "1.5.23+12+gef4c5c7e-1",
+ }
+ ],
+ "aur_metadata": [
+ {
+ "version": "1.5.23+12+gef4c5c7e-1",
+ "project_url": "https://github.com/ibus/ibus/wiki",
+ "last_update": "2021-02-08T06:12:11+00:00",
+ "pkgname": "ibus-git",
+ }
+ ],
+ },
+ },
+ {
+ "visit_type": "aur",
+ "url": "https://aur.archlinux.org/libervia-web-hg.git",
+ "extra_loader_arguments": {
+ "artifacts": [
+ {
+ "filename": "libervia-web-hg.tar.gz",
+ "url": "https://aur.archlinux.org/cgit/aur.git/snapshot/libervia-web-hg.tar.gz", # noqa: B950
+ "version": "0.9.0.r1492.3a34d78f2717-1",
+ }
+ ],
+ "aur_metadata": [
+ {
+ "version": "0.9.0.r1492.3a34d78f2717-1",
+ "project_url": "http://salut-a-toi.org/",
+ "last_update": "2022-02-26T15:30:58+00:00",
+ "pkgname": "libervia-web-hg",
+ }
+ ],
+ },
+ },
+ {
+ "visit_type": "aur",
+ "url": "https://aur.archlinux.org/tealdeer-git.git",
+ "extra_loader_arguments": {
+ "artifacts": [
+ {
+ "filename": "tealdeer-git.tar.gz",
+ "url": "https://aur.archlinux.org/cgit/aur.git/snapshot/tealdeer-git.tar.gz", # noqa: B950
+ "version": "r255.30b7c5f-1",
+ }
+ ],
+ "aur_metadata": [
+ {
+ "version": "r255.30b7c5f-1",
+ "project_url": "https://github.com/dbrgn/tealdeer",
+ "last_update": "2020-09-04T20:36:52+00:00",
+ "pkgname": "tealdeer-git",
+ }
+ ],
+ },
+ },
+]
+
+
+def test_aur_lister(datadir, requests_mock_datadir, swh_scheduler):
+ lister = AurLister(scheduler=swh_scheduler)
+ res = lister.run()
+
+ assert res.pages == 4
+ assert res.origins == 4
+
+ scheduler_origins_sorted = sorted(
+ swh_scheduler.get_listed_origins(lister.lister_obj.id).results,
+ key=lambda x: x.url,
+ )
+ expected_origins_sorted = sorted(expected_origins, key=lambda x: x.get("url"))
+
+ assert len(scheduler_origins_sorted) == len(expected_origins_sorted)
+
+ assert [
+ (
+ scheduled.visit_type,
+ scheduled.url,
+ scheduled.extra_loader_arguments.get("artifacts"),
+ )
+ for scheduled in scheduler_origins_sorted
+ ] == [
+ (
+ "aur",
+ expected.get("url"),
+ expected.get("extra_loader_arguments").get("artifacts"),
+ )
+ for expected in expected_origins_sorted
+ ]
+
+
+def test_aur_lister_directory_cleanup(datadir, requests_mock_datadir, swh_scheduler):
+ lister = AurLister(scheduler=swh_scheduler)
+ lister.run()
+ # The download directory should not exist after the lister runs
+ assert not lister.DESTINATION_PATH.exists()
diff --git a/swh/lister/aur/tests/test_tasks.py b/swh/lister/aur/tests/test_tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/aur/tests/test_tasks.py
@@ -0,0 +1,31 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.lister.pattern import ListerStats
+
+
+def test_aur_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker):
+ res = swh_scheduler_celery_app.send_task("swh.lister.aur.tasks.ping")
+ assert res
+ res.wait()
+ assert res.successful()
+ assert res.result == "OK"
+
+
+def test_aur_lister(swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker):
+ # setup the mocked AurLister
+ lister = mocker.patch("swh.lister.aur.tasks.AurLister")
+ lister.from_configfile.return_value = lister
+ stats = ListerStats(pages=42, origins=42)
+ lister.run.return_value = stats
+
+ res = swh_scheduler_celery_app.send_task("swh.lister.aur.tasks.AurListerTask")
+ assert res
+ res.wait()
+ assert res.successful()
+ assert res.result == stats.dict()
+
+ lister.from_configfile.assert_called_once_with()
+ lister.run.assert_called_once_with()

File Metadata

Mime Type
text/plain
Expires
Jul 3 2025, 6:13 PM (4 w, 6 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3222337

Event Timeline