diff --git a/README.md b/README.md
--- a/README.md
+++ b/README.md
@@ -25,6 +25,7 @@
 - `swh.lister.pypi`
 - `swh.lister.tuleap`
 - `swh.lister.gogs`
+- `swh.lister.fedora`
 
 Dependencies
 ------------
diff --git a/mypy.ini b/mypy.ini
--- a/mypy.ini
+++ b/mypy.ini
@@ -42,3 +42,9 @@
 
 [mypy-dulwich.*]
 ignore_missing_imports = True
+
+[mypy-repomd.*]
+ignore_missing_imports = True
+
+[mypy-defusedxml.*]
+ignore_missing_imports = True
diff --git a/requirements.txt b/requirements.txt
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,3 +7,4 @@
 tenacity >= 6.2
 lxml
 dulwich
+repomd
diff --git a/setup.py b/setup.py
--- a/setup.py
+++ b/setup.py
@@ -75,6 +75,7 @@
         lister.tuleap=swh.lister.tuleap:register
         lister.maven=swh.lister.maven:register
         lister.gogs=swh.lister.gogs:register
+        lister.fedora=swh.lister.fedora:register
     """,
     classifiers=[
         "Programming Language :: Python :: 3",
diff --git a/swh/lister/fedora/__init__.py b/swh/lister/fedora/__init__.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/fedora/__init__.py
@@ -0,0 +1,14 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+def register():
+    """Return the Fedora lister class and the modules holding its tasks."""
+    from .lister import FedoraLister
+
+    return {
+        "lister": FedoraLister,
+        "task_modules": [f"{__name__}.tasks"],
+    }
diff --git a/swh/lister/fedora/lister.py b/swh/lister/fedora/lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/fedora/lister.py
@@ -0,0 +1,224 @@
+# Copyright (C) 2017-2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+from dataclasses import dataclass, field
+import logging
+from typing import Any, Dict, Iterator, List, Set, Type
+from urllib.parse import urljoin
+
+import repomd
+
+from swh.scheduler.interface import SchedulerInterface
+from swh.scheduler.model import ListedOrigin
+
+from ..pattern import Lister
+
+logger = logging.getLogger(__name__)
+
+
+Release = int
+Edition = str
+PkgName = str
+PkgVersion = str
+FedoraOrigin = str
+# NOTE(review): pages are whatever repomd.load() returns, presumably Repo
+# *instances* rather than the Repo class itself, so Type[...] looks wrong
+# here -- confirm and switch to plain ``repomd.Repo``.
+FedoraPageType = Type[repomd.Repo]
+"""Each page is a list of packages from a given Fedora (release, edition) pair"""
+
+
+def get_editions(release: Release) -> List[Edition]:
+    """Get list of editions for a given release.
+
+    Releases before 20 map to "Everything" and "Fedora", releases 20 to 27 map
+    to "Everything", "Server" and "Workstation", and releases from 28 onwards
+    additionally map to "Modular".
+    """
+    if release < 20:
+        return ["Everything", "Fedora"]
+    elif release < 28:
+        return ["Everything", "Server", "Workstation"]
+    else:
+        return ["Everything", "Server", "Workstation", "Modular"]
+
+
+@dataclass
+class FedoraListerState:
+    """State of Fedora lister"""
+
+    package_versions: Dict[PkgName, Set[PkgVersion]] = field(default_factory=dict)
+    """Dictionary mapping a package name to all the versions found during
+    last listing"""
+
+
+class FedoraLister(Lister[FedoraListerState, FedoraPageType]):
+    """
+    List source packages for given Fedora releases.
+
+    The lister will create a snapshot for each package name from all its
+    available versions.
+
+    If a package snapshot is different from the last listing operation,
+    it will be sent to the scheduler that will create a loading task
+    to archive newly found source code.
+
+    Args:
+        scheduler: instance of SchedulerInterface
+        instance: name of the lister instance, also used to build origin URLs
+        url: fedora package archives mirror URL
+        releases: list of fedora releases to process
+    """
+
+    # in the archives, old versions of fedora do not contain repomd.xml
+
+    LISTER_NAME = "fedora"
+
+    def __init__(
+        self,
+        scheduler: SchedulerInterface,
+        instance: str = "fedora",
+        url: str = "https://archives.fedoraproject.org/pub/archive/fedora/linux/releases/",
+        releases: List[Release] = [34, 35, 36],
+    ):
+        super().__init__(
+            scheduler=scheduler,
+            url=url,
+            instance=instance,
+            credentials={},
+        )
+
+        # Keep a private copy: the default value is a single list object shared
+        # by every call, so it must never be reachable through self.releases.
+        self.releases: List[Release] = list(releases)
+
+        self.listed_origins: Dict[FedoraOrigin, ListedOrigin] = {}
+        "will hold all listed origins info"
+        self.sent_origins: Set[FedoraOrigin] = set()
+        """will contain origin urls that have already been listed
+        in a previous page (fedora release)"""
+        self.origins_to_update: Dict[FedoraOrigin, ListedOrigin] = {}
+        """will contain already listed package info that need to be
+        sent to the scheduler for update in the commit_page method"""
+        self.package_versions: Dict[PkgName, Set[PkgVersion]] = {}
+        "will contain the lister state after a call to run"
+
+    def state_from_dict(self, d: Dict[str, Any]) -> FedoraListerState:
+        """Build the lister state from its serialized form, a mapping from
+        package name to a list of version keys."""
+        return FedoraListerState(package_versions={k: set(v) for k, v in d.items()})
+
+    def state_to_dict(self, state: FedoraListerState) -> Dict[str, Any]:
+        """Serialize the lister state as a mapping from package name to a
+        sorted list of version keys."""
+        # sets have no stable iteration order: sort so that the same state is
+        # always serialized the same way
+        return {k: sorted(v) for k, v in state.package_versions.items()}
+
+    def page_request(self, release: Release, edition: Edition) -> FedoraPageType:
+        """Return parsed packages for a given fedora release.
+
+        Args:
+            release: fedora release number
+            edition: fedora edition name, only used to build the index URL of
+                releases older than 24
+
+        Returns:
+            the object returned by ``repomd.load`` for the computed index URL
+        """
+        # NOTE(review): for releases >= 24 the edition is ignored and the
+        # "Everything" tree is always fetched, so every edition of such a
+        # release yields the same page -- confirm this is intended.
+        index_url = urljoin(
+            self.url,
+            f"{release}/{edition}/source/SRPMS/"
+            if release < 24
+            else f"{release}/Everything/source/tree/",
+        )
+
+        repo = repomd.load(index_url)  # throws error if repomd.xml is not found
+        # use the module logger, not the root logger, so that the message
+        # honours this module's logging configuration
+        logger.debug(
+            "Fetched metadata from url: %s, found %d packages", index_url, len(repo)
+        )
+        # TODO: Extract more fields like "provides" and "requires" from *primary.xml
+        # as extrinsic metadata using the pkg._element.findtext method
+        return repo
+
+    def get_pages(self) -> Iterator[FedoraPageType]:
+        """Return an iterator on parsed fedora packages, one page per (release, edition) pair"""
+        # Dirs that don't contain .rpm files:
+        # Docker,CloudImages,Atomic*,Spins,Live,Cloud_Atomic,Silverblue
+
+        for release in self.releases:
+            for edition in get_editions(release):
+                logger.debug("Listing fedora release %s edition %s", release, edition)
+                # remembered for get_origins_from_page, which builds the
+                # package version keys from them
+                self.current_release = release
+                self.current_edition = edition
+                yield self.page_request(release, edition)
+
+    def origin_url_for_package(self, package_name: PkgName) -> FedoraOrigin:
+        """Return the origin url for the given package"""
+        return f"rpm://{self.instance}/packages/{package_name}"
+
+    def get_origins_from_page(self, page: FedoraPageType) -> Iterator[ListedOrigin]:
+        """Convert a page of fedora package sources into an iterator of ListedOrigin.
+
+        Only origins seen for the first time are yielded; origins listed in a
+        previous page that gained versions are stored in ``origins_to_update``.
+        """
+        assert self.lister_obj.id is not None
+
+        origins_to_send: Dict[FedoraOrigin, ListedOrigin] = {}
+        self.origins_to_update = {}
+
+        # iterate on each package's metadata
+        for pkg_metadata in page:
+            # extract package metadata
+            package_name = pkg_metadata.name
+            package_version = pkg_metadata.version
+            package_last_updated = pkg_metadata.build_time
+            package_download_path = pkg_metadata.location
+
+            # build origin url
+            origin_url = self.origin_url_for_package(package_name)
+            # create package version key as expected by the fedora (rpm) loader
+            # TODO: could use pkg.release instead of self.current_release
+            package_version_key = (
+                f"{self.current_release}/{self.current_edition}/{package_version}"
+            )
+
+            # this is the first time a package is listed
+            if origin_url not in self.listed_origins:
+                # create a ListedOrigin object for it that can be later
+                # updated with new package versions info
+                self.listed_origins[origin_url] = ListedOrigin(
+                    lister_id=self.lister_obj.id,
+                    url=origin_url,
+                    visit_type="rpm",
+                    extra_loader_arguments={"packages": {}},
+                    last_update=package_last_updated,
+                )
+
+                # origin will be yielded at the end of that method
+                origins_to_send[origin_url] = self.listed_origins[origin_url]
+                # init set that will contain all listed package versions
+                self.package_versions[package_name] = set()
+
+            # package has already been listed in a previous or current page (release)
+            elif origin_url not in origins_to_send:
+                # if package has been listed previously, its new versions
+                # will be added to its ListedOrigin object but the update will
+                # be sent to the scheduler in the commit_page method
+                self.origins_to_update[origin_url] = self.listed_origins[origin_url]
+
+            # update package versions data in parameter that will be provided
+            # to the rpm loader
+            self.listed_origins[origin_url].extra_loader_arguments["packages"][
+                package_version_key
+            ] = {
+                "name": package_name,
+                "version": package_version,
+                "url": urljoin(page.baseurl, package_download_path),
+                "release": self.current_release,
+            }
+
+            # add package version key to the set of found versions
+            self.package_versions[package_name].add(package_version_key)
+
+        # update already counted origins with changes since last page
+        self.sent_origins.update(origins_to_send.keys())
+
+        logger.debug(
+            "Found %s new packages, %s packages with new versions.",
+            len(origins_to_send),
+            len(self.origins_to_update),
+        )
+        logger.debug(
+            "Current total number of listed packages is equal to %s.",
+            len(self.listed_origins),
+        )
+
+        yield from origins_to_send.values()
+
+    def get_origins_to_update(self) -> Iterator[ListedOrigin]:
+        """Iterate over already listed origins that gained versions in the
+        current page."""
+        yield from self.origins_to_update.values()
+
+    def commit_page(self, page: FedoraPageType):
+        """Send to scheduler already listed origins where new versions have been found
+        in current page."""
+        self.send_origins(self.get_origins_to_update())
+
+    def finalize(self):
+        """Store the listed package versions as lister state and flag the
+        lister as updated when at least one origin was sent."""
+        # set mapping between listed package names and versions as lister state
+        self.state.package_versions = self.package_versions
+        self.updated = len(self.sent_origins) > 0
diff --git
a/swh/lister/fedora/tasks.py b/swh/lister/fedora/tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/fedora/tasks.py
@@ -0,0 +1,39 @@
+# Copyright (C) 2022 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from typing import Dict, List, Optional
+
+from celery import shared_task
+
+from .lister import FedoraLister
+
+
+@shared_task(name=__name__ + ".FullFedoraRelister")
+def list_fedora_full(
+    url: str,
+    instance: Optional[str] = None,
+    releases: Optional[List[int]] = None,
+) -> Dict[str, int]:
+    """Full update of a Fedora instance
+
+    Args:
+        url: Fedora releases mirror URL to list
+        instance: lister instance name, passed through as is (None included)
+        releases: Fedora release numbers, passed through as is (None included)
+
+    Returns:
+        the dict form of the stats returned by the lister run
+    """
+    lister = FedoraLister.from_configfile(
+        url=url,
+        instance=instance,
+        releases=releases,
+    )
+    return lister.run().dict()
+
+
+@shared_task(name=__name__ + ".ping")
+def _ping() -> str:
+    """Trivial task returning "OK"."""
+    return "OK"
diff --git a/swh/lister/fedora/tests/__init__.py b/swh/lister/fedora/tests/__init__.py
new file mode 100644
diff --git a/swh/lister/fedora/tests/data/archives.fedoraproject.org/primary26.xml b/swh/lister/fedora/tests/data/archives.fedoraproject.org/primary26.xml
new file mode 100644
--- /dev/null
+++ b/swh/lister/fedora/tests/data/archives.fedoraproject.org/primary26.xml
@@ -0,0 +1,102 @@
+
+
+
+ 0install
+ src
+
+ 79ace4e9dc84500aea0e3e5ea878dc3fae17a0fd3a07daf3e109066d514be68e
+ A decentralized cross-distribution software installation system
+ Zero Install is a decentralized cross-distribution software
+installation system available under the LGPL. It allows software
+developers to publish programs directly from their own web-sites,
+while supporting features familiar from centralized distribution
+repositories such as shared libraries, automatic updates and digital
+signatures. It is intended to complement, rather than replace, the
+operating system's package management. 0install packages never
+interfere with those provided by the distribution.
+ +0install does not define a new packaging format; unmodified tarballs +or zip archives can be used. Instead, it defines an XML metadata +format to describe these packages and the dependencies between them. A +single metadata file can be used on multiple platforms (e.g. Ubuntu, +Debian, Fedora, openSUSE, Mac OS X and Windows), assuming binary or +source archives are available that work on those systems. + +0install also has some interesting features not often found in +traditional package managers. For example, while it will share +libraries whenever possible, it can always install multiple versions +of a package in parallel when there are conflicting +requirements. Installation is always side-effect-free (each package is +unpacked to its own directory and will not touch shared directories +such as /usr/bin), making it ideal for use with sandboxing +technologies and virtualization. + +The XML file describing the program's requirements can also be +included in a source-code repository, allowing full dependency +handling for unreleased developer versions. For example, a user can +clone a Git repository and build and test the program, automatically +downloading newer versions of libraries where necessary, without +interfering with the versions of those libraries installed by their +distribution, which continue to be used for other software. + Fedora Project + http://0install.net + + + 0xFFFF + src + + 96f9c163c0402d2b30e5343c8397a6d50e146c85a446804396b119ef9698231f + The Open Free Fiasco Firmware Flasher + The 'Open Free Fiasco Firmware Flasher' aka 0xFFFF utility implements +a free (GPL3) userspace handler for the NOLO bootloader and related +utilities for the Nokia Internet Tablets like flashing setting device +options, packing/unpacking FIASCO firmware format and more. 
+ Fedora Project + http://www.nopcode.org/0xFFFF/ + + \ No newline at end of file diff --git a/swh/lister/fedora/tests/data/archives.fedoraproject.org/primary36.xml b/swh/lister/fedora/tests/data/archives.fedoraproject.org/primary36.xml new file mode 100644 --- /dev/null +++ b/swh/lister/fedora/tests/data/archives.fedoraproject.org/primary36.xml @@ -0,0 +1,67 @@ + + + + 0xFFFF + src + + 45eee8d990d502324ae665233c320b8a5469c25d735f1862e094c1878d6ff2cd + The Open Free Fiasco Firmware Flasher + The 'Open Free Fiasco Firmware Flasher' aka 0xFFFF utility implements +a free (GPL3) userspace handler for the NOLO bootloader and related +utilities for the Nokia Internet Tablets like flashing setting device +options, packing/unpacking FIASCO firmware format and more. + Fedora Project + https://talk.maemo.org/showthread.php?t=87996 + + + 2ping + src + + 2ce028d944ebea1cab8c6203c9fed882792478b42fc34682b886a9db16e9de28 + Bi-directional ping utility + 2ping is a bi-directional ping utility. It uses 3-way pings (akin to TCP SYN, +SYN/ACK, ACK) and after-the-fact state comparison between a 2ping listener and +a 2ping client to determine which direction packet loss occurs. 
+ Fedora Project
+ https://www.finnie.org/software/2ping
+
+
diff --git a/swh/lister/fedora/tests/test_lister.py b/swh/lister/fedora/tests/test_lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/fedora/tests/test_lister.py
@@ -0,0 +1,178 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+import copy
+from pathlib import Path
+from typing import List, Tuple
+from unittest.mock import MagicMock
+
+import defusedxml.lxml
+import pytest
+from repomd import Repo
+
+from swh.lister.fedora.lister import FedoraLister, Release, get_editions
+from swh.scheduler.interface import SchedulerInterface
+
+
+def get_repo(datadir: Path, release: int) -> Repo:
+    """Returns a repomd Repo object for the given release."""
+    primary_xml = Path(datadir, "archives.fedoraproject.org", f"primary{release}.xml")
+    metadata = defusedxml.lxml.fromstring(primary_xml.read_bytes())
+    return Repo(f"mocked.mirror.url/releases/{release}", metadata)
+
+
+RepomdMock = Tuple[Repo, Repo]
+
+
+@pytest.fixture
+def repomd_mock(datadir, mocker, requests_mock) -> RepomdMock:
+    """Mocks repomd.load function and returns the (fedora26, fedora36) repos."""
+    fedora26 = get_repo(datadir, 26)
+    fedora36 = get_repo(datadir, 36)
+
+    repomd_mock = mocker.patch("swh.lister.fedora.lister.repomd")
+
+    def side_effect(url):
+        if "36" in url:
+            return fedora36
+        elif "26" in url:
+            return fedora26
+        else:
+            raise Exception(f"Unexpected url: {url}")
+
+    repomd_mock.load.side_effect = side_effect
+
+    return fedora26, fedora36
+
+
+_pkg_versions = {
+    "rpm://fedora/packages/0install": {
+        "26/Everything/2.11": {
+            "name": "0install",
+            "version": "2.11",
+            "release": 26,
+            "url": "mocked.mirror.url/releases/Packages/0/0install-2.11-4.fc26.src.rpm",
+        }
+    },
+    "rpm://fedora/packages/0xFFFF": {
+        "36/Everything/0.9": {
+            "name": "0xFFFF",
+            "version": "0.9",
+            "release": 36,
+            "url": "mocked.mirror.url/releases/Packages/0/0xFFFF-0.9-4.fc36.src.rpm",
+        },
+        "26/Everything/0.3.9": {
+            "name": "0xFFFF",
+            "version": "0.3.9",
+            "release": 26,
+            "url": "mocked.mirror.url/releases/Packages/0/0xFFFF-0.3.9-15.fc26.src.rpm",
+        },
+    },
+    "rpm://fedora/packages/2ping": {
+        "36/Everything/4.5.1": {
+            "name": "2ping",
+            "version": "4.5.1",
+            "release": 36,
+            "url": "mocked.mirror.url/releases/Packages/2/2ping-4.5.1-2.fc36.src.rpm",
+        }
+    },
+}
+
+
+def run_lister(
+    lister: FedoraLister,
+    swh_scheduler: SchedulerInterface,
+    releases: List[Release],
+    pkg_versions: dict,
+    origin_count: int,
+):
+    """Runs the lister and tests that the listed origins are correct."""
+    stats = lister.run()
+    scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
+
+    # One edition from each release (we mocked get_editions)
+    assert stats.pages == len(releases)
+    assert stats.origins == origin_count
+
+    assert {
+        o.url: o.extra_loader_arguments["packages"] for o in scheduler_origins
+    } == pkg_versions
+
+    lister_state = lister.get_state_from_scheduler()
+    assert lister_state.package_versions == {
+        k.split("/")[-1]: set(v) for k, v in pkg_versions.items()
+    }
+    assert lister.updated
+
+
+def test_get_editions():
+    assert get_editions(18) == ["Everything", "Fedora"]
+    assert get_editions(26) == ["Everything", "Server", "Workstation"]
+    assert get_editions(34) == ["Everything", "Server", "Workstation", "Modular"]
+
+
+def test_full_lister_fedora(
+    swh_scheduler: SchedulerInterface,
+    repomd_mock: RepomdMock,
+    mocker: MagicMock,
+):
+    """
+    Simulates a full listing of packages for fedora releases.
+    """
+    releases = [26, 36]
+
+    lister = FedoraLister(
+        scheduler=swh_scheduler,
+        releases=releases,
+    )
+    get_editions_patch = mocker.patch("swh.lister.fedora.lister.get_editions")
+    get_editions_patch.return_value = ["Everything"]
+
+    pkg_versions = copy.deepcopy(_pkg_versions)
+    run_lister(lister, swh_scheduler, releases, pkg_versions, origin_count=3)
+
+
+def test_incremental_lister(
+    swh_scheduler: SchedulerInterface, repomd_mock: RepomdMock, mocker: MagicMock
+):
+    """
+    Simulates an incremental listing of packages for fedora releases.
+    """
+    releases = [26, 36]
+    lister = FedoraLister(
+        scheduler=swh_scheduler,
+        releases=releases,
+    )
+    get_editions_patch = mocker.patch("swh.lister.fedora.lister.get_editions")
+    get_editions_patch.return_value = ["Everything"]
+
+    # deep copy: the nested dicts are updated below and a shallow copy would
+    # leak that change into the module-level data shared with other tests
+    pkg_versions = copy.deepcopy(_pkg_versions)
+
+    # First run
+    run_lister(lister, swh_scheduler, releases, pkg_versions, origin_count=3)
+    # Second run (no updates)
+    run_lister(lister, swh_scheduler, releases, pkg_versions, origin_count=0)
+
+    # Update version of the package 0xFFFF in Fedora 36:
+    _, fedora36 = repomd_mock
+    for pkg in fedora36:
+        if pkg.name == "0xFFFF":
+            pkg._version_info.set("ver", "0.10")
+    # Add new version to the set of expected pkg versions:
+    pkg_versions["rpm://fedora/packages/0xFFFF"].update(
+        {
+            "36/Everything/0.10": {
+                "name": "0xFFFF",
+                "version": "0.10",
+                "release": 36,
+                # .rpm URL remains same:
+                "url": "mocked.mirror.url/releases/Packages/0/0xFFFF-0.9-4.fc36.src.rpm",
+            }
+        }
+    )
+
+    # Third run (0xFFFF in fedora36 editions got updated, but no new origins were found)
+    run_lister(lister, swh_scheduler, releases, pkg_versions, origin_count=0)
diff --git a/swh/lister/fedora/tests/test_tasks.py b/swh/lister/fedora/tests/test_tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/fedora/tests/test_tasks.py
@@ -0,0 +1,63 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from unittest.mock import patch
+
+from swh.lister.pattern import ListerStats
+
+
+def test_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker):
+    """The ping task runs successfully and returns "OK"."""
+    res = swh_scheduler_celery_app.send_task("swh.lister.fedora.tasks.ping")
+    assert res
+    res.wait()
+    assert res.successful()
+    assert res.result == "OK"
+
+
+@patch("swh.lister.fedora.tasks.FedoraLister")
+def test_full_listing(lister, swh_scheduler_celery_app, swh_scheduler_celery_worker):
+    """With only a url, instance and releases are forwarded as None."""
+    lister.from_configfile.return_value = lister
+    lister.run.return_value = ListerStats(pages=10, origins=500)
+
+    kwargs = dict(url="https://eu.edge.kernel.org/fedora/releases/")
+    res = swh_scheduler_celery_app.send_task(
+        "swh.lister.fedora.tasks.FullFedoraRelister",
+        kwargs=kwargs,
+    )
+    assert res
+    res.wait()
+    assert res.successful()
+
+    actual_kwargs = dict(**kwargs, instance=None, releases=None)
+
+    lister.from_configfile.assert_called_once_with(**actual_kwargs)
+    lister.run.assert_called_once_with()
+
+
+@patch("swh.lister.fedora.tasks.FedoraLister")
+def test_full_listing_params(
+    lister, swh_scheduler_celery_app, swh_scheduler_celery_worker
+):
+    """Explicit url, instance and releases are forwarded unchanged."""
+    lister.from_configfile.return_value = lister
+    lister.run.return_value = ListerStats(pages=10, origins=500)
+
+    kwargs = dict(
+        url="https://eu.edge.kernel.org/fedora/releases/",
+        instance="eu.edge.kernel",
+        releases=[36],
+    )
+    res = swh_scheduler_celery_app.send_task(
+        "swh.lister.fedora.tasks.FullFedoraRelister",
+        kwargs=kwargs,
+    )
+    assert res
+    res.wait()
+    assert res.successful()
+
+    lister.from_configfile.assert_called_once_with(**kwargs)
+    lister.run.assert_called_once_with()
diff --git a/swh/lister/tests/test_cli.py b/swh/lister/tests/test_cli.py
--- a/swh/lister/tests/test_cli.py
+++ b/swh/lister/tests/test_cli.py
@@ -35,6 +35,9 @@
         "url": "https://try.gogs.io/",
         "api_token": "secret",
     },
+    "fedora": {
+        "url": "https://eu.edge.kernel.org/fedora/releases/",
+    },
 }