diff --git a/README.md b/README.md --- a/README.md +++ b/README.md @@ -25,6 +25,7 @@ - `swh.lister.pypi` - `swh.lister.tuleap` - `swh.lister.gogs` +- `swh.lister.fedora` Dependencies ------------ diff --git a/mypy.ini b/mypy.ini --- a/mypy.ini +++ b/mypy.ini @@ -42,3 +42,9 @@ [mypy-dulwich.*] ignore_missing_imports = True + +[mypy-repomd.*] +ignore_missing_imports = True + +[mypy-defusedxml.*] +ignore_missing_imports = True diff --git a/requirements.txt b/requirements.txt --- a/requirements.txt +++ b/requirements.txt @@ -7,3 +7,5 @@ tenacity >= 6.2 lxml dulwich +repomd +defusedxml diff --git a/setup.py b/setup.py --- a/setup.py +++ b/setup.py @@ -75,6 +75,7 @@ lister.tuleap=swh.lister.tuleap:register lister.maven=swh.lister.maven:register lister.gogs=swh.lister.gogs:register + lister.fedora=swh.lister.fedora:register """, classifiers=[ "Programming Language :: Python :: 3", diff --git a/swh/lister/fedora/__init__.py b/swh/lister/fedora/__init__.py new file mode 100644 --- /dev/null +++ b/swh/lister/fedora/__init__.py @@ -0,0 +1,13 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +def register(): + from .lister import FedoraLister + + return { + "lister": FedoraLister, + "task_modules": [f"{__name__}.tasks"], + } diff --git a/swh/lister/fedora/lister.py b/swh/lister/fedora/lister.py new file mode 100644 --- /dev/null +++ b/swh/lister/fedora/lister.py @@ -0,0 +1,200 @@ +# Copyright (C) 2017-2021 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +from dataclasses import dataclass, field +import logging +from typing import Any, Dict, Iterator, List, Optional, Set +from urllib.parse import 
logger = logging.getLogger(__name__)


Release = str
Component = str
PkgName = str
PkgVersion = str
FedoraOrigin = str
FedoraPageType = Iterator[repomd.Package]
"""Each page is an iterator over the packages of a given Fedora release"""


@dataclass
class FedoraListerState:
    """State of Fedora lister"""

    package_versions: Dict[PkgName, Set[PkgVersion]] = field(default_factory=dict)
    """Dictionary mapping a package name to all the versions found during
    last listing"""


class FedoraLister(Lister[FedoraListerState, FedoraPageType]):
    """
    List source packages for given Fedora releases.

    The lister will create a snapshot for each package name from all its
    available versions.

    If a package snapshot is different from the last listing operation,
    it will be sent to the scheduler that will create a loading task
    to archive newly found source code.

    Args:
        scheduler: instance of SchedulerInterface
        instance: lister instance name, forwarded to the base lister
        url: fedora package archives mirror URL
        releases: list of fedora releases to process
            (defaults to ``["34", "35", "36"]``)
        credentials: optional credentials, forwarded to the base lister
    """

    # in the archives, old versions of fedora do not contain repomd.xml

    LISTER_NAME = "fedora"

    def __init__(
        self,
        scheduler: SchedulerInterface,
        instance: str = "fedora",
        url: str = "https://eu.edge.kernel.org/fedora/releases/",
        releases: Optional[List[Release]] = None,
        credentials: Optional[CredentialsType] = None,
    ):
        super().__init__(
            scheduler=scheduler,
            url=url,
            instance=instance,
            credentials=credentials,  # TODO: should we remove this
        )

        self.releases = releases or ["34", "35", "36"]

        # release currently being listed; set by get_pages before each page is
        # yielded and read by get_origins_from_page to build version keys
        self.current_release: Optional[Release] = None

        # will hold all listed origins info
        self.listed_origins: Dict[FedoraOrigin, ListedOrigin] = {}
        # will contain origin urls that have already been listed
        # in a previous page (fedora release)
        self.sent_origins: Set[FedoraOrigin] = set()
        # will contain already listed package info that need to be sent
        # to the scheduler for update in the commit_page method
        self.origins_to_update: Dict[FedoraOrigin, ListedOrigin] = {}
        # will contain the lister state after a call to run
        self.package_versions: Dict[PkgName, Set[PkgVersion]] = {}

    def state_from_dict(self, d: Dict[str, Any]) -> FedoraListerState:
        """Rebuild the lister state from its serialized form (lists -> sets)."""
        return FedoraListerState(package_versions={k: set(v) for k, v in d.items()})

    def state_to_dict(self, state: FedoraListerState) -> Dict[str, Any]:
        """Serialize the lister state (sets -> lists)."""
        return {k: list(v) for k, v in state.package_versions.items()}

    def page_request(self, release: Release) -> FedoraPageType:
        """Return parsed packages for a given fedora release.

        This is a generator function: the repository metadata is only loaded
        once the returned iterator starts being consumed.
        """
        # urljoin replaces the last path segment of a base URL that does not
        # end with a slash, so make sure the mirror URL is slash-terminated
        base_url = self.url if self.url.endswith("/") else f"{self.url}/"
        index_url = urljoin(base_url, f"{release}/Everything/source/tree/")
        repo = repomd.load(index_url)  # raises an error if repomd.xml is not found
        logger.debug(
            "Fetched metadata from url: %s, found %d packages", index_url, len(repo)
        )
        # TODO: Extract more fields like "provides" and "requires" from *primary.xml
        # as extrinsic metadata using the pkg._element.findtext method
        yield from repo

    def get_pages(self) -> Iterator[FedoraPageType]:
        """Return an iterator on parsed fedora packages, one page per fedora release."""
        for release in self.releases:
            logger.debug("Listing fedora release %s", release)
            self.current_release = release
            yield self.page_request(release)

    def origin_url_for_package(self, package_name: PkgName) -> FedoraOrigin:
        """Return the origin url for the given package"""
        return f"rpm://{self.instance}/packages/{package_name}"

    def get_origins_from_page(self, page: FedoraPageType) -> Iterator[ListedOrigin]:
        """Convert a page of fedora package sources into an iterator of ListedOrigin.

        Only origins seen for the first time are yielded; origins already
        listed in a previous page are collected in ``self.origins_to_update``
        and sent to the scheduler by ``commit_page``.
        """
        assert self.lister_obj.id is not None

        origins_to_send: Dict[FedoraOrigin, ListedOrigin] = {}
        self.origins_to_update = {}

        # iterate on each package's metadata
        for pkg_metadata in page:
            # extract package name and version
            package_name = pkg_metadata.name
            package_version = pkg_metadata.version
            package_last_updated = pkg_metadata.build_time
            # build origin url
            origin_url = self.origin_url_for_package(package_name)
            # create package version key as expected by the fedora (rpm) loader
            package_version_key = f"{self.current_release}/{package_version}"

            # this is the first time a package is listed
            if origin_url not in self.listed_origins:
                # create a ListedOrigin object for it that can be later
                # updated with new package versions info
                self.listed_origins[origin_url] = ListedOrigin(
                    lister_id=self.lister_obj.id,
                    url=origin_url,
                    visit_type="rpm",
                    extra_loader_arguments={"packages": {}},
                    last_update=package_last_updated,
                )

                # origin will be yielded at the end of that method
                origins_to_send[origin_url] = self.listed_origins[origin_url]

            # package has already been listed in a previous page (release)
            elif origin_url not in origins_to_send:
                # its new versions will be added to its ListedOrigin object
                # but the update will be sent to the scheduler in the
                # commit_page method
                self.origins_to_update[origin_url] = self.listed_origins[origin_url]

            listed_origin = self.listed_origins[origin_url]

            # update package versions data in parameter that will be provided
            # to the rpm loader
            listed_origin.extra_loader_arguments["packages"][package_version_key] = {
                "name": package_name,
                "version": package_version,
            }

            # keep the most recent build time across all versions of a package
            # instead of freezing it at the first version encountered
            if package_last_updated is not None and (
                listed_origin.last_update is None
                or package_last_updated > listed_origin.last_update
            ):
                listed_origin.last_update = package_last_updated

            # add package version key to the set of found versions
            self.package_versions.setdefault(package_name, set()).add(
                package_version_key
            )

        # update already counted origins with changes since last page
        self.sent_origins.update(origins_to_send.keys())

        logger.debug(
            "Found %s new packages, %s packages with new versions.",
            len(origins_to_send),
            len(self.origins_to_update),
        )
        logger.debug(
            "Current total number of listed packages is equal to %s.",
            len(self.listed_origins),
        )

        yield from origins_to_send.values()

    def get_origins_to_update(self) -> Iterator[ListedOrigin]:
        """Yield already listed origins for which the current page brought new versions."""
        yield from self.origins_to_update.values()

    def commit_page(self, page: FedoraPageType):
        """Send to scheduler already listed origins where new versions have been found
        in current page."""
        self.send_origins(self.get_origins_to_update())

    def finalize(self):
        """Store the listed package versions as lister state and flag updates."""
        # set mapping between listed package names and versions as lister state
        self.state.package_versions = self.package_versions
        self.updated = len(self.sent_origins) > 0
@shared_task(name=__name__ + ".FullFedoraRelister")
def list_fedora_full(
    url: str,
    instance: Optional[str] = None,
    releases: Optional[List[str]] = None,
) -> Dict[str, int]:
    """Full update of a Fedora instance.

    Args:
        url: Fedora package archives mirror URL, forwarded to the lister.
        instance: optional lister instance name, forwarded as-is
            (``None`` included).
        releases: optional list of Fedora releases to list, forwarded as-is
            (``None`` included).

    Returns:
        The listing statistics returned by ``lister.run()``, as a dict.
    """
    lister = FedoraLister.from_configfile(
        url=url,
        instance=instance,
        releases=releases,
    )
    return lister.run().dict()


@shared_task(name=__name__ + ".ping")
def _ping() -> str:
    """Trivial task returning ``"OK"``."""
    return "OK"
In short, it is a +historically-based war/economy game that allows players to relive or rewrite +the history of Western civilizations, focusing on the years between 500 B.C. +and 500 A.D. The project is highly ambitious, involving state-of-the-art 3D +graphics, detailed artwork, sound, and a flexible and powerful custom-built +game engine. + +This package contains the 0ad data files. + Fedora Project + http://play0ad.com + + + 0xFFFF + src + + 45eee8d990d502324ae665233c320b8a5469c25d735f1862e094c1878d6ff2cd + The Open Free Fiasco Firmware Flasher + The 'Open Free Fiasco Firmware Flasher' aka 0xFFFF utility implements +a free (GPL3) userspace handler for the NOLO bootloader and related +utilities for the Nokia Internet Tablets like flashing setting device +options, packing/unpacking FIASCO firmware format and more. + Fedora Project + https://talk.maemo.org/showthread.php?t=87996 + + + 2048-cli + src + + 5f77b054ad11aff03bc145616b88c977e67a11333902de776e8ce8769caef7b5 + The game 2048 for your Linux terminal + A cli version of the game 2048 for your Linux terminal. + Fedora Project + https://github.com/Tiehuis/2048-cli + + + 2ping + src + + 2ce028d944ebea1cab8c6203c9fed882792478b42fc34682b886a9db16e9de28 + Bi-directional ping utility + 2ping is a bi-directional ping utility. It uses 3-way pings (akin to TCP SYN, +SYN/ACK, ACK) and after-the-fact state comparison between a 2ping listener and +a 2ping client to determine which direction packet loss occurs. + Fedora Project + https://www.finnie.org/software/2ping + + + 389-ds-base + src + + 00ccfe16be5767bd043a878f0588b19445b8be8d15c477a6ecae331f9485c55e + 389 Directory Server (base) + 389 Directory Server is an LDAPv3 compliant server. The base package includes +the LDAP server and command line utilities for server administration. 
+ Fedora Project + https://www.port389.org + + + 3Depict + src + + 78a0521a39bf634b7d21edccaf26ff5c0ea318b68b620e4a337e1e965b4b2a68 + Valued 3D point cloud visualization and analysis + This software is designed to help users visualize and analyze 3D point clouds +with an associated real value, in a fast and flexible fashion. It is +specifically targeted to atom probe tomography applications, but may be +useful for general scalar valued point data purposes. + Fedora Project + http://threedepict.sourceforge.net + + + 3dprinter-udev-rules + src + + 9deaa03d0744a684eaaa7b3d2cf12b100e53f86b402c7028e68b3378189259fa + Rules for udev to give regular users access to operate 3D printers + Normally, when you connect a RepRap like 3D printer to a Linux machine by an +USB cable, you need to be in dialout or similar group to be able to control +it via OctoPrint, Printrun, Cura or any other control software. Not any more. + +Install this rule to grant all users read and write access to collected +devices based on the VID and PID. + +Disclaimer: Such device might not be a 3D printer, it my be an Arduino, it +might be a modem and it might even be a blender. But normally you would +add your user to dialout and get access to all of those and more anyway. +So I guess be careful when some of the users should not get access to +your blenders. + Fedora Project + https://github.com/hroncok/3dprinter-udev-rules + + + 3mux + src + + 10ed40714c2f4ad243302aa59ea4a40c3bb8a68b9ab78affdac6747a668542b7 + Terminal multiplexer inspired by i3 + +Terminal multiplexer inspired by i3. + Fedora Project + https://github.com/aaronjanse/3mux + + + 3proxy + src + + 9b4a2b6ca029cb43912232f71656af97eee5d173849157d9004c5bbf1b356ebb + Tiny but very powerful proxy + 3proxy -- light proxy server. 
+Universal proxy server with HTTP, HTTPS, SOCKS v4, SOCKS v4a, SOCKS v5, FTP, +POP3, UDP and TCP portmapping, access control, bandwith control, traffic +limitation and accounting based on username, client IP, target IP, day time, +day of week, etc. + Fedora Project + http://3proxy.ru/?l=EN + + + 4Pane + src + + 3b56a7560662d182a66495160389ff06045ee67ebc7383010695f6c4f6b4e8c7 + Multi-pane, detailed-list file manager + 4Pane is a multi-pane, detailed-list file manager. It is designed +to be fully-featured without bloat, and aims for speed rather than +visual effects. +In addition to standard file manager things, it offers multiple +undo and redo of most operations (including deletions), archive +management including 'virtual browsing' inside archives, multiple +renaming/duplication of files, a terminal emulator and user-defined +tools. + Fedora Project + http://www.4pane.co.uk/ + + + 4diac-forte + src + + 9e4286400a4dcda294fc807202195bc8d4fdaf324d523d8f2c86d2b399528a8f + IEC 61499 runtime environment + The 4DIAC runtime environment (4DIAC-RTE, FORTE) is a small portable +implementation of an IEC 61499 runtime environment targeting small +embedded control devices (16/32 Bit), implemented in C++. It supports +online-reconfiguration of its applications and the real-time capable +execution of all function block types provided by the IEC 61499 standard. + Fedora Project + http://eclipse.org/4diac + + + 4th + src + + 52a78198b2ee889130f79c87f300e2ab64fd4a925fa81e6c55d24c9e0e76e0a8 + A Forth compiler + 4tH is basic framework for creating application specific scripting +languages. It is a library of functions centered around a virtual +machine, which guarantees high performance, ease of use and low overhead. + Fedora Project + https://thebeez.home.xs4all.nl/4tH/ + + + zzuf + src + + d4aa530ea79c034a36aed1a0ef4859f14e9dadbc23e6086018287b9d1236c6fe + Transparent application input fuzzer + zzuf is a transparent application input fuzzer. 
mirror_url = "https://eu.edge.kernel.org/fedora/releases"


@pytest.fixture
def repomd_mock(requests_mock, datadir, mocker):
    """Patch ``repomd`` in the lister module so that ``load`` returns a ``Repo``
    built from the ``primary.xml`` test fixture."""
    # Import the submodule explicitly: a bare ``import defusedxml`` does not
    # bind ``defusedxml.lxml``; it was only reachable because another module
    # happened to import it first.
    from defusedxml import lxml as defused_lxml

    primary_xml = Path(datadir, "https_eu.edge.kernel.org_fedora_36", "primary.xml")
    metadata = defused_lxml.fromstring(primary_xml.read_bytes())

    patched_repomd = mocker.patch("swh.lister.fedora.lister.repomd")
    patched_repomd.load.return_value = Repo(mirror_url, metadata)

    return patched_repomd


def test_full_lister_fedora(
    swh_scheduler: SchedulerInterface,
    repomd_mock: MagicMock,
    requests_mock: Mocker,
):
    """
    Simulate a full listing of packages for fedora releases.
    """

    lister = FedoraLister(
        scheduler=swh_scheduler,
        url=mirror_url,
        releases=[
            "35",
            "36",
        ],  # here both repomd requests will return the same 14 packages
    )

    stats = lister.run()
    scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results

    assert stats.pages == 2  # == no. of requested releases
    assert stats.origins == 14
    assert len(scheduler_origins) == 14

    # a second run on the same lister instance yields no new origin
    stats = lister.run()
    scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results

    assert stats.pages == 2
    assert stats.origins == 0
    assert len(scheduler_origins) == 14


def test_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker):
    """The ping task answers "OK"."""
    res = swh_scheduler_celery_app.send_task("swh.lister.fedora.tasks.ping")
    assert res
    res.wait()
    assert res.successful()
    assert res.result == "OK"


@patch("swh.lister.fedora.tasks.FedoraLister")
def test_full_listing(lister, swh_scheduler_celery_app, swh_scheduler_celery_worker):
    """Omitted optional task arguments are forwarded to the lister as ``None``."""
    lister.from_configfile.return_value = lister
    lister.run.return_value = ListerStats(pages=10, origins=500)

    kwargs = dict(url="https://eu.edge.kernel.org/fedora/releases/")
    res = swh_scheduler_celery_app.send_task(
        "swh.lister.fedora.tasks.FullFedoraRelister",
        kwargs=kwargs,
    )
    assert res
    res.wait()
    assert res.successful()

    actual_kwargs = dict(**kwargs, instance=None, releases=None)

    lister.from_configfile.assert_called_once_with(**actual_kwargs)
    lister.run.assert_called_once_with()


@patch("swh.lister.fedora.tasks.FedoraLister")
def test_full_listing_params(
    lister, swh_scheduler_celery_app, swh_scheduler_celery_worker
):
    """Explicit task arguments are forwarded unchanged to the lister."""
    lister.from_configfile.return_value = lister
    lister.run.return_value = ListerStats(pages=10, origins=500)

    kwargs = dict(
        url="https://eu.edge.kernel.org/fedora/releases/",
        instance="eu.edge.kernel",
        releases=["36"],
    )
    res = swh_scheduler_celery_app.send_task(
        "swh.lister.fedora.tasks.FullFedoraRelister",
        kwargs=kwargs,
    )
    assert res
    res.wait()
    assert res.successful()

    lister.from_configfile.assert_called_once_with(**kwargs)
    lister.run.assert_called_once_with()