diff --git a/swh/lister/fedora/lister.py b/swh/lister/fedora/lister.py index e5620ff..8f3dced 100644 --- a/swh/lister/fedora/lister.py +++ b/swh/lister/fedora/lister.py @@ -1,252 +1,259 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from dataclasses import dataclass, field from datetime import datetime, timezone import logging from typing import Any, Dict, Iterator, List, Set, Type from urllib.error import HTTPError from urllib.parse import urljoin import repomd from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from ..pattern import Lister logger = logging.getLogger(__name__) Release = int Edition = str PkgName = str PkgVersion = str FedoraOrigin = str FedoraPageType = Type[repomd.Repo] """Each page is a list of packages from a given Fedora (release, edition) pair""" def get_editions(release: Release) -> List[Edition]: """Get list of editions for a given release.""" # Ignore dirs that don't contain .rpm files: # Docker,CloudImages,Atomic*,Spins,Live,Cloud_Atomic,Silverblue if release < 20: return ["Everything", "Fedora"] elif release < 28: return ["Everything", "Server", "Workstation"] else: return ["Everything", "Server", "Workstation", "Modular"] def get_last_modified(pkg: repomd.Package) -> datetime: """Get timezone aware last modified time in UTC from RPM package metadata.""" ts = pkg._element.find("common:time", namespaces=repomd._ns).get("build") return datetime.utcfromtimestamp(int(ts)).replace(tzinfo=timezone.utc) def get_checksums(pkg: repomd.Package) -> Dict[str, str]: """Get checksums associated to rpm archive.""" cs = pkg._element.find("common:checksum", namespaces=repomd._ns) cs_type = cs.get("type") if cs_type == "sha": cs_type = "sha1" return {cs_type: cs.text} @dataclass class FedoraListerState: 
"""State of Fedora lister""" package_versions: Dict[PkgName, Set[PkgVersion]] = field(default_factory=dict) """Dictionary mapping a package name to all the versions found during last listing""" class FedoraLister(Lister[FedoraListerState, FedoraPageType]): """ List source packages for given Fedora releases. The lister will create a snapshot for each package name from all its available versions. If a package snapshot is different from the last listing operation, it will be sent to the scheduler that will create a loading task to archive newly found source code. Args: scheduler: instance of SchedulerInterface url: fedora package archives mirror URL releases: list of fedora releases to process """ LISTER_NAME = "fedora" def __init__( self, scheduler: SchedulerInterface, instance: str = "fedora", url: str = "https://archives.fedoraproject.org/pub/archive/fedora/linux/releases/", releases: List[Release] = [34, 35, 36], ): super().__init__( scheduler=scheduler, url=url, instance=instance, credentials={}, ) self.releases = releases self.listed_origins: Dict[FedoraOrigin, ListedOrigin] = {} "will hold all listed origins info" self.origins_to_send: Set[FedoraOrigin] = set() "will hold updated origins since last listing" self.package_versions: Dict[PkgName, Set[PkgVersion]] = {} "will contain the lister state after a call to run" self.last_page = False def state_from_dict(self, d: Dict[str, Any]) -> FedoraListerState: return FedoraListerState(package_versions={k: set(v) for k, v in d.items()}) def state_to_dict(self, state: FedoraListerState) -> Dict[str, Any]: return {k: list(v) for k, v in state.package_versions.items()} def page_request(self, release: Release, edition: Edition) -> FedoraPageType: """Return parsed packages for a given fedora release.""" index_url = urljoin( self.url, f"{release}/{edition}/source/SRPMS/" if release < 24 else f"{release}/{edition}/source/tree/", ) repo = repomd.load(index_url) # throws error if no repomd.xml is not found self.last_page = ( 
release == self.releases[-1] and edition == get_editions(release)[-1] ) logger.debug( "Fetched metadata from url: %s, found %d packages", index_url, len(repo) ) # TODO: Extract more fields like "provides" and "requires" from *primary.xml # as extrinsic metadata using the pkg._element.findtext method return repo def get_pages(self) -> Iterator[FedoraPageType]: """Return an iterator on parsed fedora packages, one page per (release, edition) pair""" for release in self.releases: for edition in get_editions(release): logger.debug("Listing fedora release %s edition %s", release, edition) self.current_release = release self.current_edition = edition try: yield self.page_request(release, edition) except HTTPError as http_error: if http_error.getcode() == 404: logger.debug( "No packages metadata found for fedora release %s edition %s", release, edition, ) continue raise def origin_url_for_package(self, package_name: PkgName) -> FedoraOrigin: """Return the origin url for the given package""" return f"https://src.fedoraproject.org/rpms/{package_name}" def get_origins_from_page(self, page: FedoraPageType) -> Iterator[ListedOrigin]: """Convert a page of fedora package sources into an iterator of ListedOrigin.""" assert self.lister_obj.id is not None origins_to_send = set() # iterate on each package's metadata for pkg_metadata in page: # extract package metadata package_name = pkg_metadata.name - package_version = pkg_metadata.version + package_version = pkg_metadata.vr + package_version_split = package_version.split(".") + if package_version_split[-1].startswith("fc"): + # remove trailing ".fcXY" in version for the rpm loader to avoid + # creating multiple releases targeting the same directory + package_version = ".".join(package_version_split[:-1]) + package_build_time = get_last_modified(pkg_metadata) package_download_path = pkg_metadata.location # build origin url origin_url = self.origin_url_for_package(package_name) # create package version key as expected by the fedora (rpm) 
loader - package_version_key = pkg_metadata.vr + package_version_key = ( + f"fedora{self.current_release}/{self.current_edition}/" + f"{package_version}" + ).lower() # this is the first time a package is listed if origin_url not in self.listed_origins: # create a ListedOrigin object for it that can be later # updated with new package versions info self.listed_origins[origin_url] = ListedOrigin( lister_id=self.lister_obj.id, url=origin_url, visit_type="rpm", extra_loader_arguments={"packages": {}}, last_update=package_build_time, ) # init set that will contain all listed package versions self.package_versions[package_name] = set() # origin will be yielded at the end of that method origins_to_send.add(origin_url) # update package metadata in parameter that will be provided # to the rpm loader self.listed_origins[origin_url].extra_loader_arguments["packages"][ package_version_key ] = { "name": package_name, "version": package_version, "url": urljoin(page.baseurl, package_download_path), - "release": self.current_release, - "edition": self.current_edition, "buildTime": package_build_time.isoformat(), "checksums": get_checksums(pkg_metadata), } last_update = self.listed_origins[origin_url].last_update if last_update is not None and package_build_time > last_update: self.listed_origins[origin_url].last_update = package_build_time # add package version key to the set of found versions self.package_versions[package_name].add(package_version_key) # package has already been listed during a previous listing process if package_name in self.state.package_versions: new_versions = ( self.package_versions[package_name] - self.state.package_versions[package_name] ) # no new versions so far, no need to send the origin to the scheduler if not new_versions: origins_to_send.remove(origin_url) logger.debug( "Found %s packages to update (new ones or packages with new versions).", len(origins_to_send), ) logger.debug( "Current total number of listed packages is equal to %s.", 
len(self.listed_origins), ) # yield from origins_to_send.values() self.origins_to_send.update(origins_to_send) if self.last_page: # yield listed origins when all fedora releases and editions processed yield from [ self.listed_origins[origin_url] for origin_url in self.origins_to_send ] def finalize(self): # set mapping between listed package names and versions as lister state self.state.package_versions = self.package_versions self.updated = len(self.listed_origins) > 0 diff --git a/swh/lister/fedora/tests/test_lister.py b/swh/lister/fedora/tests/test_lister.py index 693abe6..dc09359 100644 --- a/swh/lister/fedora/tests/test_lister.py +++ b/swh/lister/fedora/tests/test_lister.py @@ -1,231 +1,221 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from io import StringIO from pathlib import Path from typing import List from unittest.mock import MagicMock from urllib.error import HTTPError import pytest from swh.lister.fedora.lister import FedoraLister, Release, get_editions from swh.scheduler.interface import SchedulerInterface def mock_repomd(datadir, mocker, use_altered_fedora36=False): """Mocks the .xml files fetched by repomd for the next lister run""" paths = ["repomd26.xml", "primary26.xml.gz", "repomd36.xml", "primary36.xml.gz"] if use_altered_fedora36: paths[3] = "primary36-altered.xml.gz" cm = MagicMock() cm.read.side_effect = [ Path(datadir, "archives.fedoraproject.org", path).read_bytes() for path in paths ] cm.__enter__.return_value = cm mocker.patch("repomd.urllib.request.urlopen").return_value = cm def rpm_url(release, path): return ( "https://archives.fedoraproject.org/pub/archive/fedora/linux/releases/" f"{release}/Everything/source/tree/Packages/{path}" ) @pytest.fixture def pkg_versions(): return { "https://src.fedoraproject.org/rpms/0install": { - 
"2.11-4.fc26": { + "fedora26/everything/2.11-4": { "name": "0install", - "version": "2.11", - "release": 26, - "edition": "Everything", + "version": "2.11-4", "buildTime": "2017-02-10T04:59:31+00:00", "url": rpm_url(26, "0/0install-2.11-4.fc26.src.rpm"), "checksums": { # note: we intentionally altered the original # primary26.xml file to test sha1 usage "sha1": "a6fdef5d1026dea208eeeba148f55ac2f545989b", }, } }, "https://src.fedoraproject.org/rpms/0xFFFF": { - "0.3.9-15.fc26": { + "fedora26/everything/0.3.9-15": { "name": "0xFFFF", - "version": "0.3.9", - "release": 26, - "edition": "Everything", + "version": "0.3.9-15", "buildTime": "2017-02-10T05:01:53+00:00", "url": rpm_url(26, "0/0xFFFF-0.3.9-15.fc26.src.rpm"), "checksums": { "sha256": "96f9c163c0402d2b30e5343c8397a6d50e146c85a446804396b119ef9698231f" }, }, - "0.9-4.fc36": { + "fedora36/everything/0.9-4": { "name": "0xFFFF", - "version": "0.9", - "release": 36, - "edition": "Everything", + "version": "0.9-4", "buildTime": "2022-01-19T19:13:53+00:00", "url": rpm_url(36, "0/0xFFFF-0.9-4.fc36.src.rpm"), "checksums": { "sha256": "45eee8d990d502324ae665233c320b8a5469c25d735f1862e094c1878d6ff2cd" }, }, }, "https://src.fedoraproject.org/rpms/2ping": { - "4.5.1-2.fc36": { + "fedora36/everything/4.5.1-2": { "name": "2ping", - "version": "4.5.1", - "release": 36, - "edition": "Everything", + "version": "4.5.1-2", "buildTime": "2022-01-19T19:12:21+00:00", "url": rpm_url(36, "2/2ping-4.5.1-2.fc36.src.rpm"), "checksums": { "sha256": "2ce028d944ebea1cab8c6203c9fed882792478b42fc34682b886a9db16e9de28" }, } }, } def run_lister( swh_scheduler: SchedulerInterface, releases: List[Release], pkg_versions: dict, origin_count: int, updated: bool = True, ): """Runs the lister and tests that the listed origins are correct.""" lister = FedoraLister(scheduler=swh_scheduler, releases=releases) stats = lister.run() scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results lister_state = 
lister.get_state_from_scheduler() state_pkg_versions = {k.split("/")[-1]: set(v) for k, v in pkg_versions.items()} # One edition from each release (we mocked get_editions) assert stats.pages == (len(releases) if updated else 0) assert stats.origins == origin_count assert { o.url: o.extra_loader_arguments["packages"] for o in scheduler_origins } == pkg_versions assert lister_state.package_versions == state_pkg_versions assert lister.updated == updated def test_get_editions(): assert get_editions(18) == ["Everything", "Fedora"] assert get_editions(26) == ["Everything", "Server", "Workstation"] assert get_editions(34) == ["Everything", "Server", "Workstation", "Modular"] @pytest.mark.parametrize("status_code", [400, 404, 500]) def test_fedora_lister_http_error( swh_scheduler: SchedulerInterface, mocker: MagicMock, status_code: int ): """ Simulates handling of HTTP Errors while fetching of packages for fedora releases. """ releases = [18] is_404 = status_code == 404 def side_effect(url): if is_404: raise HTTPError( url, status_code, "Not Found", {"content-type": "text/html"}, StringIO() ) else: raise HTTPError( url, status_code, "Internal server error", {"content-type": "text/html"}, StringIO(), ) urlopen_patch = mocker.patch("repomd.urllib.request.urlopen") urlopen_patch.side_effect = side_effect expected_pkgs: dict = {} if is_404: run_lister( swh_scheduler, releases, expected_pkgs, origin_count=0, updated=False ) else: with pytest.raises(HTTPError): run_lister( swh_scheduler, releases, expected_pkgs, origin_count=0, updated=False ) def test_full_lister_fedora( swh_scheduler: SchedulerInterface, mocker: MagicMock, datadir: Path, pkg_versions: dict, ): """ Simulates a full listing of packages for fedora releases. 
""" releases = [26, 36] get_editions_patch = mocker.patch("swh.lister.fedora.lister.get_editions") get_editions_patch.return_value = ["Everything"] mock_repomd(datadir, mocker) run_lister(swh_scheduler, releases, pkg_versions, origin_count=3) def test_incremental_lister( swh_scheduler: SchedulerInterface, mocker: MagicMock, datadir: Path, pkg_versions: dict, ): """ Simulates an incremental listing of packages for fedora releases. """ releases = [26, 36] get_editions_patch = mocker.patch("swh.lister.fedora.lister.get_editions") get_editions_patch.return_value = ["Everything"] # First run mock_repomd(datadir, mocker) run_lister(swh_scheduler, releases, pkg_versions, origin_count=3) # Second run (no updates) mock_repomd(datadir, mocker) run_lister(swh_scheduler, releases, pkg_versions, origin_count=0) # Use an altered version of primary36.xml in which we updated the version # of package 0xFFFF to 0.10: mock_repomd(datadir, mocker, use_altered_fedora36=True) # Add new version to the set of expected pkg versions: pkg_versions["https://src.fedoraproject.org/rpms/0xFFFF"].update( { - "0.10-4.fc36": { + "fedora36/everything/0.10-4": { "name": "0xFFFF", - "version": "0.10", - "release": 36, - "edition": "Everything", + "version": "0.10-4", "buildTime": "2022-01-19T19:13:53+00:00", "url": rpm_url(36, "0/0xFFFF-0.10-4.fc36.src.rpm"), "checksums": { "sha256": "45eee8d990d502324ae665233c320b8a5469c25d735f1862e094c1878d6ff2cd" }, } } ) # Third run (0xFFFF in fedora36 editions got updated and it needs to be listed) run_lister(swh_scheduler, releases, pkg_versions, origin_count=1)