diff --git a/README.md b/README.md index f54483f..4b89ee6 100644 --- a/README.md +++ b/README.md @@ -1,104 +1,105 @@ swh-lister ========== This component from the Software Heritage stack aims to produce listings of software origins and their urls hosted on various public developer platforms or package managers. As these operations are quite similar, it provides a set of Python modules abstracting common software origins listing behaviors. It also provides several lister implementations, contained in the following Python modules: - `swh.lister.bitbucket` - `swh.lister.cgit` - `swh.lister.cran` - `swh.lister.debian` - `swh.lister.gitea` - `swh.lister.github` - `swh.lister.gitlab` - `swh.lister.gnu` - `swh.lister.golang` - `swh.lister.launchpad` - `swh.lister.maven` - `swh.lister.npm` - `swh.lister.packagist` - `swh.lister.phabricator` - `swh.lister.pypi` - `swh.lister.tuleap` - `swh.lister.gogs` +- `swh.lister.fedora` Dependencies ------------ All required dependencies can be found in the `requirements*.txt` files located at the root of the repository. Local deployment ---------------- ## lister configuration Each lister implemented so far by Software Heritage (`bitbucket`, `cgit`, `cran`, `debian`, `gitea`, `github`, `gitlab`, `gnu`, `golang`, `launchpad`, `npm`, `packagist`, `phabricator`, `pypi`, `tuleap`, `maven`) must be configured by following the instructions below (please note that you have to replace `<lister_name>` by one of the lister name introduced above). ### Preparation steps 1. `mkdir ~/.config/swh/` 2. 
create configuration file `~/.config/swh/listers.yml` ### Configuration file sample Minimalistic configuration shared by all listers to add in file `~/.config/swh/listers.yml`: ```lang=yml scheduler: cls: 'remote' args: url: 'http://localhost:5008/' credentials: {} ``` Note: This expects scheduler (5008) service to run locally ## Executing a lister Once configured, a lister can be executed by using the `swh` CLI tool with the following options and commands: ``` $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister <lister_name> [lister_parameters] ``` Examples: ``` $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister bitbucket $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister cran $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister gitea url=https://codeberg.org/api/v1/ $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister gitlab url=https://salsa.debian.org/api/v4/ $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister npm $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister pypi ``` Licensing --------- This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. See top-level LICENSE file for the full text of the GNU General Public License along with this program. 
def parse_requirements(name=None):
    """Return the requirement specifiers listed in a requirements file.

    Args:
        name: optional suffix selecting ``requirements-<name>.txt``; a falsy
            value selects the plain ``requirements.txt``.

    Returns:
        The stripped, non-empty, non-comment lines of the file, in file order.
        An empty list is returned when the file does not exist. The file name
        is used as given, so it is looked up relative to the current working
        directory.
    """
    filename = "requirements-%s.txt" % name if name else "requirements.txt"
    if not path.exists(filename):
        return []

    with open(filename) as req_file:
        candidates = [raw.strip() for raw in req_file]
    # Blank lines and "#" comment lines carry no requirement.
    return [entry for entry in candidates if entry and not entry.startswith("#")]
def register():
    """Describe the Fedora lister plugin.

    This is the callable referenced by the ``lister.fedora`` entry of the
    ``swh.workers`` entry-point group in setup.py.

    Returns:
        A dict holding the lister class under ``"lister"`` and the list of
        task module names under ``"task_modules"``.
    """
    # Imported here rather than at module level, so importing the package
    # alone does not import the lister module.
    from .lister import FedoraLister

    task_modules = [f"{__name__}.tasks"]
    return {"lister": FedoraLister, "task_modules": task_modules}
from dataclasses import dataclass, field
from datetime import datetime, timezone
import logging
from typing import Any, Dict, Iterator, List, Optional, Set, Tuple, Type
from urllib.error import HTTPError
from urllib.parse import urljoin

import repomd

from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin

from ..pattern import Lister

logger = logging.getLogger(__name__)


Release = int
Edition = str
PkgName = str
PkgVersion = str
FedoraOrigin = str
# NOTE(review): pages are the objects returned by repomd.load(), i.e.
# presumably Repo instances rather than the Repo class itself -- confirm and
# consider dropping the Type[...] wrapper.
FedoraPageType = Type[repomd.Repo]
"""Each page is a list of packages from a given Fedora (release, edition) pair"""


def get_editions(release: Release) -> List[Edition]:
    """Get list of editions for a given release.

    Releases before 20 map to "Everything" and "Fedora", releases 20 to 27 to
    "Everything", "Server" and "Workstation", and later releases additionally
    include "Modular".
    """
    # Ignore dirs that don't contain .rpm files:
    # Docker,CloudImages,Atomic*,Spins,Live,Cloud_Atomic,Silverblue

    if release < 20:
        return ["Everything", "Fedora"]
    elif release < 28:
        return ["Everything", "Server", "Workstation"]
    else:
        return ["Everything", "Server", "Workstation", "Modular"]


def get_last_modified(pkg: repomd.Package) -> datetime:
    """Get timezone aware last modified time in UTC from RPM package metadata.

    The value is read from the ``build`` attribute of the package's
    ``common:time`` element and interpreted as a POSIX timestamp.
    """
    ts = pkg._element.find("common:time", namespaces=repomd._ns).get("build")
    # Build the aware datetime directly: datetime.utcfromtimestamp() returns a
    # naive object and is deprecated since Python 3.12.
    return datetime.fromtimestamp(int(ts), tz=timezone.utc)


def get_checksums(pkg: repomd.Package) -> Dict[str, str]:
    """Get checksums associated to rpm archive.

    Returns a single-entry dict mapping the checksum type found in the
    package's ``common:checksum`` element to its value; the type ``"sha"`` is
    reported as ``"sha1"``.
    """
    cs = pkg._element.find("common:checksum", namespaces=repomd._ns)
    cs_type = cs.get("type")
    if cs_type == "sha":
        cs_type = "sha1"
    return {cs_type: cs.text}


@dataclass
class FedoraListerState:
    """State of Fedora lister"""

    package_versions: Dict[PkgName, Set[PkgVersion]] = field(default_factory=dict)
    """Dictionary mapping a package name to all the versions found during
    last listing"""


class FedoraLister(Lister[FedoraListerState, FedoraPageType]):
    """
    List source packages for given Fedora releases.

    The lister will create a snapshot for each package name from all its
    available versions.

    If a package snapshot is different from the last listing operation,
    it will be sent to the scheduler that will create a loading task
    to archive newly found source code.

    Args:
        scheduler: instance of SchedulerInterface
        instance: lister instance name, forwarded to the base lister
        url: fedora package archives mirror URL
        releases: list of fedora releases to process; when omitted, releases
            34, 35 and 36 are listed
    """

    LISTER_NAME = "fedora"

    def __init__(
        self,
        scheduler: SchedulerInterface,
        instance: str = "fedora",
        url: str = "https://archives.fedoraproject.org/pub/archive/fedora/linux/releases/",
        releases: Optional[List[Release]] = None,
    ):
        super().__init__(
            scheduler=scheduler,
            url=url,
            instance=instance,
            credentials={},
        )

        # A fresh list per instance: a list literal used as the default value
        # would be one object shared by every lister built without `releases`.
        self.releases: List[Release] = (
            [34, 35, 36] if releases is None else list(releases)
        )

        # will hold all listed origins info
        self.listed_origins: Dict[FedoraOrigin, ListedOrigin] = {}
        # will hold updated origins since last listing
        self.origins_to_send: Set[FedoraOrigin] = set()
        # will contain the lister state after a call to run
        self.package_versions: Dict[PkgName, Set[PkgVersion]] = {}
        # set by get_pages() to True only for the last page it yields
        self.last_page = False

    def state_from_dict(self, d: Dict[str, Any]) -> FedoraListerState:
        """Build the lister state from its serialized form (lists become sets)."""
        return FedoraListerState(package_versions={k: set(v) for k, v in d.items()})

    def state_to_dict(self, state: FedoraListerState) -> Dict[str, Any]:
        """Serialize the lister state (sets of versions become lists)."""
        return {k: list(v) for k, v in state.package_versions.items()}

    def page_request(self, release: Release, edition: Edition) -> FedoraPageType:
        """Return parsed packages for a given fedora release and edition.

        The repository index is looked up under ``source/SRPMS/`` for releases
        before 24 and under ``source/tree/`` otherwise. Errors raised by
        ``repomd.load`` are propagated to the caller; get_pages() catches
        ``urllib.error.HTTPError`` from this call.
        """
        index_url = urljoin(
            self.url,
            f"{release}/{edition}/source/SRPMS/"
            if release < 24
            else f"{release}/{edition}/source/tree/",
        )

        repo = repomd.load(index_url)  # raises an error if repomd.xml is not found

        logger.debug(
            "Fetched metadata from url: %s, found %d packages", index_url, len(repo)
        )
        # TODO: Extract more fields like "provides" and "requires" from *primary.xml
        # as extrinsic metadata using the pkg._element.findtext method
        return repo

    def _activate_page(
        self,
        fetched: Tuple[Release, Edition, FedoraPageType],
        last_page: bool,
    ) -> FedoraPageType:
        """Record which (release, edition) page is about to be processed."""
        release, edition, page = fetched
        self.current_release = release
        self.current_edition = edition
        self.last_page = last_page
        return page

    def get_pages(self) -> Iterator[FedoraPageType]:
        """Return an iterator on parsed fedora packages, one page per (release, edition) pair

        Pairs whose metadata request fails with a 404 are skipped; any other
        HTTP error is re-raised.
        """
        # get_origins_from_page() only yields origins when `last_page` is set.
        # Which page is the last one is unknown until every remaining request
        # has either succeeded or returned 404, so the most recently fetched
        # page is held back until the next successful fetch (or the end of the
        # loop). Deciding from the configured (release, edition) order instead
        # would never flush the origins when the final pair answers 404.
        held: Optional[Tuple[Release, Edition, FedoraPageType]] = None

        for release in self.releases:
            for edition in get_editions(release):
                logger.debug("Listing fedora release %s edition %s", release, edition)
                try:
                    page = self.page_request(release, edition)
                except HTTPError as http_error:
                    if http_error.getcode() == 404:
                        logger.debug(
                            "No packages metadata found for fedora release %s edition %s",
                            release,
                            edition,
                        )
                        continue
                    raise

                if held is not None:
                    yield self._activate_page(held, last_page=False)
                held = (release, edition, page)

        if held is not None:
            yield self._activate_page(held, last_page=True)

    def origin_url_for_package(self, package_name: PkgName) -> FedoraOrigin:
        """Return the origin url for the given package"""
        return f"https://src.fedoraproject.org/rpms/{package_name}"

    def get_origins_from_page(self, page: FedoraPageType) -> Iterator[ListedOrigin]:
        """Convert a page of fedora package sources into an iterator of ListedOrigin.

        Origins are accumulated across pages and only yielded while processing
        the page flagged as last by get_pages(), so that each yielded origin
        carries the versions found in all releases and editions.
        """
        assert self.lister_obj.id is not None

        origins_to_send = set()

        # iterate on each package's metadata
        for pkg_metadata in page:
            # extract package metadata
            package_name = pkg_metadata.name
            package_version = pkg_metadata.version
            package_build_time = get_last_modified(pkg_metadata)
            package_download_path = pkg_metadata.location

            # build origin url
            origin_url = self.origin_url_for_package(package_name)
            # create package version key as expected by the fedora (rpm) loader
            package_version_key = pkg_metadata.vr

            # this is the first time a package is listed
            if origin_url not in self.listed_origins:
                # create a ListedOrigin object for it that can be later
                # updated with new package versions info
                self.listed_origins[origin_url] = ListedOrigin(
                    lister_id=self.lister_obj.id,
                    url=origin_url,
                    visit_type="rpm",
                    extra_loader_arguments={"packages": {}},
                    last_update=package_build_time,
                )

                # init set that will contain all listed package versions
                self.package_versions[package_name] = set()

            # origin will be yielded at the end of that method
            origins_to_send.add(origin_url)

            # update package metadata in parameter that will be provided
            # to the rpm loader
            self.listed_origins[origin_url].extra_loader_arguments["packages"][
                package_version_key
            ] = {
                "name": package_name,
                "version": package_version,
                "url": urljoin(page.baseurl, package_download_path),
                "release": self.current_release,
                "edition": self.current_edition,
                "buildTime": package_build_time.isoformat(),
                "checksums": get_checksums(pkg_metadata),
            }

            last_update = self.listed_origins[origin_url].last_update
            if last_update is not None and package_build_time > last_update:
                self.listed_origins[origin_url].last_update = package_build_time

            # add package version key to the set of found versions
            self.package_versions[package_name].add(package_version_key)

            # package has already been listed during a previous listing process
            if package_name in self.state.package_versions:
                new_versions = (
                    self.package_versions[package_name]
                    - self.state.package_versions[package_name]
                )
                # no new versions so far, no need to send the origin to the scheduler
                if not new_versions:
                    origins_to_send.discard(origin_url)

        logger.debug(
            "Found %s packages to update (new ones or packages with new versions).",
            len(origins_to_send),
        )
        logger.debug(
            "Current total number of listed packages is equal to %s.",
            len(self.listed_origins),
        )

        self.origins_to_send.update(origins_to_send)

        if self.last_page:
            # yield listed origins when all fedora releases and editions processed
            yield from [
                self.listed_origins[origin_url] for origin_url in self.origins_to_send
            ]

    def finalize(self):
        """Store the listed versions as lister state and flag whether anything was listed."""
        # set mapping between listed package names and versions as lister state
        self.state.package_versions = self.package_versions
        self.updated = len(self.listed_origins) > 0
# Copyright (C) 2022 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

from typing import Dict

from celery import shared_task

from .lister import FedoraLister


@shared_task(name=f"{__name__}.FullFedoraRelister")
def list_fedora_full(**lister_args) -> Dict[str, int]:
    """Full update of a Fedora instance

    The keyword arguments are handed to ``FedoraLister.from_configfile``; the
    statistics returned by the lister run are returned as a dict.
    """
    fedora_lister = FedoraLister.from_configfile(**lister_args)
    stats = fedora_lister.run()
    return stats.dict()


@shared_task(name=f"{__name__}.ping")
def _ping() -> str:
    """Trivial task answering "OK"."""
    return "OK"
b/swh/lister/fedora/tests/data/archives.fedoraproject.org/repomd26.xml new file mode 100644 index 0000000..ab786c1 --- /dev/null +++ b/swh/lister/fedora/tests/data/archives.fedoraproject.org/repomd26.xml @@ -0,0 +1,55 @@ + + + 1499286311 + + 4f677623c24912d86848f86837d398979b5adc2a51d9a2170f11fe42a257f3d3 + db616ad8e4219e23dfc05cd515e017cdc0d59144689ac606951fa42cbb06ae65 + + 1499286305 + 5425131 + 30064034 + + + 17296af99a4b80bc67fccabe71ecefa02b76e8409372d936c054b8c9de312b6c + 7caabd1205a72d26422756211dcd536336cef643f7f73eb15a470b02ff09a194 + + 1499286305 + 1650273 + 6419422 + + + 8f1ed139aeaa57f5bc280ce97b82f690e4008c122b4793791ca18e513268b6eb + 786b8d4fa759f0ade3eaab1bde390d12c950dfe217eda1773400f3a3d461522b + + 1499286305 + 4396102 + 33165783 + + + 1d2c0be48c35e55669b410cb4dbe767ae4850b4c610e95ca9aee67f7eb31e457 + dc8dbac072ac1412f0ecface57fa57c5ddcac14acc880fe9b467164be733e963 + + 1499286309 + 7071217 + 26177536 + 10 + + + 5e1259759b9bedefc1ff14b81760524841402776e6c1b33014f4f5d6feb40d11 + b293d51dd4e6eb4128e40b6ce228c62b169b1d47be535e56f69b8ad622c4a6ca + + 1499286307 + 2227395 + 5529600 + 10 + + + f6b30bdfe96d2137542704288de1345c01ea14397eb187126d4474648bad5292 + 3f5d4619dcabe945b773c1c98ea40b8ead53340291bd504ab3faabfc7b57bb99 + + 1499286311 + 5264843 + 27930624 + 10 + + diff --git a/swh/lister/fedora/tests/data/archives.fedoraproject.org/repomd36.xml b/swh/lister/fedora/tests/data/archives.fedoraproject.org/repomd36.xml new file mode 100644 index 0000000..12a9a76 --- /dev/null +++ b/swh/lister/fedora/tests/data/archives.fedoraproject.org/repomd36.xml @@ -0,0 +1,85 @@ + + + 1651698851 + + 42155056c6d7b1f0e5437bb2a92c48e6d21a02ee8f09acc726e705c26e960a3c + a5841e7086be579d58e2dbb7628caebba32d9defa85739455d518bfaf90e39b0 + + 1651698827 + 7144060 + 45898728 + + + fc915adcdf5710f9f80dfffcec8f03088f09cf80fbc9c801d5a8f45f1f31bb92 + a96a4739268e250e3c3461da716472503ed5ed8b27161fec9a143d4a8ccf5767 + + 1651698827 + 1934835 + 7458268 + + + 
461db9fa87e564d75d74c0dfbf006ea5d18ed646d4cb8dee1c69a4d95dd08d09 + 1733c3011a0323fadac711dd25176c9934698176605c3e516b6aabb9b5775e00 + + 1651698827 + 3779969 + 33166564 + + + ac60dd254bfc7557eb646a116bf8083b49fee8e942e1ef50dff7f74004897e74 + c752f5132f2cc5f4f137dade787154316f9503ae816212b8fabf5733cc2d344d + + 1651698851 + 9058624 + 41562112 + 10 + + + 1a279b88531d9c2e24c0bfc9a0d6b4357d70301c24fa42f649c726ed1af1d6a8 + e9b5c17e6004a78d20146aa54fa5ac93a01f4f2a95117588d649e92cfc008473 + + 1651698834 + 1809496 + 6471680 + 10 + + + 850ad17efdebe5f9ccbef03c8aec4e7589bb6a1ca9a6249578968d60ad094a4f + d13c6da8f7ad2c9060fd5b811b86facc9e926ec9273c0e135c4fe1110f784cdc + + 1651698838 + 4285108 + 27897856 + 10 + + + fc4205cf1cca7f0c157d1aa9a1348a1742ca7df671fbf7ccccd79221d473145b + a5841e7086be579d58e2dbb7628caebba32d9defa85739455d518bfaf90e39b0 + 2074f3da25ad0d45cf2776ad35dd22a6c63fafff319143c2f7dfefa98b99d651 + + 1651698828 + 6030441 + 45898728 + 231 + + + 6c77673bb8823bf04fd4520c421fd0fc84567db9f23b8aa19f600b0688e46dd9 + a96a4739268e250e3c3461da716472503ed5ed8b27161fec9a143d4a8ccf5767 + 55fc5e75acd903f01cf18328fec9c6f995bd8f80c5b085aa3e0fe116bb89e891 + + 1651698829 + 1735208 + 7458268 + 136 + + + c87c1b085ef287ba69b1f244d3fff56fc5efc01ffd1d7c10ee22328117651cd5 + 1733c3011a0323fadac711dd25176c9934698176605c3e516b6aabb9b5775e00 + 93624d227c24ff4eb2332fcb038e7157e08ed051b654820def75c5511a1ce191 + + 1651698829 + 3019451 + 33166564 + 206 + + diff --git a/swh/lister/fedora/tests/test_lister.py b/swh/lister/fedora/tests/test_lister.py new file mode 100644 index 0000000..693abe6 --- /dev/null +++ b/swh/lister/fedora/tests/test_lister.py @@ -0,0 +1,231 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from io import StringIO +from pathlib import Path +from typing import List +from 
from io import StringIO
from pathlib import Path
from typing import List
from unittest.mock import MagicMock
from urllib.error import HTTPError

import pytest

from swh.lister.fedora.lister import FedoraLister, Release, get_editions
from swh.scheduler.interface import SchedulerInterface


def mock_repomd(datadir, mocker, use_altered_fedora36=False):
    """Mocks the .xml files fetched by repomd for the next lister run"""
    primary36 = "primary36-altered.xml.gz" if use_altered_fedora36 else "primary36.xml.gz"
    filenames = ["repomd26.xml", "primary26.xml.gz", "repomd36.xml", primary36]
    data_root = Path(datadir, "archives.fedoraproject.org")

    # Each successive read() on the mocked response returns the next file.
    response = MagicMock()
    response.read.side_effect = [
        (data_root / filename).read_bytes() for filename in filenames
    ]
    response.__enter__.return_value = response
    mocker.patch("repomd.urllib.request.urlopen").return_value = response


def rpm_url(release, path):
    """Return the expected download URL of a source rpm in the Everything edition."""
    base = "https://archives.fedoraproject.org/pub/archive/fedora/linux/releases/"
    return f"{base}{release}/Everything/source/tree/Packages/{path}"


@pytest.fixture
def pkg_versions():
    """Expected ``packages`` loader argument of each origin, keyed by origin URL."""
    return {
        "https://src.fedoraproject.org/rpms/0install": {
            "2.11-4.fc26": {
                "name": "0install",
                "version": "2.11",
                "release": 26,
                "edition": "Everything",
                "buildTime": "2017-02-10T04:59:31+00:00",
                "url": rpm_url(26, "0/0install-2.11-4.fc26.src.rpm"),
                "checksums": {
                    # note: we intentionally altered the original
                    # primary26.xml file to test sha1 usage
                    "sha1": "a6fdef5d1026dea208eeeba148f55ac2f545989b",
                },
            }
        },
        "https://src.fedoraproject.org/rpms/0xFFFF": {
            "0.3.9-15.fc26": {
                "name": "0xFFFF",
                "version": "0.3.9",
                "release": 26,
                "edition": "Everything",
                "buildTime": "2017-02-10T05:01:53+00:00",
                "url": rpm_url(26, "0/0xFFFF-0.3.9-15.fc26.src.rpm"),
                "checksums": {
                    "sha256": "96f9c163c0402d2b30e5343c8397a6d50e146c85a446804396b119ef9698231f"
                },
            },
            "0.9-4.fc36": {
                "name": "0xFFFF",
                "version": "0.9",
                "release": 36,
                "edition": "Everything",
                "buildTime": "2022-01-19T19:13:53+00:00",
                "url": rpm_url(36, "0/0xFFFF-0.9-4.fc36.src.rpm"),
                "checksums": {
                    "sha256": "45eee8d990d502324ae665233c320b8a5469c25d735f1862e094c1878d6ff2cd"
                },
            },
        },
        "https://src.fedoraproject.org/rpms/2ping": {
            "4.5.1-2.fc36": {
                "name": "2ping",
                "version": "4.5.1",
                "release": 36,
                "edition": "Everything",
                "buildTime": "2022-01-19T19:12:21+00:00",
                "url": rpm_url(36, "2/2ping-4.5.1-2.fc36.src.rpm"),
                "checksums": {
                    "sha256": "2ce028d944ebea1cab8c6203c9fed882792478b42fc34682b886a9db16e9de28"
                },
            }
        },
    }


def run_lister(
    swh_scheduler: SchedulerInterface,
    releases: List[Release],
    pkg_versions: dict,
    origin_count: int,
    updated: bool = True,
):
    """Runs the lister and tests that the listed origins are correct."""
    lister = FedoraLister(scheduler=swh_scheduler, releases=releases)

    stats = lister.run()
    recorded = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
    saved_state = lister.get_state_from_scheduler()

    # The state is keyed by package name, i.e. the last segment of the origin URL.
    expected_state = {
        origin_url.split("/")[-1]: set(versions)
        for origin_url, versions in pkg_versions.items()
    }
    # One edition from each release (we mocked get_editions)
    expected_pages = len(releases) if updated else 0
    recorded_packages = {
        origin.url: origin.extra_loader_arguments["packages"] for origin in recorded
    }

    assert stats.pages == expected_pages
    assert stats.origins == origin_count

    assert recorded_packages == pkg_versions

    assert saved_state.package_versions == expected_state
    assert lister.updated == updated


def test_get_editions():
    """One release from each of the three edition ranges."""
    assert get_editions(18) == ["Everything", "Fedora"]
    assert get_editions(26) == ["Everything", "Server", "Workstation"]
    assert get_editions(34) == ["Everything", "Server", "Workstation", "Modular"]


@pytest.mark.parametrize("status_code", [400, 404, 500])
def test_fedora_lister_http_error(
    swh_scheduler: SchedulerInterface, mocker: MagicMock, status_code: int
):
    """
    Simulates handling of HTTP Errors while fetching of packages for fedora releases.
    """
    releases = [18]

    is_404 = status_code == 404

    def raise_http_error(url):
        reason = "Not Found" if is_404 else "Internal server error"
        raise HTTPError(
            url, status_code, reason, {"content-type": "text/html"}, StringIO()
        )

    mocker.patch("repomd.urllib.request.urlopen").side_effect = raise_http_error

    expected_pkgs: dict = {}

    if not is_404:
        # Anything but a 404 must propagate out of the lister run.
        with pytest.raises(HTTPError):
            run_lister(
                swh_scheduler, releases, expected_pkgs, origin_count=0, updated=False
            )
        return

    # A 404 is skipped: nothing gets listed.
    run_lister(swh_scheduler, releases, expected_pkgs, origin_count=0, updated=False)


def test_full_lister_fedora(
    swh_scheduler: SchedulerInterface,
    mocker: MagicMock,
    datadir: Path,
    pkg_versions: dict,
):
    """
    Simulates a full listing of packages for fedora releases.
    """
    releases = [26, 36]

    mocker.patch("swh.lister.fedora.lister.get_editions").return_value = ["Everything"]

    mock_repomd(datadir, mocker)
    run_lister(swh_scheduler, releases, pkg_versions, origin_count=3)


def test_incremental_lister(
    swh_scheduler: SchedulerInterface,
    mocker: MagicMock,
    datadir: Path,
    pkg_versions: dict,
):
    """
    Simulates an incremental listing of packages for fedora releases.
    """
    releases = [26, 36]

    mocker.patch("swh.lister.fedora.lister.get_editions").return_value = ["Everything"]

    # First run
    mock_repomd(datadir, mocker)
    run_lister(swh_scheduler, releases, pkg_versions, origin_count=3)
    # Second run (no updates)
    mock_repomd(datadir, mocker)
    run_lister(swh_scheduler, releases, pkg_versions, origin_count=0)

    # Use an altered version of primary36.xml in which we updated the version
    # of package 0xFFFF to 0.10:
    mock_repomd(datadir, mocker, use_altered_fedora36=True)
    # Add new version to the set of expected pkg versions:
    new_version = {
        "0.10-4.fc36": {
            "name": "0xFFFF",
            "version": "0.10",
            "release": 36,
            "edition": "Everything",
            "buildTime": "2022-01-19T19:13:53+00:00",
            "url": rpm_url(36, "0/0xFFFF-0.10-4.fc36.src.rpm"),
            "checksums": {
                "sha256": "45eee8d990d502324ae665233c320b8a5469c25d735f1862e094c1878d6ff2cd"
            },
        }
    }
    pkg_versions["https://src.fedoraproject.org/rpms/0xFFFF"].update(new_version)

    # Third run (0xFFFF in fedora36 editions got updated and it needs to be listed)
    run_lister(swh_scheduler, releases, pkg_versions, origin_count=1)
# --- swh/lister/fedora/tests/test_tasks.py ---

from unittest.mock import patch

from swh.lister.pattern import ListerStats


def test_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker):
    """The ping task answers "OK"."""
    res = swh_scheduler_celery_app.send_task("swh.lister.fedora.tasks.ping")
    assert res
    res.wait()
    assert res.successful()
    assert res.result == "OK"


@patch("swh.lister.fedora.tasks.FedoraLister")
def test_full_listing(lister, swh_scheduler_celery_app, swh_scheduler_celery_worker):
    """The full listing task forwards its kwargs to from_configfile and runs the lister."""
    lister.from_configfile.return_value = lister
    lister.run.return_value = ListerStats(pages=10, origins=500)

    kwargs = dict(
        url="https://archives.fedoraproject.org/pub/archive/fedora/linux/releases/"
    )
    res = swh_scheduler_celery_app.send_task(
        "swh.lister.fedora.tasks.FullFedoraRelister",
        kwargs=kwargs,
    )
    assert res
    res.wait()
    assert res.successful()

    lister.from_configfile.assert_called_once_with(**kwargs)
    lister.run.assert_called_once_with()


@patch("swh.lister.fedora.tasks.FedoraLister")
def test_full_listing_params(
    lister, swh_scheduler_celery_app, swh_scheduler_celery_worker
):
    """Same as test_full_listing, with the optional lister parameters set."""
    lister.from_configfile.return_value = lister
    lister.run.return_value = ListerStats(pages=10, origins=500)

    kwargs = dict(
        url="https://archives.fedoraproject.org/pub/archive/fedora/linux/releases/",
        instance="archives.fedoraproject.org",
        # Releases are integers (the lister compares them with `<`); the
        # mocked lister would accept anything, so use a realistic value.
        releases=[36],
    )
    res = swh_scheduler_celery_app.send_task(
        "swh.lister.fedora.tasks.FullFedoraRelister",
        kwargs=kwargs,
    )
    assert res
    res.wait()
    assert res.successful()

    lister.from_configfile.assert_called_once_with(**kwargs)
    lister.run.assert_called_once_with()


# --- swh/lister/tests/test_cli.py ---

import pytest

from swh.lister.cli import SUPPORTED_LISTERS, get_lister

# Extra constructor arguments needed to instantiate some listers, by lister name.
lister_args = {
    "cgit": {
        "url": "https://git.eclipse.org/c/",
    },
    "phabricator": {
        "instance": "softwareheritage",
        "url": "https://forge.softwareheritage.org/api/diffusion.repository.search",
        "api_token": "bogus",
    },
    "gitea": {
        "url": "https://try.gitea.io/api/v1/",
    },
    "tuleap": {
        "url": "https://tuleap.net",
    },
    "gitlab": {
        "url": "https://gitlab.ow2.org/api/v4",
        "instance": "ow2",
    },
    "opam": {"url": "https://opam.ocaml.org", "instance": "opam"},
    "maven": {
        "url": "https://repo1.maven.org/maven2/",
        "index_url": "http://indexes/export.fld",
    },
    "gogs": {
        "url": "https://try.gogs.io/",
        "api_token": "secret",
    },
    "nixguix": {
        "url": "https://guix.gnu.org/sources.json",
        "origin_upstream": "https://git.savannah.gnu.org/cgit/guix.git/",
    },
    "fedora": {
        # Single trailing slash: the lister joins relative paths onto this
        # URL, so a doubled slash would end up in every index URL.
        "url": "https://archives.fedoraproject.org/pub/archive/fedora/linux/releases/",
    },
}


def test_get_lister_wrong_input():
    """Unsupported lister should raise"""
    with pytest.raises(ValueError) as e:
        get_lister("unknown", "db-url")

    assert "Invalid lister" in str(e.value)


def test_get_lister(swh_scheduler_config):
    """Instantiating a supported lister should be ok"""
    # Drop launchpad lister from the lister to check, its test setup is more involved
    # than the other listers and it's not currently done here
    for lister_name in SUPPORTED_LISTERS:
        lst = get_lister(
            lister_name,
            scheduler={"cls": "local", **swh_scheduler_config},
            **lister_args.get(lister_name, {}),
        )
        assert hasattr(lst, "run")