Changeset View
Standalone View
swh/lister/fedora/lister.py
- This file was added.
# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from dataclasses import dataclass, field | ||||||||||||||||||
import logging | ||||||||||||||||||
from typing import Any, Dict, Iterator, List, Optional, Set | ||||||||||||||||||
from urllib.parse import urljoin | ||||||||||||||||||
import repomd | ||||||||||||||||||
import requests | ||||||||||||||||||
from swh.scheduler.interface import SchedulerInterface | ||||||||||||||||||
from swh.scheduler.model import ListedOrigin | ||||||||||||||||||
from .. import USER_AGENT | ||||||||||||||||||
from ..pattern import CredentialsType, Lister | ||||||||||||||||||
logger = logging.getLogger(__name__)

# Type aliases: they are all plain strings (or iterators) at runtime and only
# exist to make the signatures below self-describing.
Release = str
Component = str
PkgName = str
PkgVersion = str
# NOTE(review): the name looks inherited from the debian lister; here it is
# used for the origin URL of a fedora package.
DebianOrigin = str
# A "page" is the iterator of packages found in the metadata of one release.
FedoraPageType = Iterator[repomd.Package]
@dataclass
class FedoraListerState:
    """State of fedora lister"""

    package_versions: Dict[PkgName, Set[PkgVersion]] = field(default_factory=dict)
    """Dictionary mapping a package name to all the versions found during
    last listing"""
class FedoraLister(Lister[FedoraListerState, FedoraPageType]):
    """
    List source packages for the given fedora releases.

    The lister creates one origin per package name and attaches to it every
    version of that package found in the listed releases.

    An origin is sent to the scheduler the first time its package is found.
    When other versions of an already sent package are found in a later
    release, the updated origin is sent again from :meth:`commit_page`.

    Args:
        scheduler: instance of SchedulerInterface
        instance: name of the lister instance, also used to build origin URLs
        url: fedora package archives mirror URL
        releases: list of fedora releases to process, defaults to
            ``["34", "35", "36"]`` when not provided
        credentials: optional credentials forwarded to the base lister
    """

    LISTER_NAME = "fedora"

    def __init__(
        self,
        scheduler: SchedulerInterface,
        instance: str = "fedora",
        url: str = "https://eu.edge.kernel.org/fedora/releases/",
        releases: Optional[List[Release]] = None,
        credentials: Optional[CredentialsType] = None,
    ):
        super().__init__(
            scheduler=scheduler,
            url=url,
            instance=instance,
            credentials=credentials,
        )

        # ensure a trailing slash so urljoin appends to the mirror URL instead
        # of replacing its last path component
        if not self.url.endswith("/"):
            self.url += "/"

        # None is used as sentinel so that no mutable default list is shared
        # between instances
        self.releases = releases if releases is not None else ["34", "35", "36"]

        self.session = requests.Session()
        self.session.headers.update({"User-Agent": USER_AGENT})

        # NOTE(review): a reviewer suggested that some of the bookkeeping
        # attributes below (modelled on the debian lister) could be removed;
        # they are kept because the methods of this class rely on them.

        # will hold all listed origins info
        self.listed_origins: Dict[DebianOrigin, ListedOrigin] = {}
        # will contain origin urls that have already been listed
        # in a previous page
        self.sent_origins: Set[DebianOrigin] = set()
        # will contain already listed package info that need to be sent
        # to the scheduler for update in the commit_page method
        self.origins_to_update: Dict[DebianOrigin, ListedOrigin] = {}
        # will contain the lister state after a call to run
        self.package_versions: Dict[PkgName, Set[PkgVersion]] = {}

    def state_from_dict(self, d: Dict[str, Any]) -> FedoraListerState:
        """Build the lister state from its serialized form (lists become sets)."""
        return FedoraListerState(package_versions={k: set(v) for k, v in d.items()})

    def state_to_dict(self, state: FedoraListerState) -> Dict[str, Any]:
        """Serialize the lister state (sets become lists)."""
        return {k: list(v) for k, v in state.package_versions.items()}

    def page_request(self, release: Release) -> Iterator[repomd.Package]:
        """Yield the packages found in the source repository metadata of the
        given fedora release."""
        # TODO(review): only the "Everything" edition is listed; reviewers
        # asked for the edition to be a parameter ({release}/{edition}/...).
        index_url = urljoin(self.url, f"{release}/Everything/source/tree/")
        repo = repomd.load(index_url)
        logger.debug("Fetched metadata from url: %s", index_url)
        yield from repo

    def get_pages(self) -> Iterator[FedoraPageType]:
        """Return an iterator on package metadata pages, one per fedora release."""
        for release in self.releases:
            logger.debug("Listing fedora release %s", release)
            # read back by get_origins_from_page to prefix package version keys
            self.current_release = release
            # TODO(review): reviewers asked for missing metadata (HTTP 404) of
            # a release to be skipped with a debug log instead of aborting.
            yield self.page_request(release)

    def origin_url_for_package(self, package_name: PkgName) -> DebianOrigin:
        """Return the origin url for the given package"""
        # NOTE(review): reviewers questioned this non standard URL scheme and
        # suggested https://packages.fedoraproject.org/pkgs/{package_name};
        # left unchanged here as it would change the identity of origins.
        return f"rpm://{self.instance}/packages/{package_name}"

    def get_origins_from_page(self, page: FedoraPageType) -> Iterator[ListedOrigin]:
        """Convert a page of fedora package sources into an iterator of
        ListedOrigin.

        Only origins found for the first time are yielded. Origins already
        listed in a previous page are stored in ``self.origins_to_update``
        and sent by :meth:`commit_page`.
        """
        assert self.lister_obj.id is not None

        origins_to_send = {}
        self.origins_to_update = {}

        # iterate on each package source info
        for src_pkg in page:
            # extract package name, version and build time
            package_name = src_pkg.name
            package_version = src_pkg.version
            package_build_time = src_pkg.build_time

            # build origin url
            # TODO: Should include fedora release version?
            origin_url = self.origin_url_for_package(package_name)

            # package version key, prefixed by the release being listed
            package_version_key = f"{self.current_release}/{package_version}"

            # this is the first time a package is listed
            if origin_url not in self.listed_origins:
                # create a ListedOrigin object for it that can be later
                # updated with new package versions info
                self.listed_origins[origin_url] = ListedOrigin(
                    lister_id=self.lister_obj.id,
                    url=origin_url,
                    visit_type="rpm",
                    extra_loader_arguments={"packages": {}},
                    last_update=package_build_time,
                )
                # origin will be yielded at the end of that method
                origins_to_send[origin_url] = self.listed_origins[origin_url]
                # init set that will contain all listed package versions
                self.package_versions[package_name] = set()

            # package has already been listed in a previous page or current page
            elif origin_url not in origins_to_send:
                # if package has been listed in a previous page, its new versions
                # will be added to its ListedOrigin object but the update will
                # be sent to the scheduler in the commit_page method
                self.origins_to_update[origin_url] = self.listed_origins[origin_url]

            listed_origin = self.listed_origins[origin_url]

            # update package versions data in parameter that will be provided
            # to the loader
            listed_origin.extra_loader_arguments["packages"][package_version_key] = {
                "name": package_name,
                "version": package_version,
            }

            # keep the most recent build time as last update of the origin
            if package_build_time is not None and (
                listed_origin.last_update is None
                or package_build_time > listed_origin.last_update
            ):
                listed_origin.last_update = package_build_time

            # add package version key to the set of found versions
            self.package_versions[package_name].add(package_version_key)

        # update already counted origins with changes since last page
        self.sent_origins.update(origins_to_send.keys())

        logger.debug(
            "Found %s new packages, %s packages with new versions.",
            len(origins_to_send),
            len(self.origins_to_update),
        )
        logger.debug(
            "Current total number of listed packages is equal to %s.",
            len(self.listed_origins),
        )

        yield from origins_to_send.values()

    def get_origins_to_update(self) -> Iterator[ListedOrigin]:
        """Yield already listed origins for which the current page added versions."""
        yield from self.origins_to_update.values()

    def commit_page(self, page: FedoraPageType):
        """Send to scheduler already listed origins where new versions have been found
        in current page."""
        self.send_origins(self.get_origins_to_update())

    def finalize(self):
        """Store the listed package versions in the lister state and flag the
        state as updated when at least one origin was sent."""
        # set mapping between listed package names and versions as lister state
        self.state.package_versions = self.package_versions
        self.updated = len(self.sent_origins) > 0
anlambert (Not Done): to remove
anlambert (Not Done): Replace this debug log with:
```python
logger.debug(
    "Found %s packages to update (new ones or packages with new versions).",
    len(origins_to_send),
)
```
anlambert (Not Done): to remove
anlambert (Not Done):
```python
self.updated = len(self.listed_origins) > 0
```
Copyright (C) 2022