diff --git a/swh/lister/arch/tests/test_lister.py b/swh/lister/arch/tests/test_lister.py
--- a/swh/lister/arch/tests/test_lister.py
+++ b/swh/lister/arch/tests/test_lister.py
@@ -2,6 +2,7 @@
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
+
 from swh.lister.arch.lister import ArchLister
 
 expected_origins = [
@@ -1371,7 +1372,7 @@
     res = lister.run()
 
     assert res.pages == 9
-    assert res.origins == 12
+    assert res.origins == 11
 
     scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
 
diff --git a/swh/lister/conda/tests/test_lister.py b/swh/lister/conda/tests/test_lister.py
--- a/swh/lister/conda/tests/test_lister.py
+++ b/swh/lister/conda/tests/test_lister.py
@@ -13,7 +13,7 @@
     res = lister.run()
 
     assert res.pages == 3
-    assert res.origins == 14
+    assert res.origins == 11
 
 
 def test_conda_lister_conda_forge_channel(
diff --git a/swh/lister/debian/lister.py b/swh/lister/debian/lister.py
--- a/swh/lister/debian/lister.py
+++ b/swh/lister/debian/lister.py
@@ -95,12 +95,7 @@
 
         # will hold all listed origins info
         self.listed_origins: Dict[DebianOrigin, ListedOrigin] = {}
-        # will contain origin urls that have already been listed
-        # in a previous page
-        self.sent_origins: Set[DebianOrigin] = set()
-        # will contain already listed package info that need to be sent
-        # to the scheduler for update in the commit_page method
-        self.origins_to_update: Dict[DebianOrigin, ListedOrigin] = {}
+
         # will contain the lister state after a call to run
         self.package_versions: Dict[PkgName, Set[PkgVersion]] = {}
 
@@ -185,7 +180,6 @@
         assert self.lister_obj.id is not None
 
         origins_to_send = {}
-        self.origins_to_update = {}
 
         # iterate on each package source info
         for src_pkg in page:
@@ -228,17 +222,11 @@
                     extra_loader_arguments={"packages": {}},
                     last_update=self.last_sources_update,
                 )
-                # origin will be yielded at the end of that method
-                origins_to_send[origin_url] = self.listed_origins[origin_url]
                 # init set that will contain all listed package versions
                 self.package_versions[package_name] = set()
 
-            # package has already been listed in a previous page or current page
-            elif origin_url not in origins_to_send:
-                # if package has been listed in a previous page, its new versions
-                # will be added to its ListedOrigin object but the update will
-                # be sent to the scheduler in the commit_page method
-                self.origins_to_update[origin_url] = self.listed_origins[origin_url]
+            # origin will be yielded at the end of that method
+            origins_to_send[origin_url] = self.listed_origins[origin_url]
 
             # update package versions data in parameter that will be provided
             # to the debian loader
@@ -273,20 +261,8 @@
             # no new versions so far, no need to send the origin to the scheduler
             if not new_versions:
                 origins_to_send.pop(origin_url, None)
-                self.origins_to_update.pop(origin_url, None)
-            # new versions found, ensure the origin will be sent to the scheduler
-            elif origin_url not in self.sent_origins:
-                self.origins_to_update.pop(origin_url, None)
-                origins_to_send[origin_url] = self.listed_origins[origin_url]
-
-        # update already counted origins with changes since last page
-        self.sent_origins.update(origins_to_send.keys())
 
-        logger.debug(
-            "Found %s new packages, %s packages with new versions.",
-            len(origins_to_send),
-            len(self.origins_to_update),
-        )
+        logger.debug("Found %s new packages.", len(origins_to_send))
         logger.debug(
             "Current total number of listed packages is equal to %s.",
             len(self.listed_origins),
@@ -294,15 +270,7 @@
 
         yield from origins_to_send.values()
 
-    def get_origins_to_update(self) -> Iterator[ListedOrigin]:
-        yield from self.origins_to_update.values()
-
-    def commit_page(self, page: DebianPageType):
-        """Send to scheduler already listed origins where new versions have been found
-        in current page."""
-        self.send_origins(self.get_origins_to_update())
-
     def finalize(self):
         # set mapping between listed package names and versions as lister state
         self.state.package_versions = self.package_versions
-        self.updated = len(self.sent_origins) > 0
+        self.updated = len(self.listed_origins) > 0
diff --git a/swh/lister/pattern.py b/swh/lister/pattern.py
--- a/swh/lister/pattern.py
+++ b/swh/lister/pattern.py
@@ -7,7 +7,7 @@
 
 from dataclasses import dataclass
 import logging
-from typing import Any, Dict, Generic, Iterable, Iterator, List, Optional, TypeVar
+from typing import Any, Dict, Generic, Iterable, Iterator, List, Optional, Set, TypeVar
 from urllib.parse import urlparse
 
 import requests
@@ -128,6 +128,8 @@
             {"User-Agent": USER_AGENT_TEMPLATE % self.LISTER_NAME}
         )
 
+        self.recorded_origins: Set[str] = set()
+
     @http_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
     def http_request(self, url: str, method="GET", **kwargs) -> requests.Response:
 
@@ -154,12 +156,15 @@
 
         """
         full_stats = ListerStats()
+        self.recorded_origins = set()
 
         try:
             for page in self.get_pages():
                 full_stats.pages += 1
                 origins = self.get_origins_from_page(page)
-                full_stats.origins += self.send_origins(origins)
+                sent_origins = self.send_origins(origins)
+                self.recorded_origins.update(sent_origins)
+                full_stats.origins = len(self.recorded_origins)
                 self.commit_page(page)
         finally:
             self.finalize()
@@ -255,18 +260,18 @@
         """
         pass
 
-    def send_origins(self, origins: Iterable[model.ListedOrigin]) -> int:
+    def send_origins(self, origins: Iterable[model.ListedOrigin]) -> List[str]:
         """Record a list of :class:`model.ListedOrigin` in the scheduler.
 
         Returns:
-            the number of listed origins recorded in the scheduler
+            the list of origin URLs recorded in scheduler database
         """
-        count = 0
+        recorded_origins = []
         for batch_origins in grouper(origins, n=1000):
             ret = self.scheduler.record_listed_origins(batch_origins)
-            count += len(ret)
+            recorded_origins += [origin.url for origin in ret]
 
-        return count
+        return recorded_origins
 
     @classmethod
     def from_config(cls, scheduler: Dict[str, Any], **config: Any):
diff --git a/swh/lister/tests/test_pattern.py b/swh/lister/tests/test_pattern.py
--- a/swh/lister/tests/test_pattern.py
+++ b/swh/lister/tests/test_pattern.py
@@ -198,3 +198,20 @@
 
     # And that all origins are stored
     check_listed_origins(swh_scheduler, lister, stored_lister)
+
+
+class ListerWithSameOriginInMultiplePages(RunnableStatelessLister):
+    def get_pages(self) -> Iterator[PageType]:
+        for _ in range(2):
+            yield [{"url": "https://example.org/user/project"}]
+
+
+def test_listed_origins_count(swh_scheduler):
+    lister = ListerWithSameOriginInMultiplePages(
+        scheduler=swh_scheduler, url="https://example.org", instance="example.org"
+    )
+
+    run_result = lister.run()
+
+    assert run_result.pages == 2
+    assert run_result.origins == 1