diff --git a/swh/lister/cgit/lister.py b/swh/lister/cgit/lister.py --- a/swh/lister/cgit/lister.py +++ b/swh/lister/cgit/lister.py @@ -5,13 +5,14 @@ from datetime import datetime, timezone import logging import re -from typing import Any, Dict, Iterator, List, Optional +from typing import Any, Dict, Iterable, Iterator, List, Optional from urllib.parse import urljoin, urlparse from bs4 import BeautifulSoup import requests from requests.exceptions import HTTPError +from swh.core.utils import grouper from swh.lister import USER_AGENT from swh.lister.pattern import CredentialsType, StatelessLister from swh.scheduler.interface import SchedulerInterface @@ -190,6 +191,22 @@ origin_url = urls[0] return origin_url + def send_origins(self, origins: Iterable[ListedOrigin]) -> int: + """Record a list of :class:`model.ListedOrigin` in the scheduler. + + Internally, the origins are split into batches of 100 records before being + sent to the scheduler, as some pages can be very large. + + Returns: + the number of listed origins recorded in the scheduler + + """ + count = 0 + for batch_origins in grouper(origins, n=100): + ret = self.scheduler.record_listed_origins(batch_origins) + count += len(ret) + return count + def _parse_last_updated_date(repository: Dict[str, Any]) -> Optional[datetime]: """Parse the last updated date""" diff --git a/swh/lister/launchpad/lister.py b/swh/lister/launchpad/lister.py --- a/swh/lister/launchpad/lister.py +++ b/swh/lister/launchpad/lister.py @@ -6,12 +6,13 @@ from dataclasses import dataclass from datetime import datetime import logging -from typing import Any, Dict, Iterator, Optional +from typing import Any, Dict, Iterable, Iterator, Optional import iso8601 from launchpadlib.launchpad import Launchpad from lazr.restfulclient.resource import Collection +from swh.core.utils import grouper from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin @@ -119,6 +120,22 @@ last_update=last_update, ) + def send_origins(self, 
origins: Iterable[ListedOrigin]) -> int: + """Record a list of :class:`model.ListedOrigin` in the scheduler. + + Internally, the origins are split into batches of 100 records before being + sent to the scheduler, as some pages can be very large. + + Returns: + the number of listed origins recorded in the scheduler + + """ + count = 0 + for batch_origins in grouper(origins, n=100): + ret = self.scheduler.record_listed_origins(batch_origins) + count += len(ret) + return count + def finalize(self) -> None: if self.date_last_modified is None: return diff --git a/swh/lister/pattern.py b/swh/lister/pattern.py --- a/swh/lister/pattern.py +++ b/swh/lister/pattern.py @@ -7,7 +7,6 @@ from typing import Any, Dict, Generic, Iterable, Iterator, List, Optional, TypeVar from swh.core.config import load_from_envvar -from swh.core.utils import grouper from swh.scheduler import get_scheduler, model from swh.scheduler.interface import SchedulerInterface @@ -222,12 +221,8 @@ Returns: the number of listed origins recorded in the scheduler """ - count = 0 - for batch_origins in grouper(origins, n=100): - ret = self.scheduler.record_listed_origins(batch_origins) - count += len(ret) - - return count + ret = self.scheduler.record_listed_origins(origins) + return len(ret) @classmethod def from_config(cls, scheduler: Dict[str, Any], **config: Any):