Changeset View
Changeset View
Standalone View
Standalone View
swh/lister/pattern.py
Show All 14 Lines | |||||
from swh.core.config import load_from_envvar | from swh.core.config import load_from_envvar | ||||
from swh.core.github.utils import GitHubSession | from swh.core.github.utils import GitHubSession | ||||
from swh.core.utils import grouper | from swh.core.utils import grouper | ||||
from swh.scheduler import get_scheduler, model | from swh.scheduler import get_scheduler, model | ||||
from swh.scheduler.interface import SchedulerInterface | from swh.scheduler.interface import SchedulerInterface | ||||
from . import USER_AGENT_TEMPLATE | from . import USER_AGENT_TEMPLATE | ||||
from .utils import http_retry | from .utils import http_retry, is_valid_origin_url | ||||
logger = logging.getLogger(__name__) | logger = logging.getLogger(__name__) | ||||
@dataclass | @dataclass | ||||
class ListerStats: | class ListerStats: | ||||
pages: int = 0 | pages: int = 0 | ||||
origins: int = 0 | origins: int = 0 | ||||
▲ Show 20 Lines • Show All 240 Lines • ▼ Show 20 Lines | def commit_page(self, page: PageType) -> None: | ||||
pass | pass | ||||
def send_origins(self, origins: Iterable[model.ListedOrigin]) -> List[str]:
    """Record a list of :class:`model.ListedOrigin` in the scheduler.

    Origins whose URL is rejected by :func:`is_valid_origin_url` are not
    sent to the scheduler; a warning is logged for each of them instead.
    The remaining origins are recorded in batches of 1000.

    Args:
        origins: the listed origins to record; any iterable is accepted
            and it is consumed only once.

    Returns:
        the list of origin URLs recorded in scheduler database
    """

    def iter_valid_origins() -> Iterable[model.ListedOrigin]:
        # Filter on the fly rather than building an intermediate list, so
        # memory use does not grow with the total number of origins.
        for origin in origins:
            if is_valid_origin_url(origin.url):
                yield origin
            else:
                logger.warning("Skipping invalid origin: %s", origin.url)

    recorded_origins: List[str] = []
    # NOTE(review): this relies on grouper() accepting an arbitrary iterable
    # (the pre-change code already passed `origins` to it directly).
    for batch_origins in grouper(iter_valid_origins(), n=1000):
        ret = self.scheduler.record_listed_origins(batch_origins)
        recorded_origins.extend(origin.url for origin in ret)
    return recorded_origins
@classmethod | @classmethod | ||||
def from_config(cls, scheduler: Dict[str, Any], **config: Any): | def from_config(cls, scheduler: Dict[str, Any], **config: Any): | ||||
"""Instantiate a lister from a configuration dict. | """Instantiate a lister from a configuration dict. | ||||
▲ Show 20 Lines • Show All 43 Lines • Show Last 20 Lines |