Changeset View
Changeset View
Standalone View
Standalone View
swh/lister/cgit/lister.py
# Copyright (C) 2019-2021 The Software Heritage developers | # Copyright (C) 2019-2021 The Software Heritage developers | ||||||||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||||||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||||||||
from datetime import datetime, timezone | from datetime import datetime, timezone | ||||||||||
import logging | import logging | ||||||||||
import re | import re | ||||||||||
from typing import Any, Dict, Iterator, List, Optional | from typing import Any, Dict, Iterable, Iterator, List, Optional | ||||||||||
from urllib.parse import urljoin, urlparse | from urllib.parse import urljoin, urlparse | ||||||||||
from bs4 import BeautifulSoup | from bs4 import BeautifulSoup | ||||||||||
import requests | import requests | ||||||||||
from requests.exceptions import HTTPError | from requests.exceptions import HTTPError | ||||||||||
from swh.core.utils import grouper | |||||||||||
from swh.lister import USER_AGENT | from swh.lister import USER_AGENT | ||||||||||
from swh.lister.pattern import CredentialsType, StatelessLister | from swh.lister.pattern import CredentialsType, StatelessLister | ||||||||||
from swh.scheduler.interface import SchedulerInterface | from swh.scheduler.interface import SchedulerInterface | ||||||||||
from swh.scheduler.model import ListedOrigin | from swh.scheduler.model import ListedOrigin | ||||||||||
logger = logging.getLogger(__name__) | logger = logging.getLogger(__name__) | ||||||||||
Repositories = List[Dict[str, Any]] | Repositories = List[Dict[str, Any]] | ||||||||||
▲ Show 20 Lines • Show All 162 Lines • ▼ Show 20 Lines | def _get_origin_from_repository_url(self, repository_url: str) -> Optional[str]: | ||||||||||
if urlparse(url).scheme in ("http", "https"): | if urlparse(url).scheme in ("http", "https"): | ||||||||||
origin_url = url | origin_url = url | ||||||||||
break | break | ||||||||||
else: | else: | ||||||||||
# otherwise, choose the first one | # otherwise, choose the first one | ||||||||||
origin_url = urls[0] | origin_url = urls[0] | ||||||||||
return origin_url | return origin_url | ||||||||||
def send_origins(self, origins: Iterable[ListedOrigin]) -> int:
    """Flush listed origins to the scheduler in bounded batches.

    The origins are cut into chunks of 100 records and every chunk is handed
    to ``self.scheduler.record_listed_origins`` separately, because a single
    page can hold a very large number of origins.

    Args:
        origins: the :class:`model.ListedOrigin` records to register

    Returns:
        the number of listed origins recorded in the scheduler, i.e. the
        summed length of what each ``record_listed_origins`` call returned

    """
    # NOTE(review): the diff view carried an inline reviewer marker (vlorentz,
    # "Not Done") on the record_listed_origins call; its text is not visible
    # in this view.
    return sum(
        len(self.scheduler.record_listed_origins(chunk))
        for chunk in grouper(origins, n=100)
    )
ardumont (Author, Unsubmitted, Done) — inline comment: It may be worth pushing this in the `swh.scheduler.pattern.StatelessLister` class. For now…
ardumont (Author, Unsubmitted, Done) — inline comment: Never mind that comment (I meant stateless because I misremembered Debian, Launchpad, etc. as being stateless listers, but that is not the case).
def _parse_last_updated_date(repository: Dict[str, Any]) -> Optional[datetime]: | def _parse_last_updated_date(repository: Dict[str, Any]) -> Optional[datetime]: | ||||||||||
"""Parse the last updated date""" | """Parse the last updated date""" | ||||||||||
date = repository.get("last_updated_date") | date = repository.get("last_updated_date") | ||||||||||
if not date: | if not date: | ||||||||||
return None | return None | ||||||||||
parsed_date = None | parsed_date = None | ||||||||||
Show All 16 Lines |