Changeset View
Changeset View
Standalone View
Standalone View
swh/lister/cgit/lister.py
# Copyright (C) 2019 the Software Heritage developers | # Copyright (C) 2019 the Software Heritage developers | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import re | import re | ||||
import logging | |||||
from urllib.parse import urlparse, urljoin | from urllib.parse import urlparse, urljoin | ||||
from bs4 import BeautifulSoup | from bs4 import BeautifulSoup | ||||
from requests import Session | from requests import Session | ||||
from requests.adapters import HTTPAdapter | from requests.adapters import HTTPAdapter | ||||
from .models import CGitModel | from .models import CGitModel | ||||
from swh.core.utils import grouper | from swh.core.utils import grouper | ||||
from swh.lister.core.lister_base import ListerBase | from swh.lister.core.lister_base import ListerBase | ||||
logger = logging.getLogger(__name__) | |||||
class CGitLister(ListerBase): | class CGitLister(ListerBase): | ||||
"""Lister class for CGit repositories. | """Lister class for CGit repositories. | ||||
This lister will retrieve the list of published git repositories by | This lister will retrieve the list of published git repositories by | ||||
parsing the HTML page(s) of the index retrieved at `url`. | parsing the HTML page(s) of the index retrieved at `url`. | ||||
For each found git repository, a query is made at the given url found | For each found git repository, a query is made at the given url found | ||||
in this index to gather published "Clone" URLs to be used as origin | in this index to gather published "Clone" URLs to be used as origin | ||||
Show All 39 Lines | def __init__(self, url=None, instance=None, override_config=None): | ||||
if not instance: | if not instance: | ||||
instance = urlparse(url).hostname | instance = urlparse(url).hostname | ||||
self.instance = instance | self.instance = instance | ||||
self.session = Session() | self.session = Session() | ||||
self.session.mount(self.url, HTTPAdapter(max_retries=3)) | self.session.mount(self.url, HTTPAdapter(max_retries=3)) | ||||
def run(self): | def run(self): | ||||
for repos in grouper(self.get_repos(), 100): | total = 0 | ||||
for repos in grouper(self.get_repos(), 10): | |||||
models = list(filter(None, (self.build_model(repo) | models = list(filter(None, (self.build_model(repo) | ||||
for repo in repos))) | for repo in repos))) | ||||
injected_repos = self.inject_repo_data_into_db(models) | injected_repos = self.inject_repo_data_into_db(models) | ||||
self.schedule_missing_tasks(models, injected_repos) | self.schedule_missing_tasks(models, injected_repos) | ||||
self.db_session.commit() | self.db_session.commit() | ||||
total += len(injected_repos) | |||||
logger.debug('Scheduled %s tasks for %s', total, self.url) | |||||
def get_repos(self): | def get_repos(self): | ||||
"""Generate git 'project' URLs found on the current CGit server | """Generate git 'project' URLs found on the current CGit server | ||||
""" | """ | ||||
next_page = self.url | next_page = self.url | ||||
while next_page: | while next_page: | ||||
bs_idx = self.get_and_parse(next_page) | bs_idx = self.get_and_parse(next_page) | ||||
▲ Show 20 Lines • Show All 45 Lines • Show Last 20 Lines |