Changeset View
Changeset View
Standalone View
Standalone View
swh/lister/cgit/lister.py
Show All 22 Lines | |||||
class CGitLister(StatelessLister[Repositories]):
    """Lister class for CGit repositories.

    This lister will retrieve the list of published git repositories by
    parsing the HTML page(s) of the index retrieved at `url`.

    The lister currently defines 2 listing behaviors:

    - If the `base_git_url` is provided, the listed origin urls are computed out
      of the base git url link and the one listed in the main listed page
      (resulting in fewer HTTP queries than the 2nd behavior below). This is
      expected to be the main deployed behavior.

    - Otherwise (with no `base_git_url`), for each found git repository listed,
      one extra HTTP query is made at the given url found in the main listing
      page to gather published "Clone" URLs to be used as origin URL for that
      git repo. If several "Clone" urls are provided, prefer the http/https one,
      if any, otherwise fallback to the first one.

    """

    LISTER_NAME = "cgit"
def __init__( | def __init__( | ||||
self, | self, | ||||
scheduler: SchedulerInterface, | scheduler: SchedulerInterface, | ||||
url: str, | url: str, | ||||
instance: Optional[str] = None, | instance: Optional[str] = None, | ||||
credentials: Optional[CredentialsType] = None, | credentials: Optional[CredentialsType] = None, | ||||
base_git_url: Optional[str] = None, | |||||
): | ): | ||||
"""Lister class for CGit repositories. | """Lister class for CGit repositories. | ||||
Args: | Args: | ||||
url (str): main URL of the CGit instance, i.e. url of the index | url: main URL of the CGit instance, i.e. url of the index | ||||
of published git repositories on this instance. | of published git repositories on this instance. | ||||
instance (str): Name of cgit instance. Defaults to url's hostname | instance: Name of cgit instance. Defaults to url's hostname | ||||
if unset. | if unset. | ||||
base_git_url: Optional base git url which allows the origin url | |||||
computations. | |||||
""" | """ | ||||
if not instance: | if not instance: | ||||
instance = urlparse(url).hostname | instance = urlparse(url).hostname | ||||
assert instance is not None # Make mypy happy | assert instance is not None # Make mypy happy | ||||
super().__init__( | super().__init__( | ||||
scheduler=scheduler, url=url, instance=instance, credentials=credentials, | scheduler=scheduler, url=url, instance=instance, credentials=credentials, | ||||
) | ) | ||||
self.session = requests.Session() | self.session = requests.Session() | ||||
self.session.headers.update( | self.session.headers.update( | ||||
{"Accept": "application/html", "User-Agent": USER_AGENT} | {"Accept": "application/html", "User-Agent": USER_AGENT} | ||||
) | ) | ||||
self.base_git_url = base_git_url | |||||
def _get_and_parse(self, url: str) -> BeautifulSoup: | def _get_and_parse(self, url: str) -> BeautifulSoup: | ||||
"""Get the given url and parse the retrieved HTML using BeautifulSoup""" | """Get the given url and parse the retrieved HTML using BeautifulSoup""" | ||||
response = self.session.get(url) | response = self.session.get(url) | ||||
response.raise_for_status() | response.raise_for_status() | ||||
return BeautifulSoup(response.text, features="html.parser") | return BeautifulSoup(response.text, features="html.parser") | ||||
def get_pages(self) -> Iterator[Repositories]: | def get_pages(self) -> Iterator[Repositories]: | ||||
"""Generate git 'project' URLs found on the current CGit server | """Generate git 'project' URLs found on the current CGit server | ||||
The last_update date is retrieved on the list of repo page to avoid | The last_update date is retrieved on the list of repo page to avoid | ||||
to compute it on the repository details which only give a date per branch | to compute it on the repository details which only give a date per branch | ||||
""" | """ | ||||
next_page: Optional[str] = self.url | next_page: Optional[str] = self.url | ||||
while next_page: | while next_page: | ||||
bs_idx = self._get_and_parse(next_page) | bs_idx = self._get_and_parse(next_page) | ||||
page_results = [] | page_results = [] | ||||
for tr in bs_idx.find("div", {"class": "content"}).find_all( | for tr in bs_idx.find("div", {"class": "content"}).find_all( | ||||
"tr", {"class": ""} | "tr", {"class": ""} | ||||
): | ): | ||||
url = urljoin(self.url, tr.find("a")["href"]) | repository_link = tr.find("a")["href"] | ||||
repo_url = None | |||||
git_url = None | |||||
base_url = urljoin(self.url, repository_link) | |||||
if self.base_git_url: # mapping provided | |||||
# computing git url | |||||
git_url = base_url.replace(self.url, self.base_git_url) | |||||
else: | |||||
# we compute the git detailed page url from which we will retrieve | |||||
# the git url (cf. self.get_origins_from_page) | |||||
Done Inline ActionsYou could compute repo_url before the if and drop the else block. anlambert: You could compute `repo_url` before the if and drop the else block. | |||||
repo_url = base_url | |||||
span = tr.find("span", {"class": re.compile("age-")}) | span = tr.find("span", {"class": re.compile("age-")}) | ||||
Done Inline ActionsThe urljoin approach will not work for most of cases. For instance with that cgit instance with base URL https://forge.frm2.tum.de/cgit/cgit.cgi/ and base clone URL https://forge.frm2.tum.de/review/ , the computed clone URLs will be in the form https://forge.frm2.tum.de/cgit/cgit.cgi/<path_to_repo>. The approach I was thinking of is the following: repo_url = urljoin(self.url, repository_link) if self.base_git_url: # mapping provided, we compute the git url repo_url = urljoin(self.url, repository_link) git_url = repo_url.replace(self.url, self.base_git_url) logger.debug("%s => %s", repo_url, git_url) This way it is guaranteed to compute the good clone URLs. anlambert: The `urljoin` approach will not work for most of cases. For instance with that cgit instance… | |||||
Done Inline Actionsoh, right. The current implem is not enough and the test assertion needs to be improved. Will adapt. ardumont: oh, right.
The current implem is not enough and the test assertion needs to be improved.
Will… | |||||
if span: | if span: | ||||
last_updated_date = span["title"] | last_updated_date = span["title"] | ||||
else: | else: | ||||
last_updated_date = None | last_updated_date = None | ||||
page_results.append( | page_results.append( | ||||
{"url": url, "last_updated_date": last_updated_date} | { | ||||
"url": repo_url, | |||||
"git_url": git_url, | |||||
"last_updated_date": last_updated_date, | |||||
} | |||||
) | ) | ||||
yield page_results | yield page_results | ||||
try: | try: | ||||
pager = bs_idx.find("ul", {"class": "pager"}) | pager = bs_idx.find("ul", {"class": "pager"}) | ||||
current_page = pager.find("a", {"class": "current"}) | current_page = pager.find("a", {"class": "current"}) | ||||
if current_page: | if current_page: | ||||
next_page = current_page.parent.next_sibling.a["href"] | next_page = current_page.parent.next_sibling.a["href"] | ||||
next_page = urljoin(self.url, next_page) | next_page = urljoin(self.url, next_page) | ||||
except (AttributeError, KeyError): | except (AttributeError, KeyError): | ||||
# no pager, or no next page | # no pager, or no next page | ||||
next_page = None | next_page = None | ||||
def get_origins_from_page( | def get_origins_from_page( | ||||
self, repositories: Repositories | self, repositories: Repositories | ||||
) -> Iterator[ListedOrigin]: | ) -> Iterator[ListedOrigin]: | ||||
"""Convert a page of cgit repositories into a list of ListedOrigins.""" | """Convert a page of cgit repositories into a list of ListedOrigins.""" | ||||
assert self.lister_obj.id is not None | assert self.lister_obj.id is not None | ||||
for repository in repositories: | for repo in repositories: | ||||
origin_url = self._get_origin_from_repository_url(repository["url"]) | origin_url = repo["git_url"] or self._get_origin_from_repository_url( | ||||
repo["url"] | |||||
) | |||||
if origin_url is None: | if origin_url is None: | ||||
continue | continue | ||||
yield ListedOrigin( | yield ListedOrigin( | ||||
lister_id=self.lister_obj.id, | lister_id=self.lister_obj.id, | ||||
url=origin_url, | url=origin_url, | ||||
visit_type="git", | visit_type="git", | ||||
last_update=_parse_last_updated_date(repository), | last_update=_parse_last_updated_date(repo), | ||||
) | ) | ||||
def _get_origin_from_repository_url(self, repository_url: str) -> Optional[str]: | def _get_origin_from_repository_url(self, repository_url: str) -> Optional[str]: | ||||
"""Extract the git url from the repository page""" | """Extract the git url from the repository page""" | ||||
try: | try: | ||||
bs = self._get_and_parse(repository_url) | bs = self._get_and_parse(repository_url) | ||||
except HTTPError as e: | except HTTPError as e: | ||||
logger.warning( | logger.warning( | ||||
▲ Show 20 Lines • Show All 50 Lines • Show Last 20 Lines |
the listed origin urls are computed out of the base git url and the ones listed in each page (resulting in fewer
HTTP queries than the 2nd behavior below)