Changeset View
Changeset View
Standalone View
Standalone View
swh/lister/cgit/lister.py
Show First 20 Lines • Show All 59 Lines • ▼ Show 20 Lines | ): | ||||
self.session = requests.Session() | self.session = requests.Session() | ||||
self.session.headers.update( | self.session.headers.update( | ||||
{"Accept": "application/html", "User-Agent": USER_AGENT} | {"Accept": "application/html", "User-Agent": USER_AGENT} | ||||
) | ) | ||||
def _get_and_parse(self, url: str) -> Optional[BeautifulSoup]:
    """Get the given url and parse the retrieved HTML using BeautifulSoup.

    Args:
        url: address of the page to fetch through ``self.session``.

    Returns:
        The parsed document, or ``None`` when the HTTP response is not OK
        (``response.ok`` is false). Callers must check for ``None`` before
        using the result.
    """
    response = self.session.get(url)
    if not response.ok:
        # A failed request is reported to the caller as None rather than as
        # an exception; leave a trace so the skipped page is not lost
        # silently.
        import logging

        logging.getLogger(__name__).warning(
            "Unexpected HTTP status code %s on %s", response.status_code, url
        )
        return None
    return BeautifulSoup(response.text, features="html.parser")
def get_pages(self) -> Iterator[Repositories]: | def get_pages(self) -> Iterator[Repositories]: | ||||
"""Generate git 'project' URLs found on the current CGit server | """Generate git 'project' URLs found on the current CGit server | ||||
The last_update date is retrieved on the list of repo page to avoid | The last_update date is retrieved on the list of repo page to avoid | ||||
to compute it on the repository details which only give a date per branch | to compute it on the repository details which only give a date per branch | ||||
""" | """ | ||||
next_page: Optional[str] = self.url | next_page: Optional[str] = self.url | ||||
while next_page: | while next_page: | ||||
bs_idx = self._get_and_parse(next_page) | bs_idx = self._get_and_parse(next_page) | ||||
assert bs_idx is not None | |||||
page_results = [] | page_results = [] | ||||
for tr in bs_idx.find("div", {"class": "content"}).find_all( | for tr in bs_idx.find("div", {"class": "content"}).find_all( | ||||
"tr", {"class": ""} | "tr", {"class": ""} | ||||
): | ): | ||||
url = urljoin(self.url, tr.find("a")["href"]) | url = urljoin(self.url, tr.find("a")["href"]) | ||||
span = tr.find("span", {"class": re.compile("age-")}) | span = tr.find("span", {"class": re.compile("age-")}) | ||||
if span: | if span: | ||||
Show All 34 Lines | ) -> Iterator[ListedOrigin]: | ||||
url=origin_url, | url=origin_url, | ||||
visit_type="git", | visit_type="git", | ||||
last_update=_parse_last_updated_date(repository), | last_update=_parse_last_updated_date(repository), | ||||
) | ) | ||||
def _get_origin_from_repository_url(self, repository_url: str) -> Optional[str]: | def _get_origin_from_repository_url(self, repository_url: str) -> Optional[str]: | ||||
"""Extract the git url from the repository page""" | """Extract the git url from the repository page""" | ||||
bs = self._get_and_parse(repository_url) | bs = self._get_and_parse(repository_url) | ||||
if bs is None: | |||||
return None | |||||
# origin urls are listed on the repository page | # origin urls are listed on the repository page | ||||
# TODO check if forcing https is better or not ? | # TODO check if forcing https is better or not ? | ||||
# <link rel='vcs-git' href='git://...' title='...'/> | # <link rel='vcs-git' href='git://...' title='...'/> | ||||
# <link rel='vcs-git' href='http://...' title='...'/> | # <link rel='vcs-git' href='http://...' title='...'/> | ||||
# <link rel='vcs-git' href='https://...' title='...'/> | # <link rel='vcs-git' href='https://...' title='...'/> | ||||
urls = [x["href"] for x in bs.find_all("a", {"rel": "vcs-git"})] | urls = [x["href"] for x in bs.find_all("a", {"rel": "vcs-git"})] | ||||
Show All 32 Lines |