from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

base_url = "https://git.savannah.gnu.org/cgit/"


def forlist(repos):
    """Collect name, timestamp, description, and clone URL for each repo row."""
    repo_details = []
    for repo in repos:
        repo_name = repo.a.text
        repo_url = get_url(repo_name, repo)
        print(repo_url)
        origin_link = find_origin_link(repo_url)
        try:
            time = repo.span['title']
        except (TypeError, KeyError):
            # The row has no <span>, or the span carries no title attribute.
            time = None
        print(origin_link)
        if origin_link is not None:
            repo_details.append({
                'name': repo_name,
                'time': time,
                'desc': repo.find_all('a')[1].text,
                'origin_link': origin_link,
            })
    return repo_details


def find_origin_link(repo_url):
    """Fetch a repository page and return its preferred clone URL."""
    response = requests.get(repo_url)
    soup = BeautifulSoup(response.text, features="html.parser")
    origin_links = find_all_origin_link(soup)
    return priority_origin_link(origin_links)


def get_url(repo_name, repo):
    """Build the absolute URL of a repository page from its row's link."""
    try:
        # urljoin resolves both relative and absolute href values
        # against the site root.
        return urljoin(base_url, repo.a['href'])
    except (AttributeError, TypeError, KeyError):
        # The row carries no usable link; fall back to the conventional
        # cgit path for the repository name.
        return base_url + repo_name + '/'


def find_all_origin_link(soup):
    """Map each clone protocol (https, git, ssh, ...) to its clone URL.

    cgit lists clone URLs in the table rows that follow a row whose
    text is exactly 'Clone'.
    """
    origin_links = {}
    found_clone_word = False
    for row in soup.find_all('tr'):
        if found_clone_word:
            link = row.text
            protocol = link[:link.find(':')]
            origin_links[protocol] = link
        if row.text == 'Clone':
            found_clone_word = True
    return origin_links


def priority_origin_link(origin_links):
    """Return the clone URL with the most preferred protocol, or None."""
    for protocol in ['https', 'http', 'git', 'ssh']:
        if protocol in origin_links:
            return origin_links[protocol]
    return None


if __name__ == "__main__":
    response = requests.get(base_url)
    soup = BeautifulSoup(response.text, features="html.parser").find('div', {"class": "content"})
    repos = soup.find_all("tr", {"class": ""})
    forlist(repos)