Page MenuHomeSoftware Heritage
Paste P442

code
ActivePublic

Authored by nahimilega on Jun 19 2019, 11:27 PM.
from collections import defaultdict
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
# Root URL of the cgit index page; it is fetched at the bottom of this script
# and also used as the prefix when building per-repository URLs.
base_url = "https://git.savannah.gnu.org/cgit/"
def forlist(repos):
    """Collect details for every listed repository that exposes a clone URL.

    Args:
        repos: Iterable of parsed table-row tags (BeautifulSoup ``Tag``
            objects), one per row of the repository index.

    Returns:
        A list of dicts with the keys ``'name'``, ``'time'``, ``'desc'`` and
        ``'origin_link'``. Rows that contain no link, and repositories whose
        page yields no origin link, are left out.
    """
    repo_details = []
    for repo in repos:
        anchors = repo.find_all('a')
        if not anchors:
            # A row without any link cannot describe a repository; skip it
            # instead of failing on a missing anchor.
            continue

        repo_name = anchors[0].text
        repo_url = get_url(repo_name, repo)
        # Progress output: the module-level call discards the return value,
        # so these prints are the script's visible result.
        print(repo_url)
        origin_link = find_origin_link(repo_url)

        try:
            # Title attribute of the row's first <span>; presumably the
            # last-activity timestamp -- TODO confirm against the page markup.
            updated = repo.span['title']
        except (KeyError, TypeError):
            # KeyError: span without a title; TypeError: no span at all.
            updated = None

        print(origin_link)
        if origin_link is None:
            continue

        repo_details.append({
            'name': repo_name,
            'time': updated,
            # The second link of the row holds the description, if present.
            'desc': anchors[1].text if len(anchors) > 1 else None,
            'origin_link': origin_link,
        })
    return repo_details
# i.find('a',{"class":"button"})['href']
def find_origin_link(repo_url, timeout=30):
    """Fetch a repository page and return its preferred clone URL.

    Args:
        repo_url: URL of the repository's web page.
        timeout: Seconds to wait for the server before giving up, so that a
            single unresponsive host cannot stall the whole crawl.

    Returns:
        The clone URL chosen by ``priority_origin_link``, or ``None`` when the
        page could not be retrieved successfully or lists no usable URL.

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    response = requests.get(repo_url, timeout=timeout)
    if not response.ok:
        # An HTTP error page is not a repository page; do not parse it.
        return None
    soup = BeautifulSoup(response.text, features="html.parser")
    origin_links = find_all_origin_link(soup)
    return priority_origin_link(origin_links)
def get_url(repo_name, repo, base=None):
    """Return the absolute URL of a repository's web page.

    The URL is resolved from the ``href`` of the row's first link. When the
    row has no usable link, it falls back to ``base + repo_name + '/'``.

    Args:
        repo_name: Name of the repository, used for the fallback URL.
        repo: Parsed table-row tag whose first ``<a>`` points at the
            repository page.
        base: URL the ``href`` is resolved against; defaults to the
            module-level ``base_url``.

    Returns:
        The absolute repository URL as a string.
    """
    if base is None:
        base = base_url
    try:
        href = repo.a['href']
    except (AttributeError, KeyError, TypeError):
        # No row, no <a> in the row, or an <a> without an href attribute.
        return base + repo_name + '/'
    # urljoin handles both site-absolute ("/cgit/x/") and relative ("x/")
    # hrefs without duplicating the path component of the base URL.
    return urljoin(base, href)
def find_all_origin_link(soup):
    """Collect the clone URLs listed after the 'Clone' row of a page.

    Every ``<tr>`` following the row whose text is exactly ``Clone`` (ignoring
    surrounding whitespace) is treated as one clone URL. Rows before that
    marker, empty rows and rows without a ``:`` are ignored.

    Args:
        soup: Parsed page exposing ``find_all('tr')``.

    Returns:
        A mapping from protocol (the text before the first ``:``) to the
        clone URL. If several URLs share a protocol, the last one wins.
    """
    # Kept as a defaultdict so the returned type matches what callers got
    # before; only string values are ever stored in it.
    origin_links = defaultdict(dict)
    found_clone_word = False
    for row in soup.find_all('tr'):
        text = row.text.strip()
        if not found_clone_word:
            found_clone_word = text == 'Clone'
            continue
        protocol, colon, _rest = text.partition(':')
        if not colon or not protocol:
            # Not of the form "<protocol>:..."; nothing to record.
            continue
        origin_links[protocol] = text
    return origin_links
def priority_origin_link(origin_links,
                         protocols=('https', 'http', 'git', 'ssh')):
    """Pick the clone URL of the most preferred available protocol.

    Args:
        origin_links: Mapping from protocol name to clone URL.
        protocols: Protocol names in decreasing order of preference.

    Returns:
        The URL of the first protocol in ``protocols`` that is present in
        ``origin_links``, or ``None`` when none of them is present.
    """
    for protocol in protocols:
        if protocol in origin_links:
            return origin_links[protocol]
    return None
# Fetch the repository index; the timeout keeps an unresponsive server from
# hanging the script, and an HTTP error aborts before any parsing is attempted.
response = requests.get(base_url, timeout=30)
response.raise_for_status()

# Only the <div class="content"> part of the page is searched for rows.
soup = BeautifulSoup(response.text,
                     features="html.parser").find('div', {"class": "content"})
if soup is None:
    raise SystemExit('no <div class="content"> found at ' + base_url)

# Rows selected with an empty class filter are handed to forlist as the
# repository entries.
repos = soup.find_all("tr", {"class": ""})
forlist(repos)

Event Timeline