import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Page listing the Kermit software archives; it also serves as the origin URL
# reported to Software Heritage.
url = "https://www.kermitproject.org/archive.html"
origin_url = url

response = requests.get(url)
page = BeautifulSoup(response.content, features="html.parser")

# Archive file extensions we are willing to submit.
supported_extensions = {
    ".tar",
    ".tar.gz",
    ".tgz",
    ".zip",
    ".tar.Z",
    ".tar.x",
    ".tar.lz",
}

# Collect every link on the page that points to a supported archive format.
archive_links = [
    a["href"]
    for a in page.find_all("a", href=True)
    if any(a["href"].endswith(ext) for ext in supported_extensions)
]

artifacts = {}
archives_data = []
for archive_link in archive_links:
    # Resolve relative links against the page URL.
    if not archive_link.startswith("http"):
        archive_link = urljoin(url, archive_link)
    # Skip links that are no longer reachable.
    if not requests.head(archive_link).ok:
        continue
    # Derive an artifact "version" from the file name (everything before the
    # first dot) and keep only the first archive seen for each version.
    artifact_version = archive_link.split("/")[-1].split(".")[0]
    if artifact_version not in artifacts:
        archives_data.append(
            {"artifact_url": archive_link, "artifact_version": artifact_version}
        )
        artifacts[artifact_version] = archive_link

# Submit the collected archives to the Software Heritage "save code now" API
# using the archives visit type, authenticated with a bearer token.
save_code_now_url = (
    f"https://archive.softwareheritage.org/api/1/origin/save/archives/url/{origin_url}/"
)
headers = {"Authorization": f"Bearer {os.environ['SWH_TOKEN']}"}
print(
    requests.post(
        save_code_now_url, json={"archives_data": archives_data}, headers=headers
    ).json()
)
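
# Usage sketch (the file name save_kermit_archives.py is an assumption; SWH_TOKEN
# must hold a Software Heritage API token authorized for save requests):
#
#     SWH_TOKEN=<your-api-token> python save_kermit_archives.py
#
# The script prints the JSON response from the save request as confirmation.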