Page MenuHomeSoftware Heritage
Paste P1074

Tarballs archiving script
ActivePublic

Authored by anlambert on Jun 21 2021, 2:02 PM.
import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
# Page listing the tarballs to archive; it is also used as the origin URL
# when submitting the archives to Software Heritage.
url = "https://www.kermitproject.org/archive.html"
origin_url = url

# File extensions a link must end with to be considered an archive.
supported_extensions = {
    ".tar",
    ".tar.gz",
    ".tgz",
    ".zip",
    ".tar.Z",
    ".tar.x",
    ".tar.lz",
}

# Seconds to wait for any single HTTP request before giving up.
REQUEST_TIMEOUT = 30


def matching_extension(name: str) -> str:
    """Return the longest supported extension *name* ends with, or ""."""
    candidates = [ext for ext in supported_extensions if name.endswith(ext)]
    return max(candidates, key=len, default="")


def artifact_version(archive_link: str) -> str:
    """Derive the artifact version from the file name of *archive_link*.

    The version is the file name with its archive extension removed, so
    dotted names such as ``foo-1.2.3.tar.gz`` keep their full version
    instead of being truncated at the first dot (which made distinct
    releases collide). A name without a supported extension falls back to
    the text before its first dot.
    """
    filename = archive_link.rsplit("/", 1)[-1]
    extension = matching_extension(filename)
    if extension:
        return filename[: -len(extension)]
    return filename.split(".")[0]


def collect_archive_links(page, base_url: str) -> list:
    """Return absolute http(s) URLs of the archive links found in *page*.

    *page* is the parsed HTML document; only anchors that actually carry an
    ``href`` attribute are inspected. Relative links are resolved against
    *base_url* with ``urljoin`` (plain concatenation would append them to
    the page's file name). Links that do not resolve to http(s), e.g.
    ``ftp://``, are dropped because they cannot be probed with ``requests``.
    """
    links = []
    for anchor in page.find_all("a", href=True):
        href = anchor["href"]
        if not matching_extension(href):
            continue
        absolute = urljoin(base_url, href)
        if absolute.startswith(("http://", "https://")):
            links.append(absolute)
    return links


def is_link_reachable(archive_link: str) -> bool:
    """Return True when a HEAD request on *archive_link* succeeds.

    Network failures are reported as "not reachable" so that one dead link
    does not abort the whole run.
    """
    try:
        return requests.head(archive_link, timeout=REQUEST_TIMEOUT).ok
    except requests.RequestException:
        return False


def build_archives_data(archive_links, is_reachable=None) -> list:
    """Build the ``archives_data`` payload from *archive_links*.

    Each reachable link contributes one
    ``{"artifact_url": ..., "artifact_version": ...}`` entry; only the first
    reachable link of a given version is kept. *is_reachable* is the
    predicate used to probe a link and defaults to ``is_link_reachable``.
    """
    if is_reachable is None:
        is_reachable = is_link_reachable
    seen_versions = set()
    archives_data = []
    for archive_link in archive_links:
        version = artifact_version(archive_link)
        # Check for duplicates first: it avoids a needless HEAD request and
        # yields the same result, since a version is only recorded once a
        # reachable link for it has been found.
        if version in seen_versions or not is_reachable(archive_link):
            continue
        seen_versions.add(version)
        archives_data.append(
            {"artifact_url": archive_link, "artifact_version": version}
        )
    return archives_data


def main() -> None:
    """Scrape the archive page and submit its tarballs to Software Heritage.

    Raises:
        KeyError: if the ``SWH_TOKEN`` environment variable is not set.
        requests.HTTPError: if the archive page cannot be fetched.
    """
    # Read the token first so a missing one fails before any network work.
    token = os.environ["SWH_TOKEN"]

    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    page = BeautifulSoup(response.content, features="html.parser")

    archives_data = build_archives_data(collect_archive_links(page, url))

    save_code_now_url = (
        "https://archive.softwareheritage.org"
        f"/api/1/origin/save/archives/url/{origin_url}/"
    )
    headers = {"Authorization": f"Bearer {token}"}
    print(
        requests.post(
            save_code_now_url,
            json={"archives_data": archives_data},
            headers=headers,
            timeout=REQUEST_TIMEOUT,
        ).json()
    )


if __name__ == "__main__":
    main()

Event Timeline

anlambert updated the paste's language from autodetect to python.