import os
from urllib.parse import urljoin

from bs4 import BeautifulSoup
import requests

url = "https://www.kermitproject.org/archive.html"
origin_url = url

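# Fetch the archive index page and parse its HTML.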
response = requests.get(url)

page = BeautifulSoup(response.content, features="html.parser")

# Archive file extensions treated as downloadable source artifacts.
supported_extensions = {
    ".tar",
    ".tar.gz",
    ".tgz",
    ".zip",
    ".tar.Z",
    ".tar.xz",
    ".tar.lz",
}

# Keep every link on the page whose target ends in a supported archive extension.
archive_links = [
    a["href"]
    for a in page.find_all("a", href=True)
    if any(a["href"].endswith(ext) for ext in supported_extensions)
]

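# artifacts tracks which versions have been seen; archives_data is the payload sent to Software Heritage.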
artifacts = {}

archives_data = []

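# Build the list of artifacts to submit, keeping one archive per version.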
for archive_link in archive_links:
    # Resolve relative hrefs against the index page URL; absolute links pass through unchanged.
    archive_link = urljoin(url, archive_link)
    # Skip links that no longer resolve.
    if not requests.head(archive_link).ok:
        continue
    # Use the file name up to its first dot as the artifact version.
    artifact_version = archive_link.split("/")[-1].split(".")[0]
    if artifact_version not in artifacts:
        archives_data.append(
            {"artifact_url": archive_link, "artifact_version": artifact_version}
        )
        artifacts[artifact_version] = archive_link

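# Software Heritage "save code now" endpoint for registering archives under the origin URL.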
save_code_now_url = (
    f"https://archive.softwareheritage.org/api/1/origin/save/archives/url/{origin_url}/"
)

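# Authenticate with a Software Heritage API token read from the SWH_TOKEN environment variable.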
headers = {"Authorization": f"Bearer {os.environ['SWH_TOKEN']}"}

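# Submit the collected artifacts and print the API's JSON response.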
print(
    requests.post(
        save_code_now_url, json={"archives_data": archives_data}, headers=headers
    ).json()
)