Changeset View
Changeset View
Standalone View
Standalone View
listers/sourceforge/sourceforge-ls-projects.py
Show All 15 Lines | |||||
import re | import re | ||||
import requests | import requests | ||||
import xml.etree.ElementTree as ET | import xml.etree.ElementTree as ET | ||||
from pathlib import Path | from pathlib import Path | ||||
SITEMAP_INDEX_URL = "https://sourceforge.net/allura_sitemap/sitemap.xml" | SITEMAP_INDEX_URL = "https://sourceforge.net/allura_sitemap/sitemap.xml" | ||||
PROJ_URL_RE = re.compile("^(https://sourceforge.net/p/)([^/]+)") | PROJ_URL_RE = re.compile("^https://sourceforge.net/([^/]+)/([^/]+)/(.*)") | ||||
CACHE_DIR = Path("~/.cache/swh/sourceforge-lister").expanduser() | CACHE_DIR = Path("~/.cache/swh/sourceforge-lister").expanduser() | ||||
def download(url, use_cache=True): | def download(url, use_cache=True): | ||||
"""URL downloader backed by on-disk cache | """URL downloader backed by on-disk cache | ||||
""" | """ | ||||
cache_name = CACHE_DIR / url.split("/")[-1] | cache_name = CACHE_DIR / url.split("/")[-1] | ||||
Show All 18 Lines | def ls_projects(sitemap_url): | ||||
for map_loc in idx.findall(".//{*}sitemap/{*}loc"): | for map_loc in idx.findall(".//{*}sitemap/{*}loc"): | ||||
map_url = map_loc.text | map_url = map_loc.text | ||||
sub_idx = ET.fromstring(download(map_url)) | sub_idx = ET.fromstring(download(map_url)) | ||||
known_projs = set() # hash()-es of known projects | known_projs = set() # hash()-es of known projects | ||||
for proj in sub_idx.findall(".//{*}url"): | for proj in sub_idx.findall(".//{*}url"): | ||||
proj_url = proj.find("{*}loc").text | proj_url = proj.find("{*}loc").text | ||||
if m := PROJ_URL_RE.match(proj_url): | if m := PROJ_URL_RE.match(proj_url): | ||||
namespace = m.group(1) | |||||
if namespace == "projects": | |||||
# These have `/p/` counterparts | |||||
continue | |||||
proj_name = m.group(2) # base project url | proj_name = m.group(2) # base project url | ||||
rest = m.group(3) | |||||
if rest.count("/") > 1: | |||||
# This is a subproject | |||||
proj_name = f"{proj_name}/{rest.rsplit('/', 2)[0]}" | |||||
h = hash(proj_name) | h = hash(proj_name) | ||||
if h not in known_projs: | if h not in known_projs: | ||||
known_projs.add(h) | known_projs.add(h) | ||||
yield proj_name | yield f"{namespace}/{proj_name}" | ||||
def main(): | def main(): | ||||
logging.basicConfig(level=logging.INFO) | logging.basicConfig(level=logging.INFO) | ||||
for proj_name in ls_projects(SITEMAP_INDEX_URL): | for proj_name in ls_projects(SITEMAP_INDEX_URL): | ||||
print(proj_name) | print(proj_name) | ||||
if __name__ == "__main__": | if __name__ == "__main__": | ||||
main() | main() |