diff --git a/listers/sourceforge/sourceforge-ls-projects.py b/listers/sourceforge/sourceforge-ls-projects.py
new file mode 100755
index 0000000..ed453cf
--- /dev/null
+++ b/listers/sourceforge/sourceforge-ls-projects.py
@@ -0,0 +1,88 @@
+#!/usr/bin/env python3
+
+"""list all SourceForge projects using sitemap(s)
+
+Example:
+
+    $ sourceforge-ls-projects.py | sort > sourceforge-projects.txt
+
+"""
+
+__copyright__ = "Copyright (C) 2020 Stefano Zacchiroli"
+__license__ = "GPL-3.0-or-later"
+
+
+import logging
+import re
+import requests
+import xml.etree.ElementTree as ET
+
+from pathlib import Path
+
+
+SITEMAP_INDEX_URL = "https://sourceforge.net/allura_sitemap/sitemap.xml"
+PROJ_URL_RE = re.compile("^(https://sourceforge.net/p/)([^/]+)")
+CACHE_DIR = Path("~/.cache/swh/sourceforge-lister").expanduser()
+
+
+def download(url, use_cache=True):
+    """Return the content of ``url`` as text, backed by an on-disk cache
+
+    The cache entry is named after the last path component of ``url`` and
+    stored under ``CACHE_DIR``. When ``use_cache`` is false, or no entry
+    exists yet, the URL is fetched and the entry is (re)written.
+
+    Raises ``requests.HTTPError`` on a 4xx/5xx response, so that an error
+    page is never cached in place of a sitemap.
+
+    """
+    cache_name = CACHE_DIR / url.split("/")[-1]
+
+    if not use_cache or not cache_name.exists():
+        logging.info("downloading %s ...", url)
+        r = requests.get(url)
+        r.raise_for_status()
+        CACHE_DIR.mkdir(parents=True, exist_ok=True)
+        # pin the on-disk encoding instead of relying on the locale default
+        cache_name.write_text(r.text, encoding="utf-8")
+
+    return cache_name.read_text(encoding="utf-8")
+
+
+def ls_projects(sitemap_url):
+    """(download and) parse sitemaps to yield unique project names
+
+    ``sitemap_url`` is the URL of a sitemap index; every sitemap it lists is
+    fetched with download() and scanned for URLs matching ``PROJ_URL_RE``.
+    Each project name is yielded once, in order of first appearance.
+
+    """
+    idx = ET.fromstring(download(sitemap_url))
+
+    # shared by all sitemaps, so that a project whose URLs are spread over
+    # several sitemap files is still listed only once
+    known_projs = set()
+    for map_loc in idx.findall(".//{*}sitemap/{*}loc"):
+        sub_idx = ET.fromstring(download(map_loc.text))
+
+        for proj in sub_idx.findall(".//{*}url"):
+            proj_url = proj.findtext("{*}loc")
+            if not proj_url:  # <url> entry without a usable <loc>
+                continue
+            # NOTE: the entry's <lastmod> and <changefreq>, if any, are ignored
+            if m := PROJ_URL_RE.match(proj_url):
+                proj_name = m.group(2)  # path component right after /p/
+                if proj_name not in known_projs:
+                    known_projs.add(proj_name)
+                    yield proj_name
+
+
+def main():
+    """Print the name of every listed SourceForge project, one per line"""
+    logging.basicConfig(level=logging.INFO)
+    for proj_name in ls_projects(SITEMAP_INDEX_URL):
+        print(proj_name)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/listers/sourceforge/sourceforge-ls-tools.py b/listers/sourceforge/sourceforge-ls-tools.py
new file mode 100755
index 0000000..432b540
--- /dev/null
+++ b/listers/sourceforge/sourceforge-ls-tools.py
@@ -0,0 +1,65 @@
+#!/usr/bin/env python3
+
+"""list all SourceForge project "tools", starting from a project list
+
+output format is a list of records, each consisting of TAB-separated fields:
+project_name, tool_name, tool_url
+
+Example:
+
+    $ shuf sourceforge-projects.txt | sourceforge-ls-tools - > sourceforge-tools.csv
+
+"""
+
+__copyright__ = "Copyright (C) 2020 Stefano Zacchiroli"
+__license__ = "GPL-3.0-or-later"
+
+
+import click
+import logging
+
+from concurrent.futures import ThreadPoolExecutor
+from requests_futures.sessions import FuturesSession
+from tqdm import tqdm
+
+
+SF_BASE_URL = "https://sourceforge.net"
+REST_URL_PREFIX = "https://sourceforge.net/rest/p/"
+WORKERS = 8
+
+
+def ls_all_tools(projects):
+    """Print one TAB-separated record per tool of each project in ``projects``
+
+    All REST requests are scheduled up front on a pool of ``WORKERS`` threads
+    and consumed in input order. A project whose response cannot be turned
+    into a tool list is logged as an error and skipped.
+
+    """
+    session = FuturesSession(executor=ThreadPoolExecutor(max_workers=WORKERS))
+    responses = []
+
+    for project in projects:  # schedule requests
+        rest_url = REST_URL_PREFIX + project
+        responses.append((project, session.get(rest_url)))
+
+    for proj_name, res in tqdm(responses):  # extract tools from responses
+        try:
+            for tool in res.result().json()["tools"]:
+                print(proj_name, tool["name"], SF_BASE_URL + tool["url"], sep="\t")
+        except Exception as err:
+            logging.error("cannot list tools for project %s: %s", proj_name, err)
+
+
+@click.command()
+@click.argument("project_list", type=click.File())
+def main(project_list):
+    """List the tools of every project named in PROJECT_LIST (one per line)"""
+    logging.basicConfig(level=logging.INFO, filename="sourceforge-tools.log")
+    # skip blank lines: an empty name would query the bare REST prefix
+    names = (line.rstrip() for line in project_list)
+    ls_all_tools([name for name in names if name])
+
+
+if __name__ == "__main__":
+    main()