diff --git a/listers/sourceforge/sourceforge-ls-projects.py b/listers/sourceforge/sourceforge-ls-projects.py index 8064dc2..7d6510f 100755 --- a/listers/sourceforge/sourceforge-ls-projects.py +++ b/listers/sourceforge/sourceforge-ls-projects.py @@ -1,73 +1,82 @@ #!/usr/bin/env python3 """list all SourceForge projects using sitemap(s) Example: $ sourceforge-ls-projects.py | sort > sourceforge-projects.txt """ __copyright__ = "Copyright (C) 2020 Stefano Zacchiroli" __license__ = "GPL-3.0-or-later" import logging import re import requests import xml.etree.ElementTree as ET from pathlib import Path SITEMAP_INDEX_URL = "https://sourceforge.net/allura_sitemap/sitemap.xml" -PROJ_URL_RE = re.compile("^(https://sourceforge.net/p/)([^/]+)") +PROJ_URL_RE = re.compile("^https://sourceforge.net/([^/]+)/([^/]+)/(.*)") CACHE_DIR = Path("~/.cache/swh/sourceforge-lister").expanduser() def download(url, use_cache=True): """URL downloader backed by on-disk cache """ cache_name = CACHE_DIR / url.split("/")[-1] if not use_cache or not cache_name.exists(): logging.info(f"downloading {url} ...") r = requests.get(url) CACHE_DIR.mkdir(parents=True, exist_ok=True) with open(cache_name, "w") as f: f.write(r.text) with open(cache_name) as f: return f.read() def ls_projects(sitemap_url): """(download and) parse sitemaps to extract project URLs """ idx = ET.fromstring(download(SITEMAP_INDEX_URL)) for map_loc in idx.findall(".//{*}sitemap/{*}loc"): map_url = map_loc.text sub_idx = ET.fromstring(download(map_url)) known_projs = set() # hash()-es of known projects for proj in sub_idx.findall(".//{*}url"): proj_url = proj.find("{*}loc").text if m := PROJ_URL_RE.match(proj_url): + namespace = m.group(1) + if namespace == "projects": + # These have a `/p/` counterparts + continue proj_name = m.group(2) # base project url + rest = m.group(3) + if rest.count("/") > 1: + # This is a subproject + proj_name = f"{proj_name}/{rest.rsplit('/', 2)[0]}" + h = hash(proj_name) if h not in known_projs: known_projs.add(h) - yield proj_name + yield f"{namespace}/{proj_name}" def main(): logging.basicConfig(level=logging.INFO) for proj_name in ls_projects(SITEMAP_INDEX_URL): print(proj_name) if __name__ == "__main__": main() diff --git a/listers/sourceforge/sourceforge-ls-tools.py b/listers/sourceforge/sourceforge-ls-tools.py index 432b540..325fee1 100755 --- a/listers/sourceforge/sourceforge-ls-tools.py +++ b/listers/sourceforge/sourceforge-ls-tools.py @@ -1,55 +1,55 @@ #!/usr/bin/env python3 -"""list all SourceForge project "tools", starting from a project list +"""list all SourceForge project "tools", starting from a namespaced project list output format is a list of records, each consisting of TAB-separated fields: -project_name, tool_name, tool_url +namespaced_project_name, tool_name, tool_url Example: $ shuf sourceforge-projects.txt | sourceforge-ls-tools - > sourceforge-tools.csv """ __copyright__ = "Copyright (C) 2020 Stefano Zacchiroli" __license__ = "GPL-3.0-or-later" import click import logging from concurrent.futures import ThreadPoolExecutor from requests_futures.sessions import FuturesSession from tqdm import tqdm SF_BASE_URL = "https://sourceforge.net" -REST_URL_PREFIX = "https://sourceforge.net/rest/p/" +REST_URL_PREFIX = "https://sourceforge.net/rest/" WORKERS = 8 def ls_all_tools(projects): session = FuturesSession(executor=ThreadPoolExecutor(max_workers=WORKERS)) responses = [] - for project in projects: # schedule requests - rest_url = REST_URL_PREFIX + project - responses.append((project, session.get(rest_url))) + for namespaced_project in projects: # schedule requests + rest_url = REST_URL_PREFIX + namespaced_project + responses.append((namespaced_project, session.get(rest_url))) for proj_name, res in tqdm(responses): # extract tools from responses try: for tool in res.result().json()["tools"]: print(proj_name, tool["name"], SF_BASE_URL + tool["url"], sep="\t") except Exception as err: logging.error(f"cannot list tools for project {proj_name}: {err}") @click.command() @click.argument("project_list", type=click.File()) def main(project_list): logging.basicConfig(level=logging.INFO, filename="sourceforge-tools.log") ls_all_tools([line.rstrip() for line in project_list]) if __name__ == "__main__": main()