Page MenuHomeSoftware Heritage

No OneTemporary

diff --git a/listers/sourceforge/sourceforge-ls-projects.py b/listers/sourceforge/sourceforge-ls-projects.py
new file mode 100755
index 0000000..ed453cf
--- /dev/null
+++ b/listers/sourceforge/sourceforge-ls-projects.py
@@ -0,0 +1,76 @@
+#!/usr/bin/env python3
+
+"""list all SourceForge projects using sitemap(s)
+
+Example:
+
+ $ sourceforge-ls-projects.py | sort > sourceforge-projects.txt
+
+"""
+
+__copyright__ = "Copyright (C) 2020 Stefano Zacchiroli"
+__license__ = "GPL-3.0-or-later"
+
+
+import logging
+import re
+import requests
+import xml.etree.ElementTree as ET
+
+from pathlib import Path
+
+
+# Sitemap index: lists the URLs of the individual sitemaps to scan.
+SITEMAP_INDEX_URL = "https://sourceforge.net/allura_sitemap/sitemap.xml"
+# Group 1 is the fixed ".../p/" prefix, group 2 the project name (the first
+# path component after it).  Raw string with escaped dots, so that "." in
+# the host name only matches a literal dot.
+PROJ_URL_RE = re.compile(r"^(https://sourceforge\.net/p/)([^/]+)")
+# Directory holding the on-disk cache of downloaded sitemaps.
+CACHE_DIR = Path("~/.cache/swh/sourceforge-lister").expanduser()
+
+
+def download(url, use_cache=True, timeout=60):
+    """Return the body of `url` as text, backed by an on-disk cache.
+
+    The cache entry lives in CACHE_DIR and is named after the last path
+    component of `url`.
+
+    Args:
+        url: URL to fetch.
+        use_cache: when true (the default) an existing cache entry is
+            returned without touching the network; when false the URL is
+            always re-downloaded and the cache entry overwritten.
+        timeout: seconds to wait for the server before giving up.
+
+    Returns:
+        The document text.
+
+    Raises:
+        requests.RequestException: on network failure, timeout or an HTTP
+            error status.  Nothing is written to the cache in that case.
+    """
+    cache_name = CACHE_DIR / url.split("/")[-1]
+
+    if use_cache and cache_name.exists():
+        return cache_name.read_text(encoding="utf-8")
+
+    logging.info("downloading %s ...", url)
+    r = requests.get(url, timeout=timeout)
+    # Fail loudly instead of caching an HTTP error page: a poisoned cache
+    # entry would otherwise be served on every later run.
+    r.raise_for_status()
+
+    CACHE_DIR.mkdir(parents=True, exist_ok=True)
+    # Explicit encoding so the cache does not depend on the current locale.
+    cache_name.write_text(r.text, encoding="utf-8")
+    return r.text
+
+
+def ls_projects(sitemap_url):
+    """Yield the name of every project reachable from a sitemap index.
+
+    The index at `sitemap_url` is fetched through `download` (and is
+    therefore cached), then every sitemap it links to is fetched the same
+    way and scanned for URLs matching PROJ_URL_RE.
+
+    Args:
+        sitemap_url: URL of the sitemap index to start from.
+
+    Yields:
+        Project names (the path component right after "/p/"), each one at
+        most once, in order of first appearance.
+    """
+    idx = ET.fromstring(download(sitemap_url))
+
+    # A single set for the whole run, so a project listed in more than one
+    # sitemap is still reported once.  Names are stored as-is rather than
+    # as hash() values, so a hash collision cannot hide a project.
+    known_projs = set()
+
+    for map_loc in idx.findall(".//{*}sitemap/{*}loc"):
+        if not map_loc.text:
+            continue  # <loc> without a URL: nothing to download
+        sub_idx = ET.fromstring(download(map_loc.text.strip()))
+
+        for proj in sub_idx.findall(".//{*}url"):
+            proj_url = proj.findtext("{*}loc")
+            if not proj_url:
+                continue  # <url> entry without a usable <loc>
+            if m := PROJ_URL_RE.match(proj_url.strip()):
+                proj_name = m.group(2)  # several URLs share one project
+                if proj_name not in known_projs:
+                    known_projs.add(proj_name)
+                    yield proj_name
+
+
+def main():
+    """Print each project name found via SITEMAP_INDEX_URL, one per line."""
+    logging.basicConfig(level=logging.INFO)
+    names = ls_projects(SITEMAP_INDEX_URL)
+    for name in names:
+        print(name)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/listers/sourceforge/sourceforge-ls-tools.py b/listers/sourceforge/sourceforge-ls-tools.py
new file mode 100755
index 0000000..432b540
--- /dev/null
+++ b/listers/sourceforge/sourceforge-ls-tools.py
@@ -0,0 +1,55 @@
+#!/usr/bin/env python3
+
+"""list all SourceForge project "tools", starting from a project list
+
+output format is a list of records, each consisting of TAB-separated fields:
+project_name, tool_name, tool_url
+
+Example:
+
+ $ shuf sourceforge-projects.txt | sourceforge-ls-tools.py - > sourceforge-tools.csv
+
+"""
+
+__copyright__ = "Copyright (C) 2020 Stefano Zacchiroli"
+__license__ = "GPL-3.0-or-later"
+
+
+import click
+import logging
+
+from concurrent.futures import ThreadPoolExecutor
+from requests_futures.sessions import FuturesSession
+from tqdm import tqdm
+
+
+# Prepended to each tool's "url" field to build the tool URL that is printed.
+SF_BASE_URL = "https://sourceforge.net"
+# A project's REST endpoint is this prefix followed by the project name.
+REST_URL_PREFIX = "https://sourceforge.net/rest/p/"
+# Size of the thread pool that issues the HTTP requests.
+WORKERS = 8
+
+
+def ls_all_tools(projects, timeout=60):
+    """Print one TAB-separated record per tool of each given project.
+
+    For every project name the REST endpoint REST_URL_PREFIX + name is
+    queried.  Each entry of the "tools" list in the JSON reply is printed
+    to stdout as: project name, tool name, tool URL (SF_BASE_URL followed
+    by the tool's "url" field).  A project whose lookup fails is logged
+    and skipped; it does not stop the run.
+
+    Args:
+        projects: iterable of project names; blank names are ignored.
+        timeout: seconds to wait on each HTTP request, so that a stalled
+            connection cannot block the run forever.
+    """
+    pool = ThreadPoolExecutor(max_workers=WORKERS)
+    # Context managers release the HTTP session and the worker threads
+    # even when the loop below is interrupted.
+    with pool, FuturesSession(executor=pool) as session:
+        # Schedule every request up front; at most WORKERS run at a time.
+        responses = [
+            (project, session.get(REST_URL_PREFIX + project, timeout=timeout))
+            for project in projects
+            if project.strip()  # a blank name would query the REST root
+        ]
+
+        for proj_name, res in tqdm(responses):  # results in submission order
+            try:
+                reply = res.result()
+                # Turn HTTP errors into a clear message instead of an
+                # obscure JSON decoding error or KeyError further down.
+                reply.raise_for_status()
+                for tool in reply.json()["tools"]:
+                    print(
+                        proj_name, tool["name"], SF_BASE_URL + tool["url"], sep="\t"
+                    )
+            except Exception as err:
+                # Deliberately broad: one bad project must not abort the run.
+                logging.error("cannot list tools for project %s: %s", proj_name, err)
+
+
+# Command-line entry point.  PROJECT_LIST is opened by click and read as one
+# project name per line.  Documented with comments rather than a docstring
+# so that the text shown by --help stays unchanged.
+@click.command()
+@click.argument("project_list", type=click.File())
+def main(project_list):
+    logging.basicConfig(filename="sourceforge-tools.log", level=logging.INFO)
+    project_names = [entry.rstrip() for entry in project_list]
+    ls_all_tools(project_names)
+
+
+if __name__ == "__main__":
+ main()

File Metadata

Mime Type
text/x-diff
Expires
Fri, Jul 4, 3:30 PM (1 w, 1 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3214842

Event Timeline