Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F9345765
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
4 KB
Subscribers
None
View Options
diff --git a/listers/sourceforge/sourceforge-ls-projects.py b/listers/sourceforge/sourceforge-ls-projects.py
new file mode 100755
index 0000000..ed453cf
--- /dev/null
+++ b/listers/sourceforge/sourceforge-ls-projects.py
@@ -0,0 +1,76 @@
+#!/usr/bin/env python3
+
+"""list all SourceForge projects using sitemap(s)
+
+Example:
+
+ $ sourceforge-ls-projects.py | sort > sourceforge-projects.txt
+
+"""
+
+__copyright__ = "Copyright (C) 2020 Stefano Zacchiroli"
+__license__ = "GPL-3.0-or-later"
+
+
+import logging
+import re
+import requests
+import xml.etree.ElementTree as ET
+
+from pathlib import Path
+
+
+# entry point of the crawl: the sitemap index, which lists the real sitemaps
+SITEMAP_INDEX_URL = "https://sourceforge.net/allura_sitemap/sitemap.xml"
+# group 2 captures the path component right after "/p/"; the dots are escaped
+# so that they match a literal "." only, not any character
+PROJ_URL_RE = re.compile(r"^(https://sourceforge\.net/p/)([^/]+)")
+# directory holding downloaded documents, below the user's home directory
+CACHE_DIR = Path("~/.cache/swh/sourceforge-lister").expanduser()
+
+
+def download(url, use_cache=True):
+    """Return the body of ``url`` as text, going through an on-disk cache.
+
+    The cache entry lives in ``CACHE_DIR`` and is named after the last
+    "/"-separated component of ``url``; two URLs ending in the same
+    component therefore share one entry.
+
+    Args:
+        url: address of the document to fetch.
+        use_cache: when false, always re-download and overwrite the entry.
+
+    Returns:
+        The document text, read back from the cache file.
+
+    Raises:
+        requests.HTTPError: if the server answers with an error status;
+            nothing is written to the cache in that case.
+    """
+    cache_name = CACHE_DIR / url.split("/")[-1]
+
+    if not use_cache or not cache_name.exists():
+        logging.info(f"downloading {url} ...")
+        # without a timeout a stalled connection would block the run forever
+        r = requests.get(url, timeout=60)
+        # never cache an error page: it would be served back on every
+        # later run as if it were the requested document
+        r.raise_for_status()
+        CACHE_DIR.mkdir(parents=True, exist_ok=True)
+        # explicit encoding: do not depend on the locale of the caller
+        with open(cache_name, "w", encoding="utf-8") as f:
+            f.write(r.text)
+
+    with open(cache_name, encoding="utf-8") as f:
+        return f.read()
+
+
+def ls_projects(sitemap_url):
+    """(download and) parse sitemaps to extract project names
+
+    Args:
+        sitemap_url: URL of the sitemap index; every sitemap it points to
+            is fetched with download() and scanned for project URLs.
+
+    Yields:
+        The name of each project found, i.e. the path component following
+        "/p/" in a URL matching PROJ_URL_RE, once per distinct name.
+    """
+    idx = ET.fromstring(download(sitemap_url))
+
+    # shared by all sitemaps, so that a project listed in more than one of
+    # them is still reported only once; names (not their hash()-es) are
+    # stored so that a hash collision cannot hide a project
+    known_projs = set()
+    for map_loc in idx.findall(".//{*}sitemap/{*}loc"):
+        map_url = (map_loc.text or "").strip()
+        if not map_url:
+            continue  # <loc> without a URL: nothing to download
+        sub_idx = ET.fromstring(download(map_url))
+
+        # only <loc> is used; the commented-out code this replaces also read
+        # <lastmod> and <changefreq> from each <url> entry
+        for proj in sub_idx.findall(".//{*}url"):
+            proj_url = (proj.findtext("{*}loc") or "").strip()
+            if m := PROJ_URL_RE.match(proj_url):
+                proj_name = m.group(2)  # first path component after /p/
+                if proj_name not in known_projs:
+                    known_projs.add(proj_name)
+                    yield proj_name
+
+
+def main():
+    """Print every project name yielded by ls_projects(), one per line."""
+    logging.basicConfig(level=logging.INFO)
+    project_names = ls_projects(SITEMAP_INDEX_URL)
+    for name in project_names:
+        print(name)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/listers/sourceforge/sourceforge-ls-tools.py b/listers/sourceforge/sourceforge-ls-tools.py
new file mode 100755
index 0000000..432b540
--- /dev/null
+++ b/listers/sourceforge/sourceforge-ls-tools.py
@@ -0,0 +1,55 @@
+#!/usr/bin/env python3
+
+"""list all SourceForge project "tools", starting from a project list
+
+output format is a list of records, each consisting of TAB-separated fields:
+project_name, tool_name, tool_url
+
+Example:
+
+ $ shuf sourceforge-projects.txt | sourceforge-ls-tools.py - > sourceforge-tools.csv
+
+"""
+
+__copyright__ = "Copyright (C) 2020 Stefano Zacchiroli"
+__license__ = "GPL-3.0-or-later"
+
+
+import click
+import logging
+
+from concurrent.futures import ThreadPoolExecutor
+from requests_futures.sessions import FuturesSession
+from tqdm import tqdm
+
+
+SF_BASE_URL = "https://sourceforge.net"  # prepended to the tool URLs
+REST_URL_PREFIX = f"{SF_BASE_URL}/rest/p/"  # + project name = REST endpoint
+WORKERS = 8  # size of the thread pool issuing the HTTP requests
+
+
+def ls_all_tools(projects):
+    """Print one TAB-separated record per tool of each given project.
+
+    Each record holds the project name, the tool name and the tool URL
+    (SF_BASE_URL followed by the "url" field of the tool). The data comes
+    from the JSON document served at REST_URL_PREFIX + project name.
+
+    Args:
+        projects: iterable of project names; empty names are skipped.
+
+    A project whose tools cannot be listed (failed request, error status,
+    unexpected payload) is reported through logging.error() and skipped,
+    so one bad project does not abort the whole run.
+    """
+    # both context managers make sure the worker threads and the HTTP
+    # connections are released once every response has been consumed
+    with ThreadPoolExecutor(max_workers=WORKERS) as pool:
+        with FuturesSession(executor=pool) as session:
+            # schedule all requests up front; the pool works through them
+            # in the background, WORKERS at a time
+            responses = [
+                (project, session.get(REST_URL_PREFIX + project))
+                for project in projects
+                if project  # a blank input line is not a project name
+            ]
+
+            for proj_name, res in tqdm(responses):  # extract tools
+                try:
+                    reply = res.result()
+                    # report the HTTP status itself rather than the
+                    # KeyError caused by an error payload without "tools"
+                    reply.raise_for_status()
+                    for tool in reply.json()["tools"]:
+                        print(
+                            proj_name,
+                            tool["name"],
+                            SF_BASE_URL + tool["url"],
+                            sep="\t",
+                        )
+                except Exception as err:  # best effort: log and move on
+                    logging.error(
+                        f"cannot list tools for project {proj_name}: {err}"
+                    )
+
+
+@click.command()
+@click.argument("project_list", type=click.File())
+def main(project_list):
+    # NOTE: documented with a comment on purpose; click would turn a
+    # docstring into the --help text of the command.
+    # Read project names (one per line, trailing whitespace removed) from
+    # the given file and hand them to ls_all_tools(); log messages go to
+    # a file rather than to the console.
+    logging.basicConfig(level=logging.INFO, filename="sourceforge-tools.log")
+    project_names = list(map(str.rstrip, project_list))
+    ls_all_tools(project_names)
+
+
+if __name__ == "__main__":
+    main()
File Metadata
Details
Attached
Mime Type
text/x-diff
Expires
Fri, Jul 4, 3:30 PM (1 w, 14 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3214842
Attached To
rDSNIP Code snippets
Event Timeline
Log In to Comment