diff --git a/listers/sourceforge/sourceforge-ls-projects.py b/listers/sourceforge/sourceforge-ls-projects.py --- a/listers/sourceforge/sourceforge-ls-projects.py +++ b/listers/sourceforge/sourceforge-ls-projects.py @@ -21,7 +21,7 @@ SITEMAP_INDEX_URL = "https://sourceforge.net/allura_sitemap/sitemap.xml" -PROJ_URL_RE = re.compile("^(https://sourceforge.net/p/)([^/]+)") +PROJ_URL_RE = re.compile("^https://sourceforge.net/([^/]+)/([^/]+)/(.*)") CACHE_DIR = Path("~/.cache/swh/sourceforge-lister").expanduser() @@ -56,11 +56,20 @@ for proj in sub_idx.findall(".//{*}url"): proj_url = proj.find("{*}loc").text if m := PROJ_URL_RE.match(proj_url): + namespace = m.group(1) + if namespace == "projects": + # These have `/p/` counterparts + continue proj_name = m.group(2) # base project url + rest = m.group(3) + if rest.count("/") > 1: + # This is a subproject + proj_name = f"{proj_name}/{rest.rsplit('/', 2)[0]}" + h = hash(proj_name) if h not in known_projs: known_projs.add(h) - yield proj_name + yield f"{namespace}/{proj_name}" def main(): diff --git a/listers/sourceforge/sourceforge-ls-tools.py b/listers/sourceforge/sourceforge-ls-tools.py --- a/listers/sourceforge/sourceforge-ls-tools.py +++ b/listers/sourceforge/sourceforge-ls-tools.py @@ -1,9 +1,9 @@ #!/usr/bin/env python3 -"""list all SourceForge project "tools", starting from a project list +"""list all SourceForge project "tools", starting from a namespaced project list output format is a list of records, each consisting of TAB-separated fields: -project_name, tool_name, tool_url +namespaced_project_name, tool_name, tool_url Example: @@ -24,7 +24,7 @@ SF_BASE_URL = "https://sourceforge.net" -REST_URL_PREFIX = "https://sourceforge.net/rest/p/" +REST_URL_PREFIX = "https://sourceforge.net/rest/" WORKERS = 8 @@ -32,9 +32,9 @@ session = FuturesSession(executor=ThreadPoolExecutor(max_workers=WORKERS)) responses = [] - for project in projects: # schedule requests - rest_url = REST_URL_PREFIX + project -
responses.append((project, session.get(rest_url))) + for namespaced_project in projects: # schedule requests + rest_url = REST_URL_PREFIX + namespaced_project + responses.append((namespaced_project, session.get(rest_url))) for proj_name, res in tqdm(responses): # extract tools from responses try: