diff --git a/mypy.ini b/mypy.ini --- a/mypy.ini +++ b/mypy.ini @@ -22,6 +22,9 @@ [mypy-lazr.*] ignore_missing_imports = True +[mypy-lxml.*] +ignore_missing_imports = True + [mypy-pkg_resources.*] ignore_missing_imports = True diff --git a/requirements.txt b/requirements.txt --- a/requirements.txt +++ b/requirements.txt @@ -6,3 +6,4 @@ launchpadlib tenacity >= 6.2 xmltodict +lxml diff --git a/swh/lister/sourceforge/lister.py b/swh/lister/sourceforge/lister.py --- a/swh/lister/sourceforge/lister.py +++ b/swh/lister/sourceforge/lister.py @@ -1,7 +1,8 @@ -# Copyright (C) 2021 The Software Heritage developers +# Copyright (C) 2021-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information + from dataclasses import dataclass, field import datetime from enum import Enum @@ -12,6 +13,7 @@ from bs4 import BeautifulSoup import iso8601 +import lxml import requests from tenacity.before_sleep import before_sleep_log @@ -172,7 +174,7 @@ r".*\.code\.sf\.net/(?P<namespace>[^/]+)/(?P<project>.+)/.*" ) bzr_url_match = re.compile( - r"http://(?P<project>[^/]+).bzr.sourceforge.net/bzrroot/([^/]+)" + r"http://(?P<project>[^/]+).bzr.sourceforge.net/bzr/([^/]+)" ) cvs_url_match = re.compile( r"rsync://a.cvs.sourceforge.net/cvsroot/(?P<project>.+)/([^/]+)" @@ -410,7 +412,37 @@ # SourceForge has removed support for bzr and only keeps legacy projects # around at a separate (also not https) URL. Bzr projects are very rare # and a lot of them are 404 now. - url = f"http://{project}.bzr.sourceforge.net/bzrroot/{project}" + url = f"http://{project}.bzr.sourceforge.net/bzr/{project}" + try: + response = self.page_request(url, params={}) + if "To get this branch, use:" not in response.text: + # If a bzr project has multiple branches, we need to extract their + # names from the repository landing page and create one listed origin + # per branch + parser = lxml.etree.HTMLParser() + tree = lxml.etree.fromstring(response.text, parser) + + # Get all tds with class 'autcell' + tds = tree.xpath(".//td[contains(@class, 'autcell')]") + for td in tds: + branch = td.findtext("a") + # If the td's parent contains <img alt="Branch"/> and + # it has non-empty text: + if td.xpath("..//img[@alt='Branch']") and branch: + hits.append( + SourceForgeListerEntry( + vcs=VcsNames(tool_name), + url=f"{url}/{branch}", + last_modified=last_modified, + ) + ) + continue + except requests.HTTPError: + logger.warning( + "Bazaar repository page could not be fetched, skipping project '%s'", + project, + ) + continue entry = SourceForgeListerEntry( vcs=VcsNames(tool_name), url=url, last_modified=last_modified ) diff --git a/swh/lister/sourceforge/tests/data/bzr-repo.json b/swh/lister/sourceforge/tests/data/bzr-repo.json deleted file mode 100644 --- a/swh/lister/sourceforge/tests/data/bzr-repo.json +++ /dev/null @@ -1,53 +0,0 @@ -{ - "shortname": "bzr-repo", - "name": "Bazaar repo", - "_id": "4bf3fc291be1ce2f10000052", - "url": "https://sourceforge.net/p/bzr-repo/", - "private": false, - "short_description": "This is an example bzr project", - "creation_date": "2009-10-10", - "summary": "", - "external_homepage": "", - "video_url": "", - "socialnetworks": [], - "status": "active", - "moved_to_url": "", - "preferred_support_tool": "", - "preferred_support_url": "", - "developers": [ - { - "username": "Alphare", - "name": "Raphaël Gomès", - "url": "https://sourceforge.net/u/alphare/" - } - ], - "tools": [ - { - "name": "bzr", - "mount_point": "bzr", - "url": "/p/bzr-repo/bazaar/", - "icons": { - "24": "images/code_24.png", - "32": "images/code_32.png", - "48": "images/code_48.png" - }, - "installable": true, - "tool_label": "Bazaar", - "mount_label": "Bazaar" - } - ], - "labels": [], - "categories": { - "audience": [], - "developmentstatus": [], - "environment": [], - "language": [], - "license": [], - "translation": [], - "os": [], - "database": [], - "topic": [] - }, - "icon_url": null, - "screenshots": [] - } diff --git a/swh/lister/sourceforge/tests/data/ocaml-lpd.html b/swh/lister/sourceforge/tests/data/ocaml-lpd.html new file mode 100644 --- /dev/null +++ b/swh/lister/sourceforge/tests/data/ocaml-lpd.html @@ -0,0 +1,106 @@ +<?xml version="1.0"?> +<html xmlns="http://www.w3.org/1999/xhtml"> +<head> +<title>SourceForge: Browsing /ocaml-lpd</title> +<link href="/bzr/static/css/global.css" rel="stylesheet" /> +<meta content="text/html; charset=utf-8" http-equiv="content-type" /> +<meta content="The world's largest development and download repository of Open Source code and applications" name="description" /> +<meta content="Open Source, Development, Developers, Projects, Downloads, OSTG, VA Software, SF.net, SourceForge" name="keywords" /> +<link href="/bzr/static/css/sf.css" rel="stylesheet" /> + +</head> +<body> +<div id="doc3" class="yui-t6 login"> +<div id="hd"> +<div class="yui-gf"> +<div class="yui-u first"> +<h1> +<a href="https://sourceforge.net/" title=""> +SourceForge.net +</a> +</h1> +<ul class="jump"> +<li> +<a href="#content">Jump to main content</a> +</li> +</ul> +</div> +<div class="yui-u"> +<a href="https://sourceforge.net/support" title="Get help and support on SourceForge.net">Help +</a> +</div> +</div> +</div> +</div> +<div id="loggerheadCont"> +<h1> +Browsing +<span class="breadcrumb"> +<a href="/bzr/">(root)</a><span>/</span><a href="/bzr/ocaml-lpd">ocaml-lpd</a> +</span> +</h1> +<div> +<table id="logentries"> +<tr class="logheader"> +<th colspan="2" class="summarycell">Filename</th> +<th class="datecell">Latest Rev</th> +<th class="datecell">Last Changed</th> +</tr> +<tr class="blueRow0"> +<td class="icocell"> +<a href="../"><img src="/bzr/static/images/ico_folder_up.gif" /></a> +</td> +<td colspan="3" class="summcell"> +<a href="../">..</a> +</td> +</tr> + + + +<tr class="blueRow0"> +<td class="icocell"> +<a href="backup.bzr.~1~/"> +<img src="/bzr/static/images/ico_folder.gif" alt="Folder" /> +</a> +</td> +<td class="autcell"> +<a href="backup.bzr.~1~/">backup.bzr.~1~</a></td> +<td class="date"></td> +<td class="date"></td> +</tr> + + + +<tr class="blueRow1"> +<td class="icocell"> +<a href="trunk/files"> +<img src="/bzr/static/images/ico_branch.gif" alt="Branch" /> +</a> +</td> +<td class="autcell"> +<a href="trunk/files">trunk</a></td> +<td class="date"> +<a href="trunk/revision/13" title="Show revision 13">13</a> +</td> +<td class="date">2011-04-17 22:02:29</td> +</tr> + + + +</table> +</div> +</div> +<hr /> +<div id="ft"> +<div class="yui-g divider"> +<div class="yui-u first copyright"> +©Copyright 2017 - +Slashdot Media. All Rights Reserved +</div> +<div class="yui-u"> +<a href="https://sourceforge.net/support">Help</a> +</div> +</div> +</div> +</body> +</html> \ No newline at end of file diff --git a/swh/lister/sourceforge/tests/data/ocaml-lpd.json b/swh/lister/sourceforge/tests/data/ocaml-lpd.json new file mode 100644 --- /dev/null +++ b/swh/lister/sourceforge/tests/data/ocaml-lpd.json @@ -0,0 +1,201 @@ +{ + "shortname": "ocaml-lpd", + "name": "Lpd OCaml library", + "_id": "50c63c70e88f3d0bf07d4c6d", + "url": "https://sourceforge.net/p/ocaml-lpd/", + "private": false, + "short_description": "OCaml Lpd is a Line Printer Daemon (LPD) server library written in OCaml. This project moved to OCamlForge https://forge.ocamlcore.org/projects/lpd/", + "creation_date": "2005-02-23", + "summary": "", + "external_homepage": "http://lpd.forge.ocamlcore.org/", + "video_url": "", + "socialnetworks": [], + "status": "moved", + "moved_to_url": "https://forge.ocamlcore.org/projects/lpd/", + "preferred_support_tool": "", + "preferred_support_url": "", + "developers": [ + { + "username": "chris_77", + "name": "ChriS", + "url": "https://sourceforge.net/u/chris_77/" + } + ], + "tools": [ + { + "name": "files-sf", + "mount_point": "files", + "url": "/p/ocaml-lpd/files/", + "icons": { + "24": "images/downloads_24.png", + "32": "images/downloads_32.png", + "48": "images/downloads_48.png" + }, + "installable": false, + "tool_label": "Files", + "mount_label": "Files" + }, + { + "name": "mailman", + "mount_point": "mailman", + "url": "/p/ocaml-lpd/mailman/", + "icons": { + "24": "images/forums_24.png", + "32": "images/forums_32.png", + "48": "images/forums_48.png" + }, + "installable": false, + "tool_label": "Mailing Lists", + "mount_label": "Mailing Lists" + }, + { + "name": "bzr", + "mount_point": "code", + "url": "/p/ocaml-lpd/code/", + "icons": { + "24": "images/code_24.png", + "32": "images/code_32.png", + "48": "images/code_48.png" + }, + "installable": false, + "tool_label": "BZR", + "mount_label": "Code" + }, + { + "name": "summary", + "mount_point": "summary", + "url": "/p/ocaml-lpd/summary/", + "icons": { + "24": "images/sftheme/24x24/blog_24.png", + "32": "images/sftheme/32x32/blog_32.png", + "48": "images/sftheme/48x48/blog_48.png" + }, + "installable": false, + "tool_label": "Summary", + "mount_label": "Summary", + "sourceforge_group_id": 132212 + }, + { + "name": "wiki", + "mount_point": "wiki", + "url": "/p/ocaml-lpd/wiki/", + "icons": { + "24": "images/wiki_24.png", + "32": "images/wiki_32.png", + "48": "images/wiki_48.png" + }, + "installable": true, + "tool_label": "Wiki", + "mount_label": "Wiki" + }, + { + "name": "reviews", + "mount_point": "reviews", + "url": "/p/ocaml-lpd/reviews/", + "icons": { + "24": "images/sftheme/24x24/blog_24.png", + "32": "images/sftheme/32x32/blog_32.png", + "48": "images/sftheme/48x48/blog_48.png" + }, + "installable": false, + "tool_label": "Reviews", + "mount_label": "Reviews" + }, + { + "name": "support", + "mount_point": "support", + "url": "/p/ocaml-lpd/support/", + "icons": { + "24": "images/sftheme/24x24/blog_24.png", + "32": "images/sftheme/32x32/blog_32.png", + "48": "images/sftheme/48x48/blog_48.png" + }, + "installable": false, + "tool_label": "Support", + "mount_label": "Support" + }, + { + "name": "activity", + "mount_point": "activity", + "url": "/p/ocaml-lpd/activity/", + "icons": { + "24": "images/admin_24.png", + "32": "images/admin_32.png", + "48": "images/admin_48.png" + }, + "installable": false, + "tool_label": "Tool", + "mount_label": "Activity" + } + ], + "labels": [], + "categories": { + "audience": [ + { + "id": 3, + "shortname": "developers", + "fullname": "Developers", + "fullpath": "Intended Audience :: by End-User Class :: Developers" + } + ], + "developmentstatus": [ + { + "id": 11, + "shortname": "production", + "fullname": "5 - Production/Stable", + "fullpath": "Development Status :: 5 - Production/Stable" + } + ], + "environment": [ + { + "id": 238, + "shortname": "daemon", + "fullname": "Non-interactive (Daemon)", + "fullpath": "User Interface :: Non-interactive (Daemon)" + } + ], + "language": [ + { + "id": 454, + "shortname": "ocaml", + "fullname": "OCaml (Objective Caml)", + "fullpath": "Programming Language :: OCaml (Objective Caml)" + } + ], + "license": [ + { + "id": 16, + "shortname": "lgpl", + "fullname": "GNU Library or Lesser General Public License version 2.0 (LGPLv2)", + "fullpath": "License :: OSI-Approved Open Source :: GNU Library or Lesser General Public License version 2.0 (LGPLv2)" + } + ], + "translation": [ + { + "id": 275, + "shortname": "english", + "fullname": "English", + "fullpath": "Translations :: English" + } + ], + "os": [ + { + "id": 436, + "shortname": "os_portable", + "fullname": "OS Portable (Source code to work with many OS platforms)", + "fullpath": "Operating System :: Grouping and Descriptive Categories :: OS Portable (Source code to work with many OS platforms)" + } + ], + "database": [], + "topic": [ + { + "id": 154, + "shortname": "printing", + "fullname": "Printing", + "fullpath": "Topic :: Printing" + } + ] + }, + "icon_url": null, + "screenshots": [] +} \ No newline at end of file diff --git a/swh/lister/sourceforge/tests/data/subsitemap-1.xml b/swh/lister/sourceforge/tests/data/subsitemap-1.xml --- a/swh/lister/sourceforge/tests/data/subsitemap-1.xml +++ b/swh/lister/sourceforge/tests/data/subsitemap-1.xml @@ -41,8 +41,13 @@ <changefreq>daily</changefreq> </url> <url> - <loc>https://sourceforge.net/p/bzr-repo/</loc> - <lastmod>2021-01-27</lastmod> + <loc>https://sourceforge.net/p/t12eksandbox/</loc> + <lastmod>2011-02-09</lastmod> + <changefreq>daily</changefreq> +</url> +<url> + <loc>https://sourceforge.net/p/ocaml-lpd/</loc> + <lastmod>2011-04-17</lastmod> <changefreq>daily</changefreq> </url> </urlset> diff --git a/swh/lister/sourceforge/tests/data/t12eksandbox.html b/swh/lister/sourceforge/tests/data/t12eksandbox.html new file mode 100644 --- /dev/null +++ b/swh/lister/sourceforge/tests/data/t12eksandbox.html @@ -0,0 +1,274 @@ +<?xml version="1.0"?> +<html xmlns="http://www.w3.org/1999/xhtml"> +<head> +<meta content="text/html; charset=utf-8" http-equiv="Content-Type" /> +<meta content="Loggerhead/1.18.1 Python/2.7.5 Bazaar/2.5.1 Paste/1.7.5.1 PasteDeploy/1.5.0 SimpleTAL/4.3 simplejson/3.10.0" name="generator" /> +<title>/t12eksandbox : changes</title> + +<meta content="text/html; charset=utf-8" http-equiv="content-type" /> +<meta content="The world's largest development and download repository of Open Source code and applications" name="description" /> +<meta content="Open Source, Development, Developers, Projects, Downloads, OSTG, VA Software, SF.net, SourceForge" name="keywords" /> +<link href="/bzr/static/css/sf.css" rel="stylesheet" /> +</head> +<body> +<div id="doc3" class="yui-t6 login"> +<div id="hd"> +<div class="yui-gf"> +<div class="yui-u first"> +<h1> +<a href="https://sourceforge.net/" title=""> +SourceForge.net +</a> +</h1> +<ul class="jump"> +<li> +<a href="#content">Jump to main content</a> +</li> +</ul> +</div> +<div class="yui-u"> +<a href="https://sourceforge.net/support" title="Get help and support on SourceForge.net">Help +</a> +</div> +</div> +</div> +</div> + +<div id="finderBox"> + +<form action="/bzr/t12eksandbox/changes?start_revid=ctsai%40users.sourceforge.net-20110209191348-zkzbkuypzq1vncx9"> +<label>search:</label> +<input autocomplete="off" onblur="hide_search();" type="search" name="q" id="q" /> +</form> + +<div> + +<a href="/bzr/t12eksandbox/atom" title="RSS feed for /t12eksandbox"> +<img src="/bzr/static/images/ico_rss.gif" alt="RSS" class="rssfeed" /> +</a> + +</div> +</div> + +<ul id="menuTabs"> + +<li><a href="/bzr/t12eksandbox/changes" title="Changes" id="on">Changes</a></li> +<li><a href="/bzr/t12eksandbox/files" title="Files">Files</a></li> + + +</ul> + +<div id="loggerheadCont"> +<div id="search_terms"></div> + +<h1> + + +<span class="breadcrumb"> +<a href="/bzr/">(root)</a><span>/</span><a href="/bzr/t12eksandbox">t12eksandbox</a> +</span> + + +: changes + +from revision +<span>4</span> + + + +</h1> + +<div> +<div id="branch-info"> +To get this branch, use: <br /> +<code>bzr branch +http://t12eksandbox.bzr.sourceforge.net/bzr/t12eksandbox</code> +</div> + + + +<p class="fr revnolink">From Revision <a>4</a> +</p> +<p class="expand show_if_js" id="expand_all"><a href="#"> +<img src="/bzr/static/images/treeCollapsed.png" alt="expand all" /> expand all</a> +</p> +<p style="display:none;" class="expand" id="collapse_all"><a href="#"> +<img src="/bzr/static/images/treeExpanded.png" alt="collapse all" /> collapse all</a> +</p> + +<table id="logentries"> +<tr class="logheader"> +<td class="revisionnumber">Rev</td> +<td class="expandcell show_if_js"> </td> +<td class="summarycell">Summary</td> +<td class="authorcell">Authors</td> + +<td class="datecell">Date</td> +<td class="diffcell">Diff</td> +<td class="downloadcell">Files</td> +</tr> + +<a name="entry-4" /> +<tr class="blueRow0 revision_log" id="log-0"> +<td class="revnro revnolink"><a title="Show revision 4" href="/bzr/t12eksandbox/revision/4">4</a> +</td> +<td class="expcell show_if_js"> +<div class="expand_revisioninfo"> +<a href="#"> +<img src="/bzr/static/images/treeCollapsed.png" class="expand_icon" /> +</a> +</div> +</td> +<td class="summcell"> +<div class="short_description"> + +<a title="Show revision 4" href="/bzr/t12eksandbox/revision/4" class="link">Commit!</a> +</div> +<div style="display: none" class="long_description"> + +<a title="Show revision 4" href="/bzr/t12eksandbox/revision/4" class="link">Commit!<br/></a> +<div class="loading"> +<img src="/bzr/static/images/spinner.gif" /> +</div> +</div> +</td> +<td class="autcell">ctsai at sourceforge</td> + +<td class="date"> +<span title="2011-02-09 19:13:48">2011-02-09</span> +</td> +<td class="diffr"><a title="Show diff at revision 4" href="/bzr/t12eksandbox/revision/4"> +<img src="/bzr/static/images/ico_diff.gif" alt="Diff" /></a></td> +<td class="downr"><a href="/bzr/t12eksandbox/files/4" title="Files at revision 4"> +<img src="/bzr/static/images/ico_file.gif" alt="Files" /></a> +</td> +</tr> + +<a name="entry-3" /> +<tr class="blueRow1 revision_log" id="log-1"> +<td class="revnro revnolink"><a title="Show revision 3" href="/bzr/t12eksandbox/revision/3">3</a> +</td> +<td class="expcell show_if_js"> +<div class="expand_revisioninfo"> +<a href="#"> +<img src="/bzr/static/images/treeCollapsed.png" class="expand_icon" /> +</a> +</div> +</td> +<td class="summcell"> +<div class="short_description"> + +<a title="Show revision 3" href="/bzr/t12eksandbox/revision/3" class="link">fdsa</a> +</div> +<div style="display: none" class="long_description"> + +<a title="Show revision 3" href="/bzr/t12eksandbox/revision/3" class="link">fdsa<br/></a> +<div class="loading"> +<img src="/bzr/static/images/spinner.gif" /> +</div> +</div> +</td> +<td class="autcell">ctsai at sourceforge</td> + +<td class="date"> +<span title="2010-02-03 17:12:10">2010-02-03</span> +</td> +<td class="diffr"><a title="Show diff at revision 3" href="/bzr/t12eksandbox/revision/3"> +<img src="/bzr/static/images/ico_diff.gif" alt="Diff" /></a></td> +<td class="downr"><a href="/bzr/t12eksandbox/files/3" title="Files at revision 3"> +<img src="/bzr/static/images/ico_file.gif" alt="Files" /></a> +</td> +</tr> + +<a name="entry-2" /> +<tr class="blueRow0 revision_log" id="log-2"> +<td class="revnro revnolink"><a title="Show revision 2" href="/bzr/t12eksandbox/revision/2">2</a> +</td> +<td class="expcell show_if_js"> +<div class="expand_revisioninfo"> +<a href="#"> +<img src="/bzr/static/images/treeCollapsed.png" class="expand_icon" /> +</a> +</div> +</td> +<td class="summcell"> +<div class="short_description"> + +<a title="Show revision 2" href="/bzr/t12eksandbox/revision/2" class="link">fdsa</a> +</div> +<div style="display: none" class="long_description"> + +<a title="Show revision 2" href="/bzr/t12eksandbox/revision/2" class="link">fdsa<br/></a> +<div class="loading"> +<img src="/bzr/static/images/spinner.gif" /> +</div> +</div> +</td> +<td class="autcell">ctsai at sourceforge</td> + +<td class="date"> +<span title="2009-10-12 15:01:55">2009-10-12</span> +</td> +<td class="diffr"><a title="Show diff at revision 2" href="/bzr/t12eksandbox/revision/2"> +<img src="/bzr/static/images/ico_diff.gif" alt="Diff" /></a></td> +<td class="downr"><a href="/bzr/t12eksandbox/files/2" title="Files at revision 2"> +<img src="/bzr/static/images/ico_file.gif" alt="Files" /></a> +</td> +</tr> + +<a name="entry-1" /> +<tr class="blueRow1 revision_log" id="log-3"> +<td class="revnro revnolink"><a title="Show revision 1" href="/bzr/t12eksandbox/revision/1">1</a> +</td> +<td class="expcell show_if_js"> +<div class="expand_revisioninfo"> +<a href="#"> +<img src="/bzr/static/images/treeCollapsed.png" class="expand_icon" /> +</a> +</div> +</td> +<td class="summcell"> +<div class="short_description"> + +<a title="Show revision 1" href="/bzr/t12eksandbox/revision/1" class="link">Commit to test</a> +</div> +<div style="display: none" class="long_description"> + +<a title="Show revision 1" href="/bzr/t12eksandbox/revision/1" class="link">Commit to test<br/></a> +<div class="loading"> +<img src="/bzr/static/images/spinner.gif" /> +</div> +</div> +</td> +<td class="autcell">ctsai at sourceforge</td> + +<td class="date"> +<span title="2009-10-12 14:42:27">2009-10-12</span> +</td> +<td class="diffr"><a title="Show diff at revision 1" href="/bzr/t12eksandbox/revision/1"> +<img src="/bzr/static/images/ico_diff.gif" alt="Diff" /></a></td> +<td class="downr"><a href="/bzr/t12eksandbox/files/1" title="Files at revision 1"> +<img src="/bzr/static/images/ico_file.gif" alt="Files" /></a> +</td> +</tr> + +</table> + +</div> +<p class="fl">Loggerhead 1.18.1 is a web-based interface for <a href="http://bazaar-vcs.org/">Bazaar</a> branches</p> +</div> +<br /> +<hr /> +<div id="ft"> +<div class="yui-g divider"> +<div class="yui-u first copyright"> +©Copyright 2017 - +Slashdot Media. All Rights Reserved +</div> +<div class="yui-u"> +<a href="https://sourceforge.net/support">Help</a> +</div> +</div> +</div> +</body> +</html> \ No newline at end of file diff --git a/swh/lister/sourceforge/tests/data/t12eksandbox.json b/swh/lister/sourceforge/tests/data/t12eksandbox.json new file mode 100644 --- /dev/null +++ b/swh/lister/sourceforge/tests/data/t12eksandbox.json @@ -0,0 +1,292 @@ +{ + "shortname": "t12eksandbox", + "name": "t12ek sandbox", + "_id": "5304cd2634309d109fc1dec5", + "url": "https://sourceforge.net/p/t12eksandbox/", + "private": false, + "short_description": "Sandboxes are for playing in... Note: this is an SF.net staff's test project. Don't expect to find real files here. Update test!\r\nLine 2!\r\nupdate 2012-06-05", + "creation_date": "2009-07-14", + "summary": "", + "external_homepage": "http://t12eksandbox.sourceforge.net", + "video_url": "", + "socialnetworks": [], + "status": "active", + "moved_to_url": "", + "preferred_support_tool": "_url", + "preferred_support_url": "http://sourceforge.net/tracker/?func=add&group_id=269579&atid=1146768", + "developers": [ + { + "username": "sillygoose", + "name": "sillygoose", + "url": "https://sourceforge.net/u/sillygoose/" + }, + { + "username": "thimsmith", + "name": "Tim Siegel", + "url": "https://sourceforge.net/u/thimsmith/" + } + ], + "tools": [ + { + "name": "reviews", + "mount_point": "reviews", + "url": "/p/t12eksandbox/reviews/", + "icons": { + "24": "images/sftheme/24x24/blog_24.png", + "32": "images/sftheme/32x32/blog_32.png", + "48": "images/sftheme/48x48/blog_48.png" + }, + "installable": false, + "tool_label": "Reviews", + "mount_label": "Reviews" + }, + { + "name": "summary", + "mount_point": "summary", + "url": "/p/t12eksandbox/summary/", + "icons": { + "24": "images/sftheme/24x24/blog_24.png", + "32": "images/sftheme/32x32/blog_32.png", + "48": "images/sftheme/48x48/blog_48.png" + }, + "installable": false, + "tool_label": "Summary", + "mount_label": "Summary", + "sourceforge_group_id": 269579 + }, + { + "name": "mailman", + "mount_point": "mailman", + "url": "/p/t12eksandbox/mailman/", + "icons": { + "24": "images/forums_24.png", + "32": "images/forums_32.png", + "48": "images/forums_48.png" + }, + "installable": false, + "tool_label": "Mailing Lists", + "mount_label": "Mailing Lists" + }, + { + "name": "support", + "mount_point": "support", + "url": "/p/t12eksandbox/support/", + "icons": { + "24": "images/sftheme/24x24/blog_24.png", + "32": "images/sftheme/32x32/blog_32.png", + "48": "images/sftheme/48x48/blog_48.png" + }, + "installable": false, + "tool_label": "Support", + "mount_label": "Support" + }, + { + "name": "files-sf", + "mount_point": "files", + "url": "/p/t12eksandbox/files/", + "icons": { + "24": "images/downloads_24.png", + "32": "images/downloads_32.png", + "48": "images/downloads_48.png" + }, + "installable": false, + "tool_label": "Files", + "mount_label": "Files" + }, + { + "name": "wiki", + "mount_point": "wiki", + "url": "/p/t12eksandbox/wiki/", + "icons": { + "24": "images/wiki_24.png", + "32": "images/wiki_32.png", + "48": "images/wiki_48.png" + }, + "installable": true, + "tool_label": "Wiki", + "mount_label": "Wiki" + }, + { + "name": "blog", + "mount_point": "news", + "url": "/p/t12eksandbox/news/", + "icons": { + "24": "images/blog_24.png", + "32": "images/blog_32.png", + "48": "images/blog_48.png" + }, + "installable": true, + "tool_label": "Blog", + "mount_label": "News" + }, + { + "name": "bzr", + "mount_point": "bazaar", + "url": "/p/t12eksandbox/bazaar/", + "icons": { + "24": "images/code_24.png", + "32": "images/code_32.png", + "48": "images/code_48.png" + }, + "installable": false, + "tool_label": "BZR", + "mount_label": "Bazaar" + }, + { + "name": "discussion", + "mount_point": "discussion", + "url": "/p/t12eksandbox/discussion/", + "icons": { + "24": "images/forums_24.png", + "32": "images/forums_32.png", + "48": "images/forums_48.png" + }, + "installable": true, + "tool_label": "Discussion", + "mount_label": "Discussion" + }, + { + "name": "tickets", + "mount_point": "support-requests", + "url": "/p/t12eksandbox/support-requests/", + "icons": { + "24": "images/tickets_24.png", + "32": "images/tickets_32.png", + "48": "images/tickets_48.png" + }, + "installable": true, + "tool_label": "Tickets", + "mount_label": "Support Requests" + }, + { + "name": "tickets", + "mount_point": "feature-requests", + "url": "/p/t12eksandbox/feature-requests/", + "icons": { + "24": "images/tickets_24.png", + "32": "images/tickets_32.png", + "48": "images/tickets_48.png" + }, + "installable": true, + "tool_label": "Tickets", + "mount_label": "Feature Requests" + }, + { + "name": "link", + "mount_point": "donate", + "url": "/p/t12eksandbox/donate/", + "icons": { + "24": "images/ext_24.png", + "32": "images/ext_32.png", + "48": "images/ext_48.png" + }, + "installable": true, + "tool_label": "External Link", + "mount_label": "Donate" + }, + { + "name": "tickets", + "mount_point": "patches", + "url": "/p/t12eksandbox/patches/", + "icons": { + "24": "images/tickets_24.png", + "32": "images/tickets_32.png", + "48": "images/tickets_48.png" + }, + "installable": true, + "tool_label": "Tickets", + "mount_label": "Patches" + }, + { + "name": "tickets", + "mount_point": "bugs", + "url": "/p/t12eksandbox/bugs/", + "icons": { + "24": "images/tickets_24.png", + "32": "images/tickets_32.png", + "48": "images/tickets_48.png" + }, + "installable": true, + "tool_label": "Tickets", + "mount_label": "Bugs" + }, + { + "name": "activity", + "mount_point": "activity", + "url": "/p/t12eksandbox/activity/", + "icons": { + "24": "images/admin_24.png", + "32": "images/admin_32.png", + "48": "images/admin_48.png" + }, + "installable": false, + "tool_label": "Tool", + "mount_label": "Activity" + } + ], + "labels": [], + "categories": { + "audience": [], + "developmentstatus": [ + { + "id": 10, + "shortname": "beta", + "fullname": "4 - Beta", + "fullpath": "Development Status :: 4 - Beta" + }, + { + "id": 7, + "shortname": "planning", + "fullname": "1 - Planning", + "fullpath": "Development Status :: 1 - Planning" + } + ], + "environment": [], + "language": [], + "license": [ + { + "id": 196, + "shortname": "other", + "fullname": "Other License", + "fullpath": "License :: Other License" + } + ], + "translation": [], + "os": [], + "database": [ + { + "id": 524, + "shortname": "db_net_mysql", + "fullname": "MySQL", + "fullpath": "Database Environment :: Network-based DBMS :: MySQL" + } + ], + "topic": [ + { + "id": 575, + "shortname": "testing", + "fullname": "Testing", + "fullpath": "Topic :: Software Development :: Testing" + }, + { + "id": 97, + "shortname": "scientific", + "fullname": "Scientific/Engineering", + "fullpath": "Topic :: Scientific/Engineering" + } + ] + }, + "icon_url": null, + "screenshots": [ + { + "url": "https://sourceforge.net/p/t12eksandbox/screenshot/224498.jpg", + "thumbnail_url": "https://sourceforge.net/p/t12eksandbox/screenshot/224498.jpg/thumb", + "caption": "aimage2" + }, + { + "url": "https://sourceforge.net/p/t12eksandbox/screenshot/224496.jpg", + "thumbnail_url": "https://sourceforge.net/p/t12eksandbox/screenshot/224496.jpg/thumb", + "caption": "3Kimage3" + } + ] +} \ No newline at end of file diff --git a/swh/lister/sourceforge/tests/test_lister.py b/swh/lister/sourceforge/tests/test_lister.py --- a/swh/lister/sourceforge/tests/test_lister.py +++ b/swh/lister/sourceforge/tests/test_lister.py @@ -35,6 +35,8 @@ "mramm": "p", "os3dmodels": "p", "random-mercurial": "p", + "t12eksandbox": "p", + "ocaml-lpd": "p", } URLS_MATCHER = { @@ -67,6 +69,10 @@ return Path(datadir, "aaron.html").read_text() +def get_bzr_repo_page(datadir, repo_name): + return Path(datadir, f"{repo_name}.html").read_text() + + def _check_request_headers(request): return request.headers.get("User-Agent") == USER_AGENT @@ -85,7 +91,14 @@ "https://git.code.sf.net/p/mojunk/git2": ("git", "2017-12-31"), "https://svn.code.sf.net/p/mojunk/svn": ("svn", "2017-12-31"), "http://hg.code.sf.net/p/random-mercurial/hg": ("hg", "2019-05-02"), - "http://bzr-repo.bzr.sourceforge.net/bzrroot/bzr-repo": ("bzr", "2021-01-27"), + "http://t12eksandbox.bzr.sourceforge.net/bzr/t12eksandbox": ( + "bzr", + "2011-02-09", + ), + "http://ocaml-lpd.bzr.sourceforge.net/bzr/ocaml-lpd/trunk": ( + "bzr", + "2011-04-17", + ), "rsync://a.cvs.sourceforge.net/cvsroot/aaron/aaron": ("cvs", "2013-03-07"), "rsync://a.cvs.sourceforge.net/cvsroot/aaron/www": ("cvs", "2013-03-07"), } @@ -126,6 +139,16 @@ text=get_cvs_info_page(datadir), additional_matcher=_check_request_headers, ) + requests_mock.get( + re.compile("http://t12eksandbox.bzr.sourceforge.net/bzr/t12eksandbox"), + text=get_bzr_repo_page(datadir, "t12eksandbox"), + additional_matcher=_check_request_headers, + ) + requests_mock.get( + re.compile("http://ocaml-lpd.bzr.sourceforge.net/bzr/ocaml-lpd"), + text=get_bzr_repo_page(datadir, "ocaml-lpd"), + additional_matcher=_check_request_headers, + ) stats = lister.run() # - os3dmodels (2 repos), @@ -133,10 +156,11 @@ # - mojunk (3 repos), # - backapps/website (1 repo), # - random-mercurial (1 repo). - # - bzr-repo (1 repo). + # - t12eksandbox (1 repo). + # - ocaml-lpd (1 repo). # adobe and backapps itself have no repos. - assert stats.pages == 7 - assert stats.origins == 13 + assert stats.pages == 8 + assert stats.origins == 14 expected_state = { "subsitemap_last_modified": { "https://sourceforge.net/allura_sitemap/sitemap-0.xml": "2021-03-18", @@ -196,6 +220,18 @@ additional_matcher=_check_request_headers, ) + requests_mock.get( + re.compile("http://t12eksandbox.bzr.sourceforge.net/bzr/t12eksandbox"), + text=get_bzr_repo_page(datadir, "t12eksandbox"), + additional_matcher=_check_request_headers, + ) + + requests_mock.get( + re.compile("http://ocaml-lpd.bzr.sourceforge.net/bzr/ocaml-lpd"), + text=get_bzr_repo_page(datadir, "ocaml-lpd"), + additional_matcher=_check_request_headers, + ) + faked_listed_origins = [ # mramm: changed ListedOrigin( @@ -263,8 +299,14 @@ ListedOrigin( lister_id=lister.lister_obj.id, visit_type="bzr", - url="http://bzr-repo.bzr.sourceforge.net/bzrroot/bzr-repo", - last_update=iso8601.parse_date("2021-01-27"), + url="http://t12eksandbox.bzr.sourceforge.net/bzr/t12eksandbox", + last_update=iso8601.parse_date("2011-02-09"), + ), + ListedOrigin( + lister_id=lister.lister_obj.id, + visit_type="bzr", + url="http://ocaml-lpd.bzr.sourceforge.net/bzr/ocaml-lpd/trunk", + last_update=iso8601.parse_date("2011-04-17"), ), ListedOrigin( lister_id=lister.lister_obj.id, @@ -359,16 +401,29 @@ additional_matcher=_check_request_headers, ) + requests_mock.get( + re.compile("http://t12eksandbox.bzr.sourceforge.net/bzr/t12eksandbox"), + text=get_bzr_repo_page(datadir, "t12eksandbox"), + additional_matcher=_check_request_headers, + ) + + requests_mock.get( + re.compile("http://ocaml-lpd.bzr.sourceforge.net/bzr/ocaml-lpd"), + text=get_bzr_repo_page(datadir, "ocaml-lpd"), + additional_matcher=_check_request_headers, + ) + stats = lister.run() # - os3dmodels (2 repos), # - mramm (3 repos), # - mojunk (3 repos), # - backapps/website (1 repo), # - random-mercurial (1 repo). - # - bzr-repo (1 repo). + # - t12eksandbox (1 repo). + # - ocaml-lpd (1 repo). # adobe and backapps itself have no repos. - assert stats.pages == 7 - assert stats.origins == 13 + assert stats.pages == 8 + assert stats.origins == 14 _check_listed_origins(lister, swh_scheduler) @@ -426,6 +481,16 @@ json=functools.partial(get_project_json, datadir), additional_matcher=_check_request_headers, ) + requests_mock.get( + re.compile("http://t12eksandbox.bzr.sourceforge.net/bzr/t12eksandbox"), + text=get_bzr_repo_page(datadir, "t12eksandbox"), + additional_matcher=_check_request_headers, + ) + requests_mock.get( + re.compile("http://ocaml-lpd.bzr.sourceforge.net/bzr/ocaml-lpd"), + text=get_bzr_repo_page(datadir, "ocaml-lpd"), + additional_matcher=_check_request_headers, + ) # Make all `mramm` requests fail # `mramm` is in subsitemap 0, which ensures we keep listing after an error. requests_mock.get( @@ -442,11 +507,12 @@ # - mojunk (3 repos), # - backapps/website (1 repo), # - random-mercurial (1 repo). - # - bzr-repo (1 repo). + # - t12eksandbox (1 repo). + # - ocaml-lpd (1 repo). # adobe and backapps itself have no repos. # Did *not* list mramm - assert stats.pages == 5 - assert stats.origins == 8 + assert stats.pages == 6 + assert stats.origins == 9 scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results res = {o.url: (o.visit_type, str(o.last_update.date())) for o in scheduler_origins} @@ -459,5 +525,12 @@ "https://git.code.sf.net/p/mojunk/git2": ("git", "2017-12-31"), "https://svn.code.sf.net/p/mojunk/svn": ("svn", "2017-12-31"), "http://hg.code.sf.net/p/random-mercurial/hg": ("hg", "2019-05-02"), - "http://bzr-repo.bzr.sourceforge.net/bzrroot/bzr-repo": ("bzr", "2021-01-27"), + "http://t12eksandbox.bzr.sourceforge.net/bzr/t12eksandbox": ( + "bzr", + "2011-02-09", + ), + "http://ocaml-lpd.bzr.sourceforge.net/bzr/ocaml-lpd/trunk": ( + "bzr", + "2011-04-17", + ), }