diff --git a/swh/lister/sourceforge/lister.py b/swh/lister/sourceforge/lister.py --- a/swh/lister/sourceforge/lister.py +++ b/swh/lister/sourceforge/lister.py @@ -10,6 +10,7 @@ from typing import Any, Dict, Iterator, List, Optional, Set, Tuple from xml.etree import ElementTree +from bs4 import BeautifulSoup import iso8601 import requests from tenacity.before_sleep import before_sleep_log @@ -360,6 +361,35 @@ tool_name = tool["name"] if tool_name not in VCS_NAMES: continue + if tool_name == VcsNames.CVS.value: + # CVS projects are different from other VCS ones, they use the rsync + # protocol, a list of modules needs to be fetched from an info page + # and multiple origin URLs can be produced for a same project. + cvs_info_url = f"http://{project}.cvs.sourceforge.net" + try: + response = self.page_request(cvs_info_url, params={}) + except requests.HTTPError: + logger.warning( + "CVS info page could not be fetched, skipping project '%s'", + project, + ) + continue + else: + bs = BeautifulSoup(response.text, features="html.parser") + cvs_base_url = "rsync://a.cvs.sourceforge.net/cvsroot" + for text in [b.text for b in bs.find_all("b")]: + match = re.search(fr".*/cvsroot/{project} co -P (.+)", text) + if match is not None: + module = match.group(1) + url = f"{cvs_base_url}/{project}/{module}" + hits.append( + SourceForgeListerEntry( + vcs=VcsNames(tool_name), + url=url, + last_modified=last_modified, + ) + ) + continue url = CLONE_URL_FORMAT.format( vcs=tool_name, namespace=namespace, diff --git a/swh/lister/sourceforge/tests/data/aaron.html b/swh/lister/sourceforge/tests/data/aaron.html new file mode 100644 --- /dev/null +++ b/swh/lister/sourceforge/tests/data/aaron.html @@ -0,0 +1,23 @@ + + + + + + CVS Info for project aaron + + + + + + +

The aaron project's CVS data is in read-only mode, so the project may have switched over to another source-code-management system. To check, visit the Project Summary Page for aaron and see if the menubar lists a newer code repository, such as SVN or Git. + +

The CVS data can be accessed as follows. +You can run a per-module CVS checkout via pserver protocol: +

  • cvs -z3 -d:pserver:anonymous@a.cvs.sourceforge.net:/cvsroot/aaron co -P aaron
  • +
  • cvs -z3 -d:pserver:anonymous@a.cvs.sourceforge.net:/cvsroot/aaron co -P www
  • +

    You can view a list of files or copy all the CVS repository data via rsync (the 1st command lists the files, the 2nd copies): +

  • rsync -a a.cvs.sourceforge.net::cvsroot/aaron/
  • +
  • rsync -ai a.cvs.sourceforge.net::cvsroot/aaron/ /my/local/dest/dir/
  • + +

    If you are a project admin for aaron, you can request that this page redirect to another repo on your project by submitting a support request. diff --git a/swh/lister/sourceforge/tests/data/aaron.json b/swh/lister/sourceforge/tests/data/aaron.json new file mode 100644 --- /dev/null +++ b/swh/lister/sourceforge/tests/data/aaron.json @@ -0,0 +1,236 @@ +{ + "shortname": "aaron", + "name": "Aaron: the app, service, and net monitor", + "_id": "5139010d5fcbc97960fd66bb", + "url": "https://sourceforge.net/p/aaron/", + "private": false, + "short_description": "Aaron is an application, service, and network availability monitoring and alert daemon. Notification of unavailable services, networks, etc., levels is sent to the appropriate roles. Aaron is highly customizable enterprise class monitoring software.", + "creation_date": "2001-06-24", + "summary": "", + "external_homepage": "http://aaron.sourceforge.net", + "video_url": "", + "socialnetworks": [], + "status": "active", + "moved_to_url": "", + "preferred_support_tool": "", + "preferred_support_url": "", + "developers": [ + { + "username": "kapelmeister", + "name": "Steve Nickels", + "url": "https://sourceforge.net/u/kapelmeister/" + }, + { + "username": "thetitan", + "name": "Sean Chittenden", + "url": "https://sourceforge.net/u/thetitan/" + }, + { + "username": "stwalker", + "name": "Scott Walker", + "url": "https://sourceforge.net/u/stwalker/" + } + ], + "tools": [ + { + "name": "support", + "mount_point": "support", + "url": "/p/aaron/support/", + "icons": { + "24": "images/sftheme/24x24/blog_24.png", + "32": "images/sftheme/32x32/blog_32.png", + "48": "images/sftheme/48x48/blog_48.png" + }, + "installable": false, + "tool_label": "Support", + "mount_label": "Support" + }, + { + "name": "mailman", + "mount_point": "mailman", + "url": "/p/aaron/mailman/", + "icons": { + "24": "images/forums_24.png", + "32": "images/forums_32.png", + "48": "images/forums_48.png" + }, + "installable": false, + "tool_label": "Mailing Lists", + "mount_label": "Mailing Lists" + }, + { + "name": "reviews", + "mount_point": "reviews", + "url": "/p/aaron/reviews/", + "icons": { + "24": "images/sftheme/24x24/blog_24.png", + "32": "images/sftheme/32x32/blog_32.png", + "48": "images/sftheme/48x48/blog_48.png" + }, + "installable": false, + "tool_label": "Reviews", + "mount_label": "Reviews" + }, + { + "name": "wiki", + "mount_point": "wiki", + "url": "/p/aaron/wiki/", + "icons": { + "24": "images/wiki_24.png", + "32": "images/wiki_32.png", + "48": "images/wiki_48.png" + }, + "installable": true, + "tool_label": "Wiki", + "mount_label": "Wiki" + }, + { + "name": "summary", + "mount_point": "summary", + "url": "/p/aaron/summary/", + "icons": { + "24": "images/sftheme/24x24/blog_24.png", + "32": "images/sftheme/32x32/blog_32.png", + "48": "images/sftheme/48x48/blog_48.png" + }, + "installable": false, + "tool_label": "Summary", + "mount_label": "Summary", + "sourceforge_group_id": 29993 + }, + { + "name": "files-sf", + "mount_point": "files", + "url": "/p/aaron/files/", + "icons": { + "24": "images/downloads_24.png", + "32": "images/downloads_32.png", + "48": "images/downloads_48.png" + }, + "installable": false, + "tool_label": "Files", + "mount_label": "Files" + }, + { + "name": "cvs", + "mount_point": "code", + "url": "/p/aaron/code/", + "icons": { + "24": "images/code_24.png", + "32": "images/code_32.png", + "48": "images/code_48.png" + }, + "installable": false, + "tool_label": "CVS", + "mount_label": "Code" + }, + { + "name": "activity", + "mount_point": "activity", + "url": "/p/aaron/activity/", + "icons": { + "24": "images/admin_24.png", + "32": "images/admin_32.png", + "48": "images/admin_48.png" + }, + "installable": false, + "tool_label": "Tool", + "mount_label": "Activity" + }, + { + "name": "discussion", + "mount_point": "discussion", + "url": "/p/aaron/discussion/", + "icons": { + "24": "images/forums_24.png", + "32": "images/forums_32.png", + "48": "images/forums_48.png" + }, + "installable": true, + "tool_label": "Discussion", + "mount_label": "Discussion" + } + ], + "labels": [], + "categories": { + "audience": [ + { + "id": 4, + "shortname": "sysadmins", + "fullname": "System Administrators", + "fullpath": "Intended Audience :: by End-User Class :: System Administrators" + } + ], + "developmentstatus": [ + { + "id": 8, + "shortname": "prealpha", + "fullname": "2 - Pre-Alpha", + "fullpath": "Development Status :: 2 - Pre-Alpha" + }, + { + "id": 7, + "shortname": "planning", + "fullname": "1 - Planning", + "fullpath": "Development Status :: 1 - Planning" + } + ], + "environment": [ + { + "id": 238, + "shortname": "daemon", + "fullname": "Non-interactive (Daemon)", + "fullpath": "User Interface :: Non-interactive (Daemon)" + } + ], + "language": [ + { + "id": 164, + "shortname": "c", + "fullname": "C", + "fullpath": "Programming Language :: C" + }, + { + "id": 293, + "shortname": "ruby", + "fullname": "Ruby", + "fullpath": "Programming Language :: Ruby" + } + ], + "license": [ + { + "id": 296, + "shortname": "apache", + "fullname": "Apache Software License", + "fullpath": "License :: OSI-Approved Open Source :: Apache Software License" + } + ], + "translation": [ + { + "id": 275, + "shortname": "english", + "fullname": "English", + "fullpath": "Translations :: English" + } + ], + "os": [ + { + "id": 235, + "shortname": "independent", + "fullname": "OS Independent (Written in an interpreted language)", + "fullpath": "Operating System :: Grouping and Descriptive Categories :: OS Independent (Written in an interpreted language)" + } + ], + "database": [], + "topic": [ + { + "id": 152, + "shortname": "monitoring", + "fullname": "Monitoring", + "fullpath": "Topic :: System :: Networking :: Monitoring" + } + ] + }, + "icon_url": null, + "screenshots": [] +} \ No newline at end of file diff --git a/swh/lister/sourceforge/tests/data/subsitemap-0.xml b/swh/lister/sourceforge/tests/data/subsitemap-0.xml --- a/swh/lister/sourceforge/tests/data/subsitemap-0.xml +++ b/swh/lister/sourceforge/tests/data/subsitemap-0.xml @@ -1,5 +1,20 @@ + + https://sourceforge.net/projects/aaron/files/ + 2013-03-07 + daily + + + https://sourceforge.net/p/aaron/home/ + 2013-03-07 + daily + + + https://sourceforge.net/p/aaron/tickets/ + 2013-03-07 + daily + https://sourceforge.net/projects/os3dmodels/files/ 2017-03-31 diff --git a/swh/lister/sourceforge/tests/test_lister.py b/swh/lister/sourceforge/tests/test_lister.py --- a/swh/lister/sourceforge/tests/test_lister.py +++ b/swh/lister/sourceforge/tests/test_lister.py @@ -26,6 +26,7 @@ from swh.scheduler.model import ListedOrigin TEST_PROJECTS = { + "aaron": "p", "adobexmp": "adobe", "backapps": "p", "backapps/website": "p", @@ -62,6 +63,10 @@ return json.loads(Path(datadir, f"{project}.json").read_text()) +def get_cvs_info_page(datadir): + return Path(datadir, "aaron.html").read_text() + + def _check_request_headers(request): return request.headers.get("User-Agent") == USER_AGENT @@ -81,6 +86,8 @@ "https://svn.code.sf.net/p/mojunk/svn": ("svn", "2017-12-31"), "http://hg.code.sf.net/p/random-mercurial/hg": ("hg", "2019-05-02"), "http://bzr-repo.bzr.sourceforge.net/bzrroot/bzr-repo": ("bzr", "2021-01-27"), + "rsync://a.cvs.sourceforge.net/cvsroot/aaron/aaron": ("cvs", "2013-03-07"), + "rsync://a.cvs.sourceforge.net/cvsroot/aaron/www": ("cvs", "2013-03-07"), } @@ -114,6 +121,11 @@ json=functools.partial(get_project_json, datadir), additional_matcher=_check_request_headers, ) + requests_mock.get( + re.compile("http://aaron.cvs.sourceforge.net/"), + text=get_cvs_info_page(datadir), + additional_matcher=_check_request_headers, + ) stats = lister.run() # - os3dmodels (2 repos), @@ -123,8 +135,8 @@ # - random-mercurial (1 repo). # - bzr-repo (1 repo). # adobe and backapps itself have no repos. - assert stats.pages == 6 - assert stats.origins == 11 + assert stats.pages == 7 + assert stats.origins == 13 expected_state = { "subsitemap_last_modified": { "https://sourceforge.net/allura_sitemap/sitemap-0.xml": "2021-03-18", @@ -178,6 +190,12 @@ additional_matcher=_check_request_headers, ) + requests_mock.get( + re.compile("http://aaron.cvs.sourceforge.net/"), + text=get_cvs_info_page(datadir), + additional_matcher=_check_request_headers, + ) + faked_listed_origins = [ # mramm: changed ListedOrigin( @@ -272,8 +290,8 @@ stats = lister.run() # - mramm (3 repos), # changed - assert stats.pages == 1 - assert stats.origins == 3 + assert stats.pages == 2 + assert stats.origins == 5 expected_state = { "subsitemap_last_modified": { "https://sourceforge.net/allura_sitemap/sitemap-0.xml": "2021-03-18", @@ -322,6 +340,12 @@ additional_matcher=_check_request_headers, ) + requests_mock.get( + re.compile("http://aaron.cvs.sourceforge.net/"), + text=get_cvs_info_page(datadir), + additional_matcher=_check_request_headers, + ) + stats = lister.run() # - os3dmodels (2 repos), # - mramm (3 repos), @@ -330,23 +354,10 @@ # - random-mercurial (1 repo). # - bzr-repo (1 repo). # adobe and backapps itself have no repos. - assert stats.pages == 6 - assert stats.origins == 11 + assert stats.pages == 7 + assert stats.origins == 13 - scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results - assert {o.url: o.visit_type for o in scheduler_origins} == { - "https://svn.code.sf.net/p/backapps/website/code": "svn", - "https://git.code.sf.net/p/os3dmodels/git": "git", - "https://svn.code.sf.net/p/os3dmodels/svn": "svn", - "https://git.code.sf.net/p/mramm/files": "git", - "https://git.code.sf.net/p/mramm/git": "git", - "https://svn.code.sf.net/p/mramm/svn": "svn", - "https://git.code.sf.net/p/mojunk/git": "git", - "https://git.code.sf.net/p/mojunk/git2": "git", - "https://svn.code.sf.net/p/mojunk/svn": "svn", - "http://hg.code.sf.net/p/random-mercurial/hg": "hg", - "http://bzr-repo.bzr.sourceforge.net/bzrroot/bzr-repo": "bzr", - } + _check_listed_origins(lister, swh_scheduler) # Test `time.sleep` is called with exponential retries assert_sleep_calls(mocker, mocked_sleep, [1, WAIT_EXP_BASE, 1, 1]) @@ -408,6 +419,11 @@ re.compile("https://sourceforge.net/rest/p/mramm"), status_code=status_code ) + # Make request to CVS info page fail + requests_mock.get( + re.compile("http://aaron.cvs.sourceforge.net/"), status_code=status_code + ) + stats = lister.run() # - os3dmodels (2 repos), # - mojunk (3 repos),