diff --git a/swh/lister/sourceforge/lister.py b/swh/lister/sourceforge/lister.py --- a/swh/lister/sourceforge/lister.py +++ b/swh/lister/sourceforge/lister.py @@ -84,6 +84,9 @@ # Predictable URL for cloning (in the broad sense) a VCS registered for the project. # +# Warning: does not apply to bzr repos, and Mercurial are http only, see use of this +# constant below. +# # `vcs`: VCS type, one of `VCS_NAMES` # `namespace`: Project namespace. Very often `p`, but can be something else like # `adobe`. @@ -170,13 +173,24 @@ url_match = re.compile( r".*\.code\.sf\.net/(?P[^/]+)/(?P.+)/.*" ) + bzr_url_match = re.compile( + r"http://(?P[^/]+).bzr.sourceforge.net/bzrroot/([^/]+)" + ) + for origin in stream: url = origin.url match = url_match.match(url) - assert match is not None - matches = match.groupdict() - namespace = matches["namespace"] - project = matches["project"] + if match is None: + # Should be a bzr special endpoint + match = bzr_url_match.match(url) + assert match is not None + matches = match.groupdict() + project = matches["project"] + namespace = "p" # no special namespacing for bzr projects + else: + matches = match.groupdict() + namespace = matches["namespace"] + project = matches["project"] # "Last modified" dates are the same across all VCS (tools, even) # within a project or subproject. An assertion here would be overkill. last_modified = origin.last_update @@ -356,6 +370,11 @@ # SourceForge does not yet support anonymous HTTPS cloning for Mercurial # See https://sourceforge.net/p/forge/feature-requests/727/ url = url.replace("https://", "http://") + if tool_name == VcsNames.BAZAAR.value: + # SourceForge has removed support for bzr and only keeps legacy projects + # around at a separate (also not https) URL. Bzr projects are very rare + # and a lot of them are 404 now. + url = f"http://{project}.bzr.sourceforge.net/bzrroot/{project}" entry = SourceForgeListerEntry( vcs=VcsNames(tool_name), url=url, last_modified=last_modified ) diff --git a/swh/lister/sourceforge/tests/data/bzr-repo.json b/swh/lister/sourceforge/tests/data/bzr-repo.json new file mode 100644 --- /dev/null +++ b/swh/lister/sourceforge/tests/data/bzr-repo.json @@ -0,0 +1,53 @@ +{ + "shortname": "bzr-repo", + "name": "Bazaar repo", + "_id": "4bf3fc291be1ce2f10000052", + "url": "https://sourceforge.net/p/bzr-repo/", + "private": false, + "short_description": "This is an example bzr project", + "creation_date": "2009-10-10", + "summary": "", + "external_homepage": "", + "video_url": "", + "socialnetworks": [], + "status": "active", + "moved_to_url": "", + "preferred_support_tool": "", + "preferred_support_url": "", + "developers": [ + { + "username": "Alphare", + "name": "Raphaël Gomès", + "url": "https://sourceforge.net/u/alphare/" + } + ], + "tools": [ + { + "name": "bzr", + "mount_point": "bzr", + "url": "/p/bzr-repo/bazaar/", + "icons": { + "24": "images/code_24.png", + "32": "images/code_32.png", + "48": "images/code_48.png" + }, + "installable": true, + "tool_label": "Bazaar", + "mount_label": "Bazaar" + } + ], + "labels": [], + "categories": { + "audience": [], + "developmentstatus": [], + "environment": [], + "language": [], + "license": [], + "translation": [], + "os": [], + "database": [], + "topic": [] + }, + "icon_url": null, + "screenshots": [] + } diff --git a/swh/lister/sourceforge/tests/data/subsitemap-1.xml b/swh/lister/sourceforge/tests/data/subsitemap-1.xml --- a/swh/lister/sourceforge/tests/data/subsitemap-1.xml +++ b/swh/lister/sourceforge/tests/data/subsitemap-1.xml @@ -40,4 +40,9 @@ 2019-05-02 daily + + https://sourceforge.net/p/bzr-repo/ + 2021-01-27 + daily + diff --git a/swh/lister/sourceforge/tests/test_lister.py b/swh/lister/sourceforge/tests/test_lister.py --- a/swh/lister/sourceforge/tests/test_lister.py +++ b/swh/lister/sourceforge/tests/test_lister.py @@ -29,6 +29,7 @@ "adobexmp": "adobe", "backapps": "p", "backapps/website": "p", + "bzr-repo": "p", "mojunk": "p", "mramm": "p", "os3dmodels": "p", @@ -79,6 +80,7 @@ "https://git.code.sf.net/p/mojunk/git2": ("git", "2017-12-31"), "https://svn.code.sf.net/p/mojunk/svn": ("svn", "2017-12-31"), "http://hg.code.sf.net/p/random-mercurial/hg": ("hg", "2019-05-02"), + "http://bzr-repo.bzr.sourceforge.net/bzrroot/bzr-repo": ("bzr", "2021-01-27"), } @@ -119,9 +121,10 @@ # - mojunk (3 repos), # - backapps/website (1 repo), # - random-mercurial (1 repo). + # - bzr-repo (1 repo). # adobe and backapps itself have no repos. - assert stats.pages == 5 - assert stats.origins == 10 + assert stats.pages == 6 + assert stats.origins == 11 expected_state = { "subsitemap_last_modified": { "https://sourceforge.net/allura_sitemap/sitemap-0.xml": "2021-03-18", @@ -239,6 +242,12 @@ url="http://hg.code.sf.net/p/random-mercurial/hg", last_update=iso8601.parse_date("2019-05-02"), ), + ListedOrigin( + lister_id=lister.lister_obj.id, + visit_type="bzr", + url="http://bzr-repo.bzr.sourceforge.net/bzrroot/bzr-repo", + last_update=iso8601.parse_date("2021-01-27"), + ), ] swh_scheduler.record_listed_origins(faked_listed_origins) @@ -319,9 +328,10 @@ # - mojunk (3 repos), # - backapps/website (1 repo), # - random-mercurial (1 repo). + # - bzr-repo (1 repo). # adobe and backapps itself have no repos. - assert stats.pages == 5 - assert stats.origins == 10 + assert stats.pages == 6 + assert stats.origins == 11 scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results assert {o.url: o.visit_type for o in scheduler_origins} == { @@ -335,6 +345,7 @@ "https://git.code.sf.net/p/mojunk/git2": "git", "https://svn.code.sf.net/p/mojunk/svn": "svn", "http://hg.code.sf.net/p/random-mercurial/hg": "hg", + "http://bzr-repo.bzr.sourceforge.net/bzrroot/bzr-repo": "bzr", } # Test `time.sleep` is called with exponential retries @@ -402,10 +413,11 @@ # - mojunk (3 repos), # - backapps/website (1 repo), # - random-mercurial (1 repo). + # - bzr-repo (1 repo). # adobe and backapps itself have no repos. # Did *not* list mramm - assert stats.pages == 4 - assert stats.origins == 7 + assert stats.pages == 5 + assert stats.origins == 8 scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results res = {o.url: (o.visit_type, str(o.last_update.date())) for o in scheduler_origins} @@ -418,4 +430,5 @@ "https://git.code.sf.net/p/mojunk/git2": ("git", "2017-12-31"), "https://svn.code.sf.net/p/mojunk/svn": ("svn", "2017-12-31"), "http://hg.code.sf.net/p/random-mercurial/hg": ("hg", "2019-05-02"), + "http://bzr-repo.bzr.sourceforge.net/bzrroot/bzr-repo": ("bzr", "2021-01-27"), }