diff --git a/setup.py b/setup.py --- a/setup.py +++ b/setup.py @@ -68,6 +68,7 @@ lister.packagist=swh.lister.packagist:register lister.phabricator=swh.lister.phabricator:register lister.pypi=swh.lister.pypi:register + lister.sourceforge=swh.lister.sourceforge:register """, classifiers=[ "Programming Language :: Python :: 3", diff --git a/swh/lister/sourceforge/__init__.py b/swh/lister/sourceforge/__init__.py new file mode 100644 --- /dev/null +++ b/swh/lister/sourceforge/__init__.py @@ -0,0 +1,12 @@ +# Copyright (C) 2021 the Software Heritage developers +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +def register(): + from .lister import SourceForgeLister + + return { + "lister": SourceForgeLister, + "task_modules": ["%s.tasks" % __name__], + } diff --git a/swh/lister/sourceforge/lister.py b/swh/lister/sourceforge/lister.py new file mode 100644 --- /dev/null +++ b/swh/lister/sourceforge/lister.py @@ -0,0 +1,224 @@ +# Copyright (C) 2021 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information +from dataclasses import dataclass +import datetime +from enum import Enum +import logging +import re +from typing import Iterator, List, Set +from xml.etree import ElementTree + +import iso8601 +import requests +from tenacity.before_sleep import before_sleep_log + +from swh.lister.utils import throttling_retry +from swh.scheduler.interface import SchedulerInterface +from swh.scheduler.model import ListedOrigin + +from .. 
import USER_AGENT +from ..pattern import StatelessLister + +logger = logging.getLogger(__name__) + + +class VcsNames(Enum): + """Used to filter SourceForge tool names for valid VCS types""" + + # CVS projects are read-only + CVS = "cvs" + GIT = "git" + SUBVERSION = "svn" + MERCURIAL = "hg" + BAZAAR = "bzr" + + +VCS_NAMES = set(v.value for v in VcsNames.__members__.values()) + + +@dataclass +class SourceForgeListerEntry: + vcs: VcsNames + url: str + last_modified: datetime.date + + +SourceForgeListerPage = List[SourceForgeListerEntry] + +MAIN_SITEMAP_URL = "https://sourceforge.net/allura_sitemap/sitemap.xml" +SITEMAP_XML_NAMESPACE = "{http://www.sitemaps.org/schemas/sitemap/0.9}" + +# REST resource endpoint for information about the given project. +# +# `namespace`: Project namespace. Very often `p`, but can be something else like +# `adobe` +# `project`: Project name, e.g. `seedai`. Can be a subproject, e.g `backapps/website`. +PROJECT_REST_URL_FORMAT = "https://sourceforge.net/rest/{namespace}/{project}" + +# Predictable URL for cloning (in the broad sense) a VCS registered for the project. +# +# `vcs`: VCS type, one of `VCS_NAMES` +# `namespace`: Project namespace. Very often `p`, but can be something else like +# `adobe`. +# `project`: Project name, e.g. `seedai`. Can be a subproject, e.g `backapps/website`. +# `mount_point`: url path used by the repo. For example, the Code::Blocks project uses +# `git` (https://git.code.sf.net/p/codeblocks/git). +CLONE_URL_FORMAT = "{vcs}.code.sf.net/{namespace}/{project}/{mount_point}" + +PROJ_URL_RE = re.compile( + r"^https://sourceforge.net/(?P<namespace>[^/]+)/(?P<project>[^/]+)/(?P<rest>.*)?" +) + + +class SourceForgeLister(StatelessLister[SourceForgeListerPage]): + """List origins from the "SourceForge" forge. 
+ + """ + + # Part of the lister API, that identifies this lister + LISTER_NAME = "sourceforge" + + def __init__(self, scheduler: SchedulerInterface): + super().__init__( + scheduler=scheduler, url="https://sourceforge.net", instance="main" + ) + + self.session = requests.Session() + # Declaring the USER_AGENT is more sysadm-friendly for the forge we list + self.session.headers.update( + {"Accept": "application/json", "User-Agent": USER_AGENT} + ) + + @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING)) + def page_request(self, url, params) -> requests.Response: + # Log listed URL to ease debugging + logger.debug("Fetching URL %s with params %s", url, params) + response = self.session.get(url, params=params) + + if response.status_code != 200: + # Log response content to ease debugging + logger.warning( + "Unexpected HTTP status code %s on %s: %s", + response.status_code, + response.url, + response.content, + ) + # The lister must fail on blocking errors + response.raise_for_status() + + return response + + def get_pages(self) -> Iterator[SourceForgeListerPage]: + """ + SourceForge has a main XML sitemap that lists its sharded sitemaps for all + projects. + Each XML sub-sitemap lists project pages, which are not unique per project: a + project can have a wiki, a home, a git, an svn, etc. + For each unique project, we query a REST endpoint that lists (among + other things) the tools associated with said project, some of which are + the VCS used. Subprojects are considered separate projects. + Lastly we use the information of which VCS are used to build the predictable + clone URL for any given VCS. 
+ """ + sitemap_contents = self.page_request(MAIN_SITEMAP_URL, {}).text + tree = ElementTree.fromstring(sitemap_contents) + + for subsitemap in tree.iterfind(f"{SITEMAP_XML_NAMESPACE}sitemap"): + # TODO use when adding incremental support + # last_modified = subsitemap.find(f"{SITEMAP_XML_NAMESPACE}lastmod") + location = subsitemap.find(f"{SITEMAP_XML_NAMESPACE}loc") + assert location is not None + sub_url = location.text + subsitemap_contents = self.page_request(sub_url, {}).text + subtree = ElementTree.fromstring(subsitemap_contents) + + yield from self._get_pages_from_subsitemap(subtree) + + def get_origins_from_page( + self, page: SourceForgeListerPage + ) -> Iterator[ListedOrigin]: + assert self.lister_obj.id is not None + for hit in page: + yield ListedOrigin( + lister_id=self.lister_obj.id, + visit_type=hit.vcs.value, + url=hit.url, + last_update=iso8601.parse_date(hit.last_modified), + ) + + def _get_pages_from_subsitemap( + self, subtree: ElementTree.Element + ) -> Iterator[SourceForgeListerPage]: + projects: Set[str] = set() + for project_block in subtree.iterfind(f"{SITEMAP_XML_NAMESPACE}url"): + last_modified_block = project_block.find(f"{SITEMAP_XML_NAMESPACE}lastmod") + assert last_modified_block is not None + last_modified = last_modified_block.text + location = project_block.find(f"{SITEMAP_XML_NAMESPACE}loc") + assert location is not None + project_url = location.text + assert project_url is not None + + match = PROJ_URL_RE.match(project_url) + if match: + matches = match.groupdict() + namespace = matches["namespace"] + if namespace == "projects": + # These have a `p`-namespaced counterpart, use that instead + continue + + project = matches["project"] + rest = matches["rest"] + if rest.count("/") > 1: + # This is a subproject. There exists no sub-subprojects. 
+ subproject_name = rest.rsplit("/", 2)[0] + project = f"{project}/{subproject_name}" + + prev_len = len(projects) + projects.add(project) + + if prev_len == len(projects): + # Already seen + continue + + pages = self._get_pages_for_project(namespace, project, last_modified) + if pages: + yield pages + else: + logger.debug("Project '%s' does not have any VCS", project) + else: + # Should always match, let's log it + msg = "Project URL '%s' does not match expected pattern" + logger.warning(msg, project_url) + + def _get_pages_for_project( + self, namespace, project, last_modified + ) -> SourceForgeListerPage: + endpoint = PROJECT_REST_URL_FORMAT.format(namespace=namespace, project=project) + res = self.page_request(endpoint, {}).json() + + tools = res.get("tools") + if tools is None: + # This probably never happens + logger.warning("Project '%s' does not have any tools", endpoint) + return [] + + hits = [] + for tool in tools: + tool_name = tool["name"] + if tool_name not in VCS_NAMES: + continue + url = CLONE_URL_FORMAT.format( + vcs=tool_name, + namespace=namespace, + project=project, + mount_point=tool["mount_point"], + ) + entry = SourceForgeListerEntry( + vcs=VcsNames(tool_name), url=url, last_modified=last_modified + ) + hits.append(entry) + + return hits diff --git a/swh/lister/sourceforge/tasks.py b/swh/lister/sourceforge/tasks.py new file mode 100644 --- /dev/null +++ b/swh/lister/sourceforge/tasks.py @@ -0,0 +1,20 @@ +# Copyright (C) 2019-2021 the Software Heritage developers +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from typing import Dict + +from celery import shared_task + +from swh.lister.sourceforge.lister import SourceForgeLister + + +@shared_task(name=__name__ + ".FullSourceForgeLister") +def list_sourceforge_full() -> Dict[str, int]: + """Full update of a SourceForge instance""" + return SourceForgeLister.from_configfile().run().dict() + + +@shared_task(name=__name__ 
+ ".ping") +def _ping(): + return "OK" diff --git a/swh/lister/sourceforge/tests/__init__.py b/swh/lister/sourceforge/tests/__init__.py new file mode 100644 diff --git a/swh/lister/sourceforge/tests/data/adobexmp.json b/swh/lister/sourceforge/tests/data/adobexmp.json new file mode 100644 --- /dev/null +++ b/swh/lister/sourceforge/tests/data/adobexmp.json @@ -0,0 +1 @@ +{"shortname": "adobexmp", "name": "Extensible Metadata Platform (XMP)", "_id": "4bfa89ecb9363c60a200004f", "url": "https://sourceforge.net/adobe/adobexmp/", "private": false, "short_description": "Adobe's Extensible Metadata Platform (XMP) is a labeling technology that allows you to embed data about a file, known as metadata, into the file itself. ", "creation_date": "2010-05-24", "summary": "", "external_homepage": "", "video_url": "", "socialnetworks": [], "status": "active", "moved_to_url": "", "preferred_support_tool": "", "preferred_support_url": "", "developers": [{"username": "stefanmakswit", "name": "Stefan Makswit", "url": "https://sourceforge.net/u/stefanmakswit/"}, {"username": "samymakki", "name": "Samy Makki", "url": "https://sourceforge.net/u/samymakki/"}, {"username": "frankbiederich", "name": "Frank Biederich", "url": "https://sourceforge.net/u/frankbiederich/"}, {"username": "adobeadmin", "name": "Adobe Admin", "url": "https://sourceforge.net/u/adobeadmin/"}, {"username": "n_oostendorp", "name": "Nathan Oostendorp", "url": "https://sourceforge.net/u/n_oostendorp/"}, {"username": "joergehrlich", "name": "J\u00f6rg Ehrlich", "url": "https://sourceforge.net/u/joergehrlich/"}], "tools": [{"name": "discussion", "mount_point": "discussion", "url": "/adobe/adobexmp/discussion/", "icons": {"24": "images/forums_24.png", "32": "images/forums_32.png", "48": "images/forums_48.png"}, "installable": true, "tool_label": "Discussion", "mount_label": "Discussion"}, {"name": "wiki", "mount_point": "wiki", "url": "/adobe/adobexmp/wiki/", "icons": {"24": "images/wiki_24.png", "32": "images/wiki_32.png", 
"48": "images/wiki_48.png"}, "installable": true, "tool_label": "Wiki", "mount_label": "Wiki"}, {"name": "wiki", "mount_point": "home", "url": "/adobe/adobexmp/home/", "icons": {"24": "images/wiki_24.png", "32": "images/wiki_32.png", "48": "images/wiki_48.png"}, "installable": true, "tool_label": "Wiki", "mount_label": "Home"}], "labels": ["xmp", "metadata", "adobe"], "categories": {"audience": [], "developmentstatus": [], "environment": [], "language": [], "license": [], "translation": [], "os": [], "database": [], "topic": []}, "icon_url": "https://sourceforge.net/adobe/adobexmp/icon", "screenshots": []} diff --git a/swh/lister/sourceforge/tests/data/backapps-website.json b/swh/lister/sourceforge/tests/data/backapps-website.json new file mode 100644 --- /dev/null +++ b/swh/lister/sourceforge/tests/data/backapps-website.json @@ -0,0 +1 @@ +{"shortname": "backapps/website", "name": "BackApps website", "_id": "4e5b4c310594ca11c1000f67", "url": "https://sourceforge.net/p/backapps/website/", "private": true, "short_description": "BackApps website is the front end of the BackApps service that supplies information for mobile application developers of pros and cons of BackApps service and how to use it. 
", "creation_date": "2011-08-29", "summary": "", "external_homepage": "www.backapps.com", "video_url": "", "socialnetworks": [], "status": "active", "moved_to_url": "", "preferred_support_tool": "_url", "preferred_support_url": "www.backapps.com", "developers": [{"username": "shaiamar", "name": "Shai", "url": "https://sourceforge.net/u/shaiamar/"}], "tools": [{"name": "svn", "mount_point": "code", "url": "/p/backapps/website/code/", "icons": {"24": "images/code_24.png", "32": "images/code_32.png", "48": "images/code_48.png"}, "installable": true, "tool_label": "SVN", "mount_label": "Code"}, {"name": "wiki", "mount_point": "home", "url": "/p/backapps/website/home/", "icons": {"24": "images/wiki_24.png", "32": "images/wiki_32.png", "48": "images/wiki_48.png"}, "installable": true, "tool_label": "Wiki", "mount_label": "Home"}, {"name": "reviews", "mount_point": "reviews", "url": "/p/backapps/website/reviews/", "icons": {"24": "images/sftheme/24x24/blog_24.png", "32": "images/sftheme/32x32/blog_32.png", "48": "images/sftheme/48x48/blog_48.png"}, "installable": false, "tool_label": "Reviews", "mount_label": "Reviews"}, {"name": "summary", "mount_point": "summary", "url": "/p/backapps/website/summary/", "icons": {"24": "images/sftheme/24x24/blog_24.png", "32": "images/sftheme/32x32/blog_32.png", "48": "images/sftheme/48x48/blog_48.png"}, "installable": false, "tool_label": "Summary", "mount_label": "Summary", "sourceforge_group_id": 586632}, {"name": "support", "mount_point": "support", "url": "/p/backapps/website/support/", "icons": {"24": "images/sftheme/24x24/blog_24.png", "32": "images/sftheme/32x32/blog_32.png", "48": "images/sftheme/48x48/blog_48.png"}, "installable": false, "tool_label": "Support", "mount_label": "Support"}, {"name": "files", "mount_point": "files", "url": "/p/backapps/website/files/", "icons": {"24": "images/downloads_24.png", "32": "images/downloads_32.png", "48": "images/downloads_48.png"}, "installable": false, "tool_label": "Files", 
"mount_label": "Files"}, {"name": "activity", "mount_point": "activity", "url": "/p/backapps/website/activity/", "icons": {"24": "images/admin_24.png", "32": "images/admin_32.png", "48": "images/admin_48.png"}, "installable": false, "tool_label": "Tool", "mount_label": "Activity"}], "labels": ["backapps", "mobile", "data sharing", "iphone", "ios", "android", "java", "server", "social", "application server", "social application server"], "categories": {"audience": [{"id": 5, "shortname": "other", "fullname": "Other Audience", "fullpath": "Intended Audience :: Other Audience"}], "developmentstatus": [{"id": 9, "shortname": "alpha", "fullname": "3 - Alpha", "fullpath": "Development Status :: 3 - Alpha"}], "environment": [{"id": 237, "shortname": "web", "fullname": "Web-based", "fullpath": "User Interface :: Web-based"}], "language": [{"id": 198, "shortname": "java", "fullname": "Java", "fullpath": "Programming Language :: Java"}], "license": [{"id": 16, "shortname": "lgpl", "fullname": "GNU Library or Lesser General Public License version 2.0 (LGPLv2)", "fullpath": "License :: OSI-Approved Open Source :: GNU Library or Lesser General Public License version 2.0 (LGPLv2)"}], "translation": [], "os": [], "database": [{"id": 502, "shortname": "db_api_jdbc", "fullname": "JDBC", "fullpath": "Database Environment :: Database API :: JDBC"}], "topic": [{"id": 68, "shortname": "frontends", "fullname": "Front-Ends", "fullpath": "Topic :: Database :: Front-Ends"}, {"id": 606, "shortname": "frameworks", "fullname": "Frameworks", "fullpath": "Topic :: Software Development :: Frameworks"}]}, "icon_url": null, "screenshots": []} diff --git a/swh/lister/sourceforge/tests/data/backapps.json b/swh/lister/sourceforge/tests/data/backapps.json new file mode 100644 --- /dev/null +++ b/swh/lister/sourceforge/tests/data/backapps.json @@ -0,0 +1 @@ +{"shortname": "backapps", "name": "BackApps", "_id": "4e5aac30b9363c3ef60008ea", "url": "https://sourceforge.net/p/backapps/", "private": false, 
"short_description": "BackApps is an innovative service designed for smart phones applications developers that allows data sharing between application users.", "creation_date": "2011-08-28", "summary": "", "external_homepage": "www.backapps.com", "video_url": "", "socialnetworks": [], "status": "active", "moved_to_url": "", "preferred_support_tool": "home", "preferred_support_url": "", "developers": [{"username": "shaiamar", "name": "Shai", "url": "https://sourceforge.net/u/shaiamar/"}], "tools": [{"name": "discussion", "mount_point": "discussion", "url": "/p/backapps/discussion/", "icons": {"24": "images/forums_24.png", "32": "images/forums_32.png", "48": "images/forums_48.png"}, "installable": true, "tool_label": "Discussion", "mount_label": "Discussion"}, {"name": "files", "mount_point": "files", "url": "/p/backapps/files/", "icons": {"24": "images/downloads_24.png", "32": "images/downloads_32.png", "48": "images/downloads_48.png"}, "installable": false, "tool_label": "Files", "mount_label": "Files"}, {"name": "tickets", "mount_point": "tickets", "url": "/p/backapps/tickets/", "icons": {"24": "images/tickets_24.png", "32": "images/tickets_32.png", "48": "images/tickets_48.png"}, "installable": true, "tool_label": "Tickets", "mount_label": "Tickets"}, {"name": "wiki", "mount_point": "wiki", "url": "/p/backapps/wiki/", "icons": {"24": "images/wiki_24.png", "32": "images/wiki_32.png", "48": "images/wiki_48.png"}, "installable": true, "tool_label": "Wiki", "mount_label": "Wiki"}, {"name": "wiki", "mount_point": "home", "url": "/p/backapps/home/", "icons": {"24": "images/wiki_24.png", "32": "images/wiki_32.png", "48": "images/wiki_48.png"}, "installable": true, "tool_label": "Wiki", "mount_label": "Home"}, {"name": "blog", "mount_point": "blog", "url": "/p/backapps/blog/", "icons": {"24": "images/blog_24.png", "32": "images/blog_32.png", "48": "images/blog_48.png"}, "installable": true, "tool_label": "Blog", "mount_label": "Blog"}, {"name": "reviews", "mount_point": 
"reviews", "url": "/p/backapps/reviews/", "icons": {"24": "images/sftheme/24x24/blog_24.png", "32": "images/sftheme/32x32/blog_32.png", "48": "images/sftheme/48x48/blog_48.png"}, "installable": false, "tool_label": "Reviews", "mount_label": "Reviews"}, {"name": "summary", "mount_point": "summary", "url": "/p/backapps/summary/", "icons": {"24": "images/sftheme/24x24/blog_24.png", "32": "images/sftheme/32x32/blog_32.png", "48": "images/sftheme/48x48/blog_48.png"}, "installable": false, "tool_label": "Summary", "mount_label": "Summary", "sourceforge_group_id": 586397}, {"name": "support", "mount_point": "support", "url": "/p/backapps/support/", "icons": {"24": "images/sftheme/24x24/blog_24.png", "32": "images/sftheme/32x32/blog_32.png", "48": "images/sftheme/48x48/blog_48.png"}, "installable": false, "tool_label": "Support", "mount_label": "Support"}, {"name": "activity", "mount_point": "activity", "url": "/p/backapps/activity/", "icons": {"24": "images/admin_24.png", "32": "images/admin_32.png", "48": "images/admin_48.png"}, "installable": false, "tool_label": "Tool", "mount_label": "Activity"}], "labels": ["mobile", "data sharing", "iphone", "ios", "android", "java", "objective c", "server", "social", "application server", "social application server"], "categories": {"audience": [{"id": 5, "shortname": "other", "fullname": "Other Audience", "fullpath": "Intended Audience :: Other Audience"}], "developmentstatus": [{"id": 8, "shortname": "prealpha", "fullname": "2 - Pre-Alpha", "fullpath": "Development Status :: 2 - Pre-Alpha"}], "environment": [{"id": 237, "shortname": "web", "fullname": "Web-based", "fullpath": "User Interface :: Web-based"}], "language": [{"id": 174, "shortname": "objectivec", "fullname": "Objective C", "fullpath": "Programming Language :: Objective C"}, {"id": 198, "shortname": "java", "fullname": "Java", "fullpath": "Programming Language :: Java"}], "license": [{"id": 16, "shortname": "lgpl", "fullname": "GNU Library or Lesser General Public 
License version 2.0 (LGPLv2)", "fullpath": "License :: OSI-Approved Open Source :: GNU Library or Lesser General Public License version 2.0 (LGPLv2)"}], "translation": [{"id": 275, "shortname": "english", "fullname": "English", "fullpath": "Translations :: English"}], "os": [], "database": [{"id": 502, "shortname": "db_api_jdbc", "fullname": "JDBC", "fullpath": "Database Environment :: Database API :: JDBC"}], "topic": [{"id": 67, "shortname": "engines", "fullname": "Database Engines/Servers", "fullpath": "Topic :: Database :: Database Engines/Servers"}, {"id": 606, "shortname": "frameworks", "fullname": "Frameworks", "fullpath": "Topic :: Software Development :: Frameworks"}]}, "icon_url": "https://sourceforge.net/p/backapps/icon", "screenshots": [{"url": "https://sourceforge.net/p/backapps/screenshot/BackAppsWebsiteWhatIsBackApps.png", "thumbnail_url": "https://sourceforge.net/p/backapps/screenshot/BackAppsWebsiteWhatIsBackApps.png/thumb", "caption": "BackApps website - What is BackApps?"}, {"url": "https://sourceforge.net/p/backapps/screenshot/BackAppsWebsiteHomepage.png", "thumbnail_url": "https://sourceforge.net/p/backapps/screenshot/BackAppsWebsiteHomepage.png/thumb", "caption": "BackApps website home page"}, {"url": "https://sourceforge.net/p/backapps/screenshot/BackAppsWebsiteRegisterNow.png", "thumbnail_url": "https://sourceforge.net/p/backapps/screenshot/BackAppsWebsiteRegisterNow.png/thumb", "caption": "BackApps website - Register now"}]} diff --git a/swh/lister/sourceforge/tests/data/main-sitemap.xml b/swh/lister/sourceforge/tests/data/main-sitemap.xml new file mode 100644 --- /dev/null +++ b/swh/lister/sourceforge/tests/data/main-sitemap.xml @@ -0,0 +1,10 @@ + + + + https://sourceforge.net/allura_sitemap/sitemap-0.xml + 2021-03-18 + + https://sourceforge.net/allura_sitemap/sitemap-1.xml + 2021-03-18 + + diff --git a/swh/lister/sourceforge/tests/data/mojunk.json b/swh/lister/sourceforge/tests/data/mojunk.json new file mode 100644 --- /dev/null +++ 
b/swh/lister/sourceforge/tests/data/mojunk.json @@ -0,0 +1 @@ +{"shortname": "mojunk", "name": "mojunk", "_id": "4c34ecc60594ca5c18000572", "url": "https://sourceforge.net/p/mojunk/", "private": false, "short_description": "This is a test project", "creation_date": "2010-07-07", "summary": "", "external_homepage": "", "video_url": "", "socialnetworks": [], "status": "active", "moved_to_url": "", "preferred_support_tool": "", "preferred_support_url": "", "developers": [{"username": "matthewmoore", "name": "Matthew S. Moore", "url": "https://sourceforge.net/u/matthewmoore/"}], "tools": [{"name": "svn", "mount_point": "svn", "url": "/p/mojunk/svn/", "icons": {"24": "images/code_24.png", "32": "images/code_32.png", "48": "images/code_48.png"}, "installable": true, "tool_label": "SVN", "mount_label": "Svn"}, {"name": "git", "mount_point": "git", "url": "/p/mojunk/git/", "icons": {"24": "images/code_24.png", "32": "images/code_32.png", "48": "images/code_48.png"}, "installable": true, "tool_label": "Git", "mount_label": "Git"}, {"name": "git", "mount_point": "git2", "url": "/p/mojunk/git2/", "icons": {"24": "images/code_24.png", "32": "images/code_32.png", "48": "images/code_48.png"}, "installable": true, "tool_label": "Git", "mount_label": "Git2-Label"}, {"name": "wiki", "mount_point": "home", "url": "/p/mojunk/home/", "icons": {"24": "images/wiki_24.png", "32": "images/wiki_32.png", "48": "images/wiki_48.png"}, "installable": true, "tool_label": "Wiki", "mount_label": "Home"}, {"name": "support", "mount_point": "support", "url": "/p/mojunk/support/", "icons": {"24": "images/sftheme/24x24/blog_24.png", "32": "images/sftheme/32x32/blog_32.png", "48": "images/sftheme/48x48/blog_48.png"}, "installable": false, "tool_label": "Support", "mount_label": "Support"}, {"name": "summary", "mount_point": "summary", "url": "/p/mojunk/summary/", "icons": {"24": "images/sftheme/24x24/blog_24.png", "32": "images/sftheme/32x32/blog_32.png", "48": "images/sftheme/48x48/blog_48.png"}, 
"installable": false, "tool_label": "Summary", "mount_label": "Summary", "sourceforge_group_id": 333464}, {"name": "files", "mount_point": "files", "url": "/p/mojunk/files/", "icons": {"24": "images/downloads_24.png", "32": "images/downloads_32.png", "48": "images/downloads_48.png"}, "installable": false, "tool_label": "Files", "mount_label": "Files"}, {"name": "reviews", "mount_point": "reviews", "url": "/p/mojunk/reviews/", "icons": {"24": "images/sftheme/24x24/blog_24.png", "32": "images/sftheme/32x32/blog_32.png", "48": "images/sftheme/48x48/blog_48.png"}, "installable": false, "tool_label": "Reviews", "mount_label": "Reviews"}, {"name": "activity", "mount_point": "activity", "url": "/p/mojunk/activity/", "icons": {"24": "images/admin_24.png", "32": "images/admin_32.png", "48": "images/admin_48.png"}, "installable": false, "tool_label": "Tool", "mount_label": "Activity"}], "labels": [""], "categories": {"audience": [], "developmentstatus": [], "environment": [], "language": [], "license": [], "translation": [], "os": [], "database": [], "topic": []}, "icon_url": null, "screenshots": []} diff --git a/swh/lister/sourceforge/tests/data/mramm.json b/swh/lister/sourceforge/tests/data/mramm.json new file mode 100644 --- /dev/null +++ b/swh/lister/sourceforge/tests/data/mramm.json @@ -0,0 +1 @@ +{"shortname": "mramm", "name": "mramm", "_id": "4bf5c0b51be1ce31a900028f", "url": "https://sourceforge.net/p/mramm/", "private": false, "short_description": "", "creation_date": "2010-11-10", "summary": "", "external_homepage": "", "video_url": "", "socialnetworks": [], "status": "active", "moved_to_url": "", "preferred_support_tool": "", "preferred_support_url": "", "developers": [{"username": "noostendorp", "name": "Nathan Oostendorp", "url": "https://sourceforge.net/u/noostendorp/"}, {"username": "rick446", "name": "Rick Copeland \u2615", "url": "https://sourceforge.net/u/rick446/"}, {"username": "jonathanbeard", "name": "Jonathan T. 
Beard", "url": "https://sourceforge.net/u/jonathanbeard/"}, {"username": "mramm", "name": "Mark Ramm", "url": "https://sourceforge.net/u/mramm/"}, {"username": "yesjustwolf", "name": "Wolf ", "url": "https://sourceforge.net/u/yesjustwolf/"}, {"username": "robinbriggs", "name": "Robin Briggs", "url": "https://sourceforge.net/u/robinbriggs/"}], "tools": [{"name": "wiki", "mount_point": "reviews", "url": "/p/mramm/reviews/", "icons": {"24": "images/wiki_24.png", "32": "images/wiki_32.png", "48": "images/wiki_48.png"}, "installable": true, "tool_label": "Wiki", "mount_label": "Reviews"}, {"name": "tickets", "mount_point": "todo", "url": "/p/mramm/todo/", "icons": {"24": "images/tickets_24.png", "32": "images/tickets_32.png", "48": "images/tickets_48.png"}, "installable": true, "tool_label": "Tickets", "mount_label": "Todo"}, {"name": "wiki", "mount_point": "notes", "url": "/p/mramm/notes/", "icons": {"24": "images/wiki_24.png", "32": "images/wiki_32.png", "48": "images/wiki_48.png"}, "installable": true, "tool_label": "Wiki", "mount_label": "Notes"}, {"name": "discussion", "mount_point": "discussion", "url": "/p/mramm/discussion/", "icons": {"24": "images/forums_24.png", "32": "images/forums_32.png", "48": "images/forums_48.png"}, "installable": true, "tool_label": "Discussion", "mount_label": "Discussion"}, {"name": "git", "mount_point": "files", "url": "/p/mramm/files/", "icons": {"24": "images/code_24.png", "32": "images/code_32.png", "48": "images/code_48.png"}, "installable": true, "tool_label": "Git", "mount_label": "Files"}, {"name": "svn", "mount_point": "svn", "url": "/p/mramm/svn/", "icons": {"24": "images/code_24.png", "32": "images/code_32.png", "48": "images/code_48.png"}, "installable": true, "tool_label": "SVN", "mount_label": "SVN"}, {"name": "git", "mount_point": "git", "url": "/p/mramm/git/", "icons": {"24": "images/code_24.png", "32": "images/code_32.png", "48": "images/code_48.png"}, "installable": true, "tool_label": "Git", "mount_label": "Git"}, 
{"name": "wiki", "mount_point": "home", "url": "/p/mramm/home/", "icons": {"24": "images/wiki_24.png", "32": "images/wiki_32.png", "48": "images/wiki_48.png"}, "installable": true, "tool_label": "Wiki", "mount_label": "Home"}, {"name": "summary", "mount_point": "summary", "url": "/p/mramm/summary/", "icons": {"24": "images/sftheme/24x24/blog_24.png", "32": "images/sftheme/32x32/blog_32.png", "48": "images/sftheme/48x48/blog_48.png"}, "installable": false, "tool_label": "Summary", "mount_label": "Summary", "sourceforge_group_id": 372420}, {"name": "support", "mount_point": "support", "url": "/p/mramm/support/", "icons": {"24": "images/sftheme/24x24/blog_24.png", "32": "images/sftheme/32x32/blog_32.png", "48": "images/sftheme/48x48/blog_48.png"}, "installable": false, "tool_label": "Support", "mount_label": "Support"}, {"name": "activity", "mount_point": "activity", "url": "/p/mramm/activity/", "icons": {"24": "images/admin_24.png", "32": "images/admin_32.png", "48": "images/admin_48.png"}, "installable": false, "tool_label": "Tool", "mount_label": "Activity"}], "labels": [""], "categories": {"audience": [], "developmentstatus": [], "environment": [], "language": [], "license": [], "translation": [], "os": [], "database": [], "topic": []}, "icon_url": null, "screenshots": []} diff --git a/swh/lister/sourceforge/tests/data/os3dmodels.json b/swh/lister/sourceforge/tests/data/os3dmodels.json new file mode 100644 --- /dev/null +++ b/swh/lister/sourceforge/tests/data/os3dmodels.json @@ -0,0 +1 @@ +{"shortname": "os3dmodels", "name": "Open Source 3D Models", "_id": "4bf3fc291be1ce2f10000050", "url": "https://sourceforge.net/p/os3dmodels/", "private": false, "short_description": "This is a set of parametric 3D printable models created for the RepRap/Makerbot", "creation_date": "2010-11-10", "summary": "", "external_homepage": "", "video_url": "", "socialnetworks": [], "status": "active", "moved_to_url": "", "preferred_support_tool": "", "preferred_support_url": "", 
"developers": [{"username": "n_oostendorp", "name": "Nathan Oostendorp", "url": "https://sourceforge.net/u/n_oostendorp/"}], "tools": [{"name": "tickets", "mount_point": "tickets", "url": "/p/os3dmodels/tickets/", "icons": {"24": "images/tickets_24.png", "32": "images/tickets_32.png", "48": "images/tickets_48.png"}, "installable": true, "tool_label": "Tickets", "mount_label": "Tickets"}, {"name": "git", "mount_point": "git", "url": "/p/os3dmodels/git/", "icons": {"24": "images/code_24.png", "32": "images/code_32.png", "48": "images/code_48.png"}, "installable": true, "tool_label": "Git", "mount_label": "Git"}, {"name": "svn", "mount_point": "svn", "url": "/p/os3dmodels/svn/", "icons": {"24": "images/code_24.png", "32": "images/code_32.png", "48": "images/code_48.png"}, "installable": true, "tool_label": "SVN", "mount_label": "Svn"}, {"name": "wiki", "mount_point": "home", "url": "/p/os3dmodels/home/", "icons": {"24": "images/wiki_24.png", "32": "images/wiki_32.png", "48": "images/wiki_48.png"}, "installable": true, "tool_label": "Wiki", "mount_label": "Home"}, {"name": "reviews", "mount_point": "reviews", "url": "/p/os3dmodels/reviews/", "icons": {"24": "images/sftheme/24x24/blog_24.png", "32": "images/sftheme/32x32/blog_32.png", "48": "images/sftheme/48x48/blog_48.png"}, "installable": false, "tool_label": "Reviews", "mount_label": "Reviews"}, {"name": "support", "mount_point": "support", "url": "/p/os3dmodels/support/", "icons": {"24": "images/sftheme/24x24/blog_24.png", "32": "images/sftheme/32x32/blog_32.png", "48": "images/sftheme/48x48/blog_48.png"}, "installable": false, "tool_label": "Support", "mount_label": "Support"}, {"name": "files", "mount_point": "files", "url": "/p/os3dmodels/files/", "icons": {"24": "images/downloads_24.png", "32": "images/downloads_32.png", "48": "images/downloads_48.png"}, "installable": false, "tool_label": "Files", "mount_label": "Files"}, {"name": "summary", "mount_point": "summary", "url": "/p/os3dmodels/summary/", "icons": 
{"24": "images/sftheme/24x24/blog_24.png", "32": "images/sftheme/32x32/blog_32.png", "48": "images/sftheme/48x48/blog_48.png"}, "installable": false, "tool_label": "Summary", "mount_label": "Summary", "sourceforge_group_id": 372436}, {"name": "activity", "mount_point": "activity", "url": "/p/os3dmodels/activity/", "icons": {"24": "images/admin_24.png", "32": "images/admin_32.png", "48": "images/admin_48.png"}, "installable": false, "tool_label": "Tool", "mount_label": "Activity"}], "labels": ["makerbot", "reprap", "3d models"], "categories": {"audience": [], "developmentstatus": [], "environment": [], "language": [], "license": [], "translation": [], "os": [], "database": [], "topic": []}, "icon_url": null, "screenshots": []} diff --git a/swh/lister/sourceforge/tests/data/subsitemap-0.xml b/swh/lister/sourceforge/tests/data/subsitemap-0.xml new file mode 100644 --- /dev/null +++ b/swh/lister/sourceforge/tests/data/subsitemap-0.xml @@ -0,0 +1,58 @@ + + + + https://sourceforge.net/projects/os3dmodels/files/ + 2017-03-31 + daily + + + https://sourceforge.net/p/os3dmodels/home/ + 2017-03-31 + daily + + + https://sourceforge.net/p/os3dmodels/tickets/ + 2017-03-31 + daily + + + https://sourceforge.net/p/mramm/home/ + 2019-04-04 + daily + + + https://sourceforge.net/p/mramm/todo/ + 2019-04-04 + daily + + + https://sourceforge.net/p/mramm/notes/ + 2019-04-04 + daily + + + https://sourceforge.net/p/mramm/reviews/ + 2019-04-04 + daily + + + https://sourceforge.net/p/mramm/discussion/ + 2019-04-04 + daily + + + https://sourceforge.net/adobe/adobexmp/home/ + 2017-10-17 + daily + + + https://sourceforge.net/adobe/adobexmp/wiki/ + 2017-10-17 + daily + + + https://sourceforge.net/adobe/adobexmp/discussion/ + 2017-10-17 + daily + + diff --git a/swh/lister/sourceforge/tests/data/subsitemap-1.xml b/swh/lister/sourceforge/tests/data/subsitemap-1.xml new file mode 100644 --- /dev/null +++ b/swh/lister/sourceforge/tests/data/subsitemap-1.xml @@ -0,0 +1,38 @@ + + + + 
https://sourceforge.net/projects/backapps/files/ + 2021-02-11 + daily + + + https://sourceforge.net/p/backapps/tickets/ + 2021-02-11 + daily + + + https://sourceforge.net/p/backapps/chat/ + 2021-02-11 + daily + + + https://sourceforge.net/p/backapps/website/files/ + 2021-02-11 + daily + + + https://sourceforge.net/p/backapps/website/tickets/ + 2021-02-11 + daily + + + https://sourceforge.net/projects/mojunk/files/ + 2017-12-31 + daily + + + https://sourceforge.net/p/mojunk/home/ + 2017-12-31 + daily + + diff --git a/swh/lister/sourceforge/tests/test_lister.py b/swh/lister/sourceforge/tests/test_lister.py new file mode 100644 --- /dev/null +++ b/swh/lister/sourceforge/tests/test_lister.py @@ -0,0 +1,180 @@ +# Copyright (C) 2021 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information +import functools +import json +from pathlib import Path +import re + +import pytest +from requests.exceptions import HTTPError + +from swh.lister import USER_AGENT +from swh.lister.sourceforge.lister import ( + MAIN_SITEMAP_URL, + PROJECT_REST_URL_FORMAT, + SourceForgeLister, +) + +# Mapping of project name to namespace +TEST_PROJECTS = { + "adobexmp": "adobe", + "backapps": "p", + "backapps/website": "p", + "mojunk": "p", + "mramm": "p", + "os3dmodels": "p", +} + +URLS_MATCHER = { + PROJECT_REST_URL_FORMAT.format(namespace=namespace, project=project): project + for project, namespace in TEST_PROJECTS.items() +} + + +def get_main_sitemap(datadir): + return Path(datadir, "main-sitemap.xml").read_text() + + +def get_subsitemap_0(datadir): + return Path(datadir, "subsitemap-0.xml").read_text() + + +def get_subsitemap_1(datadir): + return Path(datadir, "subsitemap-1.xml").read_text() + + +def get_project_json(datadir, request, context): + url = request.url + project = URLS_MATCHER.get(url) + assert project is not None, 
f"Url '{url}' could not be matched" + project = project.replace("/", "-") + return json.loads(Path(datadir, f"{project}.json").read_text()) + + +def _check_request_headers(request): + return request.headers.get("User-Agent") == USER_AGENT + + +def test_sourceforge_lister_full(swh_scheduler, requests_mock, datadir): + """ + Simulate a full listing of an artificially restricted sourceforge. + There are 5 different projects, spread over two sub-sitemaps, a few of which + have multiple VCS listed, one has none, one is outside of the standard `/p/` + namespace, some with custom mount points. + All non-interesting but related entries have been kept. + """ + lister = SourceForgeLister(scheduler=swh_scheduler) + + requests_mock.get( + MAIN_SITEMAP_URL, + text=get_main_sitemap(datadir), + additional_matcher=_check_request_headers, + ) + requests_mock.get( + "https://sourceforge.net/allura_sitemap/sitemap-0.xml", + text=get_subsitemap_0(datadir), + additional_matcher=_check_request_headers, + ) + requests_mock.get( + "https://sourceforge.net/allura_sitemap/sitemap-1.xml", + text=get_subsitemap_1(datadir), + additional_matcher=_check_request_headers, + ) + requests_mock.get( + re.compile("https://sourceforge.net/rest/.*"), + json=functools.partial(get_project_json, datadir), + additional_matcher=_check_request_headers, + ) + + stats = lister.run() + # - os3dmodels (2 repos), + # - mramm (3 repos), + # - mojunk (3 repos), + # - backapps/website (1 repo). + # adobe and backapps itself have no repos. 
+ assert stats.pages == 4 + assert stats.origins == 9 + + scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results + res = {o.url: (o.visit_type, str(o.last_update.date())) for o in scheduler_origins} + assert res == { + "svn.code.sf.net/p/backapps/website/code": ("svn", "2021-02-11"), + "git.code.sf.net/p/os3dmodels/git": ("git", "2017-03-31"), + "svn.code.sf.net/p/os3dmodels/svn": ("svn", "2017-03-31"), + "git.code.sf.net/p/mramm/files": ("git", "2019-04-04"), + "git.code.sf.net/p/mramm/git": ("git", "2019-04-04"), + "svn.code.sf.net/p/mramm/svn": ("svn", "2019-04-04"), + "git.code.sf.net/p/mojunk/git": ("git", "2017-12-31"), + "git.code.sf.net/p/mojunk/git2": ("git", "2017-12-31"), + "svn.code.sf.net/p/mojunk/svn": ("svn", "2017-12-31"), + } + + +def test_sourceforge_lister_retry(swh_scheduler, requests_mock, mocker, datadir): + # Exponential retries take a long time, so stub time.sleep + mocked_sleep = mocker.patch("time.sleep", return_value=None) + + lister = SourceForgeLister(scheduler=swh_scheduler) + + requests_mock.get( + MAIN_SITEMAP_URL, + [ + {"status_code": 429}, + {"status_code": 429}, + {"text": get_main_sitemap(datadir)}, + ], + additional_matcher=_check_request_headers, + ) + requests_mock.get( + "https://sourceforge.net/allura_sitemap/sitemap-0.xml", + [{"status_code": 429}, {"text": get_subsitemap_0(datadir), "status_code": 301}], + additional_matcher=_check_request_headers, + ) + requests_mock.get( + "https://sourceforge.net/allura_sitemap/sitemap-1.xml", + [{"status_code": 429}, {"text": get_subsitemap_1(datadir)}], + additional_matcher=_check_request_headers, + ) + requests_mock.get( + re.compile("https://sourceforge.net/rest/.*"), + [{"status_code": 429}, {"json": functools.partial(get_project_json, datadir)}], + additional_matcher=_check_request_headers, + ) + + stats = lister.run() + # - os3dmodels (2 repos), + # - mramm (3 repos), + # - mojunk (3 repos), + # - backapps/website (1 repo). 
+ # adobe and backapps itself have no repos. + assert stats.pages == 4 + assert stats.origins == 9 + + scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results + assert {o.url: o.visit_type for o in scheduler_origins} == { + "svn.code.sf.net/p/backapps/website/code": "svn", + "git.code.sf.net/p/os3dmodels/git": "git", + "svn.code.sf.net/p/os3dmodels/svn": "svn", + "git.code.sf.net/p/mramm/files": "git", + "git.code.sf.net/p/mramm/git": "git", + "svn.code.sf.net/p/mramm/svn": "svn", + "git.code.sf.net/p/mojunk/git": "git", + "git.code.sf.net/p/mojunk/git2": "git", + "svn.code.sf.net/p/mojunk/svn": "svn", + } + + # Test `time.sleep` is called with exponential retries + calls = [1.0, 10.0, 1.0, 1.0] + mocked_sleep.assert_has_calls([mocker.call(c) for c in calls]) + + +@pytest.mark.parametrize("status_code", [500, 503, 504, 403, 404]) +def test_sourceforge_lister_http_error(swh_scheduler, requests_mock, status_code): + lister = SourceForgeLister(scheduler=swh_scheduler) + + requests_mock.get(MAIN_SITEMAP_URL, status_code=status_code) + + with pytest.raises(HTTPError): + lister.run() diff --git a/swh/lister/sourceforge/tests/test_tasks.py b/swh/lister/sourceforge/tests/test_tasks.py new file mode 100644 --- /dev/null +++ b/swh/lister/sourceforge/tests/test_tasks.py @@ -0,0 +1,34 @@ +# Copyright (C) 2019-2021 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from swh.lister.pattern import ListerStats + + +def test_sourceforge_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker): + res = swh_scheduler_celery_app.send_task("swh.lister.sourceforge.tasks.ping") + assert res + res.wait() + assert res.successful() + assert res.result == "OK" + + +def test_sourceforge_full_lister_task( + swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker +): + stats = 
ListerStats(pages=10, origins=900) + mock_lister = mocker.patch("swh.lister.sourceforge.tasks.SourceForgeLister") + mock_lister.from_configfile.return_value = mock_lister + mock_lister.run.return_value = stats + + res = swh_scheduler_celery_app.send_task( + "swh.lister.sourceforge.tasks.FullSourceForgeLister" + ) + assert res + res.wait() + assert res.successful() + + mock_lister.from_configfile.assert_called_once() + mock_lister.run.assert_called_once() + assert res.result == stats.dict()