diff --git a/swh/lister/bitbucket/lister.py b/swh/lister/bitbucket/lister.py
index f550003..edc4e0e 100644
--- a/swh/lister/bitbucket/lister.py
+++ b/swh/lister/bitbucket/lister.py
@@ -1,200 +1,200 @@
 # Copyright (C) 2017-2021 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 from dataclasses import asdict, dataclass
 from datetime import datetime
 import logging
 import random
 from typing import Any, Dict, Iterator, List, Optional
 from urllib import parse
 
 import iso8601
 import requests
 from tenacity.before_sleep import before_sleep_log
 
 from swh.lister.utils import throttling_retry
 from swh.scheduler.interface import SchedulerInterface
 from swh.scheduler.model import ListedOrigin
 
 from .. import USER_AGENT
 from ..pattern import CredentialsType, Lister
 
 logger = logging.getLogger(__name__)
 
 
 @dataclass
 class BitbucketListerState:
     """State of Bitbucket lister"""
 
     last_repo_cdate: Optional[datetime] = None
     """Creation date and time of the last listed repository during an
     incremental pass"""
 
 
 class BitbucketLister(Lister[BitbucketListerState, List[Dict[str, Any]]]):
-    """List origins from Bitbucket using its REST API.
+    """List origins from Bitbucket using its API.
 
     Bitbucket API has the following rate-limit configuration:
 
       * 60 requests per hour for anonymous users
       * 1000 requests per hour for authenticated users
 
     The lister is working in anonymous mode by default but Bitbucket account
     credentials can be provided to perform authenticated requests.
     """
 
     LISTER_NAME = "bitbucket"
     INSTANCE = "bitbucket"
 
     API_URL = "https://api.bitbucket.org/2.0/repositories"
 
     def __init__(
         self,
         scheduler: SchedulerInterface,
         page_size: int = 1000,
         incremental: bool = True,
         credentials: CredentialsType = None,
     ):
         super().__init__(
             scheduler=scheduler,
             credentials=credentials,
             url=self.API_URL,
             instance=self.INSTANCE,
         )
 
         self.incremental = incremental
 
         self.url_params = {
             "pagelen": page_size,
             # only return needed JSON fields in bitbucket API responses
             # (also prevent errors 500 when listing)
             "fields": (
                 "next,values.links.clone.href,values.scm,values.updated_on,"
                 "values.created_on"
             ),
         }
 
         self.session = requests.Session()
         self.session.headers.update(
             {"Accept": "application/json", "User-Agent": USER_AGENT}
         )
 
         if len(self.credentials) > 0:
             cred = random.choice(self.credentials)
             logger.warning(
                 "Using Bitbucket credentials from user %s", cred["username"]
             )
             self.set_credentials(cred["username"], cred["password"])
         else:
             logger.warning("No credentials set in configuration, using anonymous mode")
 
     def state_from_dict(self, d: Dict[str, Any]) -> BitbucketListerState:
         last_repo_cdate = d.get("last_repo_cdate")
         if last_repo_cdate is not None:
             d["last_repo_cdate"] = iso8601.parse_date(last_repo_cdate)
         return BitbucketListerState(**d)
 
     def state_to_dict(self, state: BitbucketListerState) -> Dict[str, Any]:
         d = asdict(state)
         last_repo_cdate = d.get("last_repo_cdate")
         if last_repo_cdate is not None:
             d["last_repo_cdate"] = last_repo_cdate.isoformat()
         return d
 
     def set_credentials(
         self, username: Optional[str], password: Optional[str]
     ) -> None:
         """Set basic authentication headers with given credentials."""
         if username is not None and password is not None:
             self.session.auth = (username, password)
 
     @throttling_retry(before_sleep=before_sleep_log(logger, logging.DEBUG))
     def page_request(self, last_repo_cdate: str) -> requests.Response:
 
         self.url_params["after"] = last_repo_cdate
logger.debug("Fetching URL %s with params %s", self.url, self.url_params) response = self.session.get(self.url, params=self.url_params) if response.status_code != 200: logger.warning( "Unexpected HTTP status code %s on %s: %s", response.status_code, response.url, response.content, ) response.raise_for_status() return response def get_pages(self) -> Iterator[List[Dict[str, Any]]]: last_repo_cdate: str = "1970-01-01" if ( self.incremental and self.state is not None and self.state.last_repo_cdate is not None ): last_repo_cdate = self.state.last_repo_cdate.isoformat() while True: body = self.page_request(last_repo_cdate).json() yield body["values"] next_page_url = body.get("next") if next_page_url is not None: next_page_url = parse.urlparse(next_page_url) if not next_page_url.query: logger.warning("Failed to parse url %s", next_page_url) break last_repo_cdate = parse.parse_qs(next_page_url.query)["after"][0] else: # last page break def get_origins_from_page( self, page: List[Dict[str, Any]] ) -> Iterator[ListedOrigin]: """Convert a page of Bitbucket repositories into a list of ListedOrigins. """ assert self.lister_obj.id is not None for repo in page: last_update = iso8601.parse_date(repo["updated_on"]) origin_url = repo["links"]["clone"][0]["href"] origin_type = repo["scm"] yield ListedOrigin( lister_id=self.lister_obj.id, url=origin_url, visit_type=origin_type, last_update=last_update, ) def commit_page(self, page: List[Dict[str, Any]]) -> None: """Update the currently stored state using the latest listed page.""" if self.incremental: last_repo = page[-1] last_repo_cdate = iso8601.parse_date(last_repo["created_on"]) if ( self.state.last_repo_cdate is None or last_repo_cdate > self.state.last_repo_cdate ): self.state.last_repo_cdate = last_repo_cdate def finalize(self) -> None: if self.incremental: scheduler_state = self.get_state_from_scheduler() if self.state.last_repo_cdate is None: return # Update the lister state in the backend only if the last seen id of # the current run is higher than that stored in the database. if ( scheduler_state.last_repo_cdate is None or self.state.last_repo_cdate > scheduler_state.last_repo_cdate ): self.updated = True diff --git a/swh/lister/sourceforge/lister.py b/swh/lister/sourceforge/lister.py index ae902b0..ec1db9f 100644 --- a/swh/lister/sourceforge/lister.py +++ b/swh/lister/sourceforge/lister.py @@ -1,224 +1,224 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from dataclasses import dataclass import datetime from enum import Enum import logging import re from typing import Iterator, List, Set from xml.etree import ElementTree import iso8601 import requests from tenacity.before_sleep import before_sleep_log from swh.lister.utils import throttling_retry from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from .. 
diff --git a/swh/lister/sourceforge/lister.py b/swh/lister/sourceforge/lister.py
index ae902b0..ec1db9f 100644
--- a/swh/lister/sourceforge/lister.py
+++ b/swh/lister/sourceforge/lister.py
@@ -1,224 +1,224 @@
 # Copyright (C) 2021 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 from dataclasses import dataclass
 import datetime
 from enum import Enum
 import logging
 import re
 from typing import Iterator, List, Set
 from xml.etree import ElementTree
 
 import iso8601
 import requests
 from tenacity.before_sleep import before_sleep_log
 
 from swh.lister.utils import throttling_retry
 from swh.scheduler.interface import SchedulerInterface
 from swh.scheduler.model import ListedOrigin
 
 from .. import USER_AGENT
 from ..pattern import StatelessLister
 
 logger = logging.getLogger(__name__)
 
 
 class VcsNames(Enum):
     """Used to filter SourceForge tool names for valid VCS types"""
 
     # CVS projects are read-only
     CVS = "cvs"
     GIT = "git"
     SUBVERSION = "svn"
     MERCURIAL = "hg"
     BAZAAR = "bzr"
 
 
 VCS_NAMES = set(v.value for v in VcsNames.__members__.values())
 
 
 @dataclass
 class SourceForgeListerEntry:
     vcs: VcsNames
     url: str
     last_modified: datetime.date
 
 
 SourceForgeListerPage = List[SourceForgeListerEntry]
 
 MAIN_SITEMAP_URL = "https://sourceforge.net/allura_sitemap/sitemap.xml"
 SITEMAP_XML_NAMESPACE = "{http://www.sitemaps.org/schemas/sitemap/0.9}"
 
-# REST resource endpoint for information about the given project.
+# API resource endpoint for information about the given project.
 #
 # `namespace`: Project namespace. Very often `p`, but can be something else like
 #              `adobe`
 # `project`: Project name, e.g. `seedai`. Can be a subproject, e.g `backapps/website`.
-PROJECT_REST_URL_FORMAT = "https://sourceforge.net/rest/{namespace}/{project}"
+PROJECT_API_URL_FORMAT = "https://sourceforge.net/rest/{namespace}/{project}"
 
 # Predictable URL for cloning (in the broad sense) a VCS registered for the project.
 #
 # `vcs`: VCS type, one of `VCS_NAMES`
 # `namespace`: Project namespace. Very often `p`, but can be something else like
 #              `adobe`.
 # `project`: Project name, e.g. `seedai`. Can be a subproject, e.g `backapps/website`.
 # `mount_point`: url path used by the repo. For example, the Code::Blocks project uses
 #                `git` (https://git.code.sf.net/p/codeblocks/git).
 CLONE_URL_FORMAT = "{vcs}.code.sf.net/{namespace}/{project}/{mount_point}"
 
 PROJ_URL_RE = re.compile(
     r"^https://sourceforge.net/(?P<namespace>[^/]+)/(?P<project>[^/]+)/(?P<rest>.*)?"
 )
 
 
 class SourceForgeLister(StatelessLister[SourceForgeListerPage]):
     """List origins from the "SourceForge" forge."""
 
     # Part of the lister API, that identifies this lister
     LISTER_NAME = "sourceforge"
 
     def __init__(self, scheduler: SchedulerInterface):
         super().__init__(
             scheduler=scheduler, url="https://sourceforge.net", instance="main"
         )
 
         self.session = requests.Session()
         # Declare the USER_AGENT is more sysadm-friendly for the forge we list
         self.session.headers.update(
             {"Accept": "application/json", "User-Agent": USER_AGENT}
         )
 
     @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
     def page_request(self, url, params) -> requests.Response:
         # Log listed URL to ease debugging
         logger.debug("Fetching URL %s with params %s", url, params)
 
         response = self.session.get(url, params=params)
 
         if response.status_code != 200:
             # Log response content to ease debugging
             logger.warning(
                 "Unexpected HTTP status code %s on %s: %s",
                 response.status_code,
                 response.url,
                 response.content,
             )
         # The lister must fail on blocking errors
         response.raise_for_status()
 
         return response
 
     def get_pages(self) -> Iterator[SourceForgeListerPage]:
         """
         SourceForge has a main XML sitemap that lists its sharded sitemaps for all
         projects.
         Each XML sub-sitemap lists project pages, which are not unique per project:
         a project can have a wiki, a home, a git, an svn, etc.
-        For each unique project, we query a REST endpoint that lists (among
+        For each unique project, we query an API endpoint that lists (among
         other things) the tools associated with said project, some of which are
         the VCS used. Subprojects are considered separate projects.
         Lastly we use the information of which VCS are used to build the
         predictable clone URL for any given VCS.
""" sitemap_contents = self.page_request(MAIN_SITEMAP_URL, {}).text tree = ElementTree.fromstring(sitemap_contents) for subsitemap in tree.iterfind(f"{SITEMAP_XML_NAMESPACE}sitemap"): # TODO use when adding incremental support # last_modified = sub_sitemap.find(f"{SITEMAP_XML_NAMESPACE}lastmod") location = subsitemap.find(f"{SITEMAP_XML_NAMESPACE}loc") assert location is not None sub_url = location.text subsitemap_contents = self.page_request(sub_url, {}).text subtree = ElementTree.fromstring(subsitemap_contents) yield from self._get_pages_from_subsitemap(subtree) def get_origins_from_page( self, page: SourceForgeListerPage ) -> Iterator[ListedOrigin]: assert self.lister_obj.id is not None for hit in page: yield ListedOrigin( lister_id=self.lister_obj.id, visit_type=hit.vcs.value, url=hit.url, last_update=iso8601.parse_date(hit.last_modified), ) def _get_pages_from_subsitemap( self, subtree: ElementTree.Element ) -> Iterator[SourceForgeListerPage]: projects: Set[str] = set() for project_block in subtree.iterfind(f"{SITEMAP_XML_NAMESPACE}url"): last_modified_block = project_block.find(f"{SITEMAP_XML_NAMESPACE}lastmod") assert last_modified_block is not None last_modified = last_modified_block.text location = project_block.find(f"{SITEMAP_XML_NAMESPACE}loc") assert location is not None project_url = location.text assert project_url is not None match = PROJ_URL_RE.match(project_url) if match: matches = match.groupdict() namespace = matches["namespace"] if namespace == "projects": # These have a `p`-namespaced counterpart, use that instead continue project = matches["project"] rest = matches["rest"] if rest.count("/") > 1: # This is a subproject. There exists no sub-subprojects. subproject_name = rest.rsplit("/", 2)[0] project = f"{project}/{subproject_name}" prev_len = len(projects) projects.add(project) if prev_len == len(projects): # Already seen continue pages = self._get_pages_for_project(namespace, project, last_modified) if pages: yield pages else: logger.debug("Project '%s' does not have any VCS", project) else: # Should always match, let's log it msg = "Project URL '%s' does not match expected pattern" logger.warning(msg, project_url) def _get_pages_for_project( self, namespace, project, last_modified ) -> SourceForgeListerPage: - endpoint = PROJECT_REST_URL_FORMAT.format(namespace=namespace, project=project) + endpoint = PROJECT_API_URL_FORMAT.format(namespace=namespace, project=project) res = self.page_request(endpoint, {}).json() tools = res.get("tools") if tools is None: # This probably never happens logger.warning("Project '%s' does not have any tools", endpoint) return [] hits = [] for tool in tools: tool_name = tool["name"] if tool_name not in VCS_NAMES: continue url = CLONE_URL_FORMAT.format( vcs=tool_name, namespace=namespace, project=project, mount_point=tool["mount_point"], ) entry = SourceForgeListerEntry( vcs=VcsNames(tool_name), url=url, last_modified=last_modified ) hits.append(entry) return hits diff --git a/swh/lister/sourceforge/tests/test_lister.py b/swh/lister/sourceforge/tests/test_lister.py index 39bc163..0c1f226 100644 --- a/swh/lister/sourceforge/tests/test_lister.py +++ b/swh/lister/sourceforge/tests/test_lister.py @@ -1,180 +1,180 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import functools import json from pathlib import Path import re import pytest from 
diff --git a/swh/lister/sourceforge/tests/test_lister.py b/swh/lister/sourceforge/tests/test_lister.py
index 39bc163..0c1f226 100644
--- a/swh/lister/sourceforge/tests/test_lister.py
+++ b/swh/lister/sourceforge/tests/test_lister.py
@@ -1,180 +1,180 @@
 # Copyright (C) 2021 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import functools
 import json
 from pathlib import Path
 import re
 
 import pytest
 from requests.exceptions import HTTPError
 
 from swh.lister import USER_AGENT
 from swh.lister.sourceforge.lister import (
     MAIN_SITEMAP_URL,
-    PROJECT_REST_URL_FORMAT,
+    PROJECT_API_URL_FORMAT,
     SourceForgeLister,
 )
 
 # Mapping of project name to namespace
 TEST_PROJECTS = {
     "adobexmp": "adobe",
     "backapps": "p",
     "backapps/website": "p",
     "mojunk": "p",
     "mramm": "p",
     "os3dmodels": "p",
 }
 
 URLS_MATCHER = {
-    PROJECT_REST_URL_FORMAT.format(namespace=namespace, project=project): project
+    PROJECT_API_URL_FORMAT.format(namespace=namespace, project=project): project
     for project, namespace in TEST_PROJECTS.items()
 }
 
 
 def get_main_sitemap(datadir):
     return Path(datadir, "main-sitemap.xml").read_text()
 
 
 def get_subsitemap_0(datadir):
     return Path(datadir, "subsitemap-0.xml").read_text()
 
 
 def get_subsitemap_1(datadir):
     return Path(datadir, "subsitemap-1.xml").read_text()
 
 
 def get_project_json(datadir, request, context):
     url = request.url
     project = URLS_MATCHER.get(url)
     assert project is not None, f"Url '{url}' could not be matched"
     project = project.replace("/", "-")
     return json.loads(Path(datadir, f"{project}.json").read_text())
 
 
 def _check_request_headers(request):
     return request.headers.get("User-Agent") == USER_AGENT
 
 
 def test_sourceforge_lister_full(swh_scheduler, requests_mock, datadir):
     """
     Simulate a full listing of an artificially restricted sourceforge.
 
     There are 5 different projects, spread over two sub-sitemaps, a few of
     which have multiple VCS listed, one has none, one is outside of the
     standard `/p/` namespace, some with custom mount points.
 
     All non-interesting but related entries have been kept.
     """
     lister = SourceForgeLister(scheduler=swh_scheduler)
 
     requests_mock.get(
         MAIN_SITEMAP_URL,
         text=get_main_sitemap(datadir),
         additional_matcher=_check_request_headers,
     )
 
     requests_mock.get(
         "https://sourceforge.net/allura_sitemap/sitemap-0.xml",
         text=get_subsitemap_0(datadir),
         additional_matcher=_check_request_headers,
     )
 
     requests_mock.get(
         "https://sourceforge.net/allura_sitemap/sitemap-1.xml",
         text=get_subsitemap_1(datadir),
         additional_matcher=_check_request_headers,
     )
 
     requests_mock.get(
         re.compile("https://sourceforge.net/rest/.*"),
         json=functools.partial(get_project_json, datadir),
         additional_matcher=_check_request_headers,
     )
 
     stats = lister.run()
 
     # - os3dmodels (2 repos),
     # - mramm (3 repos),
     # - mojunk (3 repos),
     # - backapps/website (1 repo).
     # adobe and backapps itself have no repos.
     assert stats.pages == 4
     assert stats.origins == 9
 
     scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
 
     res = {o.url: (o.visit_type, str(o.last_update.date())) for o in scheduler_origins}
     assert res == {
         "svn.code.sf.net/p/backapps/website/code": ("svn", "2021-02-11"),
         "git.code.sf.net/p/os3dmodels/git": ("git", "2017-03-31"),
         "svn.code.sf.net/p/os3dmodels/svn": ("svn", "2017-03-31"),
         "git.code.sf.net/p/mramm/files": ("git", "2019-04-04"),
         "git.code.sf.net/p/mramm/git": ("git", "2019-04-04"),
         "svn.code.sf.net/p/mramm/svn": ("svn", "2019-04-04"),
         "git.code.sf.net/p/mojunk/git": ("git", "2017-12-31"),
         "git.code.sf.net/p/mojunk/git2": ("git", "2017-12-31"),
         "svn.code.sf.net/p/mojunk/svn": ("svn", "2017-12-31"),
     }
 
 
 def test_sourceforge_lister_retry(swh_scheduler, requests_mock, mocker, datadir):
     # Exponential retries take a long time, so stub time.sleep
     mocked_sleep = mocker.patch("time.sleep", return_value=None)
 
     lister = SourceForgeLister(scheduler=swh_scheduler)
 
     requests_mock.get(
         MAIN_SITEMAP_URL,
         [
             {"status_code": 429},
             {"status_code": 429},
             {"text": get_main_sitemap(datadir)},
         ],
         additional_matcher=_check_request_headers,
     )
 
     requests_mock.get(
         "https://sourceforge.net/allura_sitemap/sitemap-0.xml",
         [{"status_code": 429}, {"text": get_subsitemap_0(datadir), "status_code": 301}],
         additional_matcher=_check_request_headers,
     )
 
     requests_mock.get(
         "https://sourceforge.net/allura_sitemap/sitemap-1.xml",
         [{"status_code": 429}, {"text": get_subsitemap_1(datadir)}],
         additional_matcher=_check_request_headers,
     )
 
     requests_mock.get(
         re.compile("https://sourceforge.net/rest/.*"),
         [{"status_code": 429}, {"json": functools.partial(get_project_json, datadir)}],
         additional_matcher=_check_request_headers,
     )
 
     stats = lister.run()
 
     # - os3dmodels (2 repos),
     # - mramm (3 repos),
     # - mojunk (3 repos),
     # - backapps/website (1 repo).
     # adobe and backapps itself have no repos.
     assert stats.pages == 4
     assert stats.origins == 9
 
     scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
     assert {o.url: o.visit_type for o in scheduler_origins} == {
         "svn.code.sf.net/p/backapps/website/code": "svn",
         "git.code.sf.net/p/os3dmodels/git": "git",
         "svn.code.sf.net/p/os3dmodels/svn": "svn",
         "git.code.sf.net/p/mramm/files": "git",
         "git.code.sf.net/p/mramm/git": "git",
         "svn.code.sf.net/p/mramm/svn": "svn",
         "git.code.sf.net/p/mojunk/git": "git",
         "git.code.sf.net/p/mojunk/git2": "git",
         "svn.code.sf.net/p/mojunk/svn": "svn",
     }
 
     # Test `time.sleep` is called with exponential retries
     calls = [1.0, 10.0, 1.0, 1.0]
     mocked_sleep.assert_has_calls([mocker.call(c) for c in calls])
 
 
 @pytest.mark.parametrize("status_code", [500, 503, 504, 403, 404])
 def test_sourceforge_lister_http_error(swh_scheduler, requests_mock, status_code):
     lister = SourceForgeLister(scheduler=swh_scheduler)
 
     requests_mock.get(MAIN_SITEMAP_URL, status_code=status_code)
 
     with pytest.raises(HTTPError):
         lister.run()
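
Reviewer note on the retry test: requests_mock consumes a registered response list one entry per matching request, which is what lets the first call serve a 429 and the retried call succeed. A self-contained sketch of that behaviour, with https://example.test/resource as a hypothetical URL:

import requests
import requests_mock

with requests_mock.Mocker() as m:
    # One list entry is consumed per matching request, in order
    m.get(
        "https://example.test/resource",
        [{"status_code": 429}, {"json": {"ok": True}}],
    )
    first = requests.get("https://example.test/resource")
    second = requests.get("https://example.test/resource")
    print(first.status_code, second.json())  # 429 {'ok': True}

In the tests above, throttling_retry drives the second request, so stubbing time.sleep is all that is needed to keep the exponential backoff fast.
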