diff --git a/swh/lister/sourceforge/lister.py b/swh/lister/sourceforge/lister.py
--- a/swh/lister/sourceforge/lister.py
+++ b/swh/lister/sourceforge/lister.py
@@ -2,24 +2,25 @@
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 import datetime
 from enum import Enum
 import logging
 import re
-from typing import Iterator, List, Set
+from typing import Any, Dict, Iterator, List, Optional, Set, Tuple
 from xml.etree import ElementTree

 import iso8601
 import requests
 from tenacity.before_sleep import before_sleep_log

+from swh.core.api.classes import stream_results
 from swh.lister.utils import throttling_retry
 from swh.scheduler.interface import SchedulerInterface
 from swh.scheduler.model import ListedOrigin

 from .. import USER_AGENT
-from ..pattern import StatelessLister
+from ..pattern import Lister

 logger = logging.getLogger(__name__)

@@ -45,6 +46,29 @@
     last_modified: datetime.date


+SubSitemapName = str
+ProjectName = str
+LastModified = datetime.date
+
+
+@dataclass
+class SourceForgeListerState:
+    """Current state of the SourceForge lister in incremental runs"""
+
+    subsitemap_last_modified: Dict[SubSitemapName, LastModified] = field(
+        default_factory=dict
+    )
+    """If a subsitemap is missing from this mapping, we assume it needs a full
+    run. If its "last modified" date is unchanged, we skip the subsitemap;
+    otherwise we fetch it and compare every project's "last modified" date
+    against the `ListedOrigin`s from the database."""
+    empty_projects: Dict[str, LastModified] = field(default_factory=dict)
+    """Some projects (not the majority, but still a meaningful number) have no
+    VCS for us to archive. We remember a mapping of their API URL to their
+    "last modified" date so we don't keep querying them needlessly on every
+    run."""
+
+
 SourceForgeListerPage = List[SourceForgeListerEntry]

 MAIN_SITEMAP_URL = "https://sourceforge.net/allura_sitemap/sitemap.xml"
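
Note: the two mappings above drive every skip decision during an incremental run. A minimal standalone sketch of the subsitemap rule described in the docstrings (the `should_fetch` helper is purely illustrative, not part of the patch):

```python
import datetime

from swh.lister.sourceforge.lister import SourceForgeListerState

state = SourceForgeListerState(
    subsitemap_last_modified={
        "https://sourceforge.net/allura_sitemap/sitemap-0.xml": datetime.date(2021, 3, 18)
    }
)

def should_fetch(url: str, last_modified: datetime.date) -> bool:
    # Unknown subsitemap -> needs a full pass; same date -> skip entirely;
    # different date -> fetch it and compare per-project dates.
    return state.subsitemap_last_modified.get(url) != last_modified

assert not should_fetch(
    "https://sourceforge.net/allura_sitemap/sitemap-0.xml", datetime.date(2021, 3, 18)
)
assert should_fetch(
    "https://sourceforge.net/allura_sitemap/sitemap-1.xml", datetime.date(2021, 3, 18)
)
```
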
@@ -71,8 +95,11 @@
     r"^https://sourceforge.net/(?P<namespace>[^/]+)/(?P<project>[^/]+)/(?P<rest>.*)?"
 )

+# Mapping of `(namespace, project name)` to `last modified` date.
+ProjectsLastModifiedCache = Dict[Tuple[str, str], LastModified]
+

-class SourceForgeLister(StatelessLister[SourceForgeListerPage]):
+class SourceForgeLister(Lister[SourceForgeListerState, SourceForgeListerPage]):
     """List origins from the "SourceForge" forge."""

@@ -80,16 +107,75 @@
     # Part of the lister API that identifies this lister
     LISTER_NAME = "sourceforge"

-    def __init__(self, scheduler: SchedulerInterface):
+    def __init__(self, scheduler: SchedulerInterface, incremental: bool = False):
         super().__init__(
             scheduler=scheduler, url="https://sourceforge.net", instance="main"
         )
+        # Will hold the currently saved "last modified" dates to compare against
+        # our requests.
+        self._project_last_modified: Optional[ProjectsLastModifiedCache] = None
         self.session = requests.Session()
         # Declaring the USER_AGENT is more sysadm-friendly for the forge we list
         self.session.headers.update(
             {"Accept": "application/json", "User-Agent": USER_AGENT}
         )
+        self.incremental = incremental
+
+    def state_from_dict(self, d: Dict[str, Dict[str, Any]]) -> SourceForgeListerState:
+        subsitemaps = {
+            k: datetime.date.fromisoformat(v)
+            for k, v in d.get("subsitemap_last_modified", {}).items()
+        }
+        empty_projects = {
+            k: datetime.date.fromisoformat(v)
+            for k, v in d.get("empty_projects", {}).items()
+        }
+        return SourceForgeListerState(
+            subsitemap_last_modified=subsitemaps, empty_projects=empty_projects
+        )
+
+    def state_to_dict(self, state: SourceForgeListerState) -> Dict[str, Any]:
+        return {
+            "subsitemap_last_modified": {
+                k: v.isoformat() for k, v in state.subsitemap_last_modified.items()
+            },
+            "empty_projects": {
+                k: v.isoformat() for k, v in state.empty_projects.items()
+            },
+        }
+
+    def projects_last_modified(self) -> ProjectsLastModifiedCache:
+        if not self.incremental:
+            # No point in loading the previous results if we're doing a full run
+            return {}
+        if self._project_last_modified is not None:
+            return self._project_last_modified
+        # We know there will be at least that many origins, so use a large page
+        # size to keep the number of scheduler round-trips low.
+        stream = stream_results(
+            self.scheduler.get_listed_origins, self.lister_obj.id, limit=300_000
+        )
+        listed_origins = dict()
+        # Projects can have slashes in them if they're subprojects, but the
+        # mount point (last component) cannot.
+        url_match = re.compile(
+            r".*\.code\.sf\.net/(?P<namespace>[^/]+)/(?P<project>.+)/.*"
+        )
+        for origin in stream:
+            url = origin.url
+            match = url_match.match(url)
+            assert match is not None
+            matches = match.groupdict()
+            namespace = matches["namespace"]
+            project = matches["project"]
+            # "Last modified" dates are the same across all VCS (tools, even)
+            # within a project or subproject. An assertion here would be overkill.
+            last_modified = origin.last_update
+            assert last_modified is not None
+            listed_origins[(namespace, project)] = last_modified.date()
+
+        self._project_last_modified = listed_origins
+        return listed_origins

     @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
     def page_request(self, url, params) -> requests.Response:
@@ -126,11 +212,21 @@
         tree = ElementTree.fromstring(sitemap_contents)

         for subsitemap in tree.iterfind(f"{SITEMAP_XML_NAMESPACE}sitemap"):
-            # TODO use when adding incremental support
-            # last_modified = sub_sitemap.find(f"{SITEMAP_XML_NAMESPACE}lastmod")
+            last_modified_el = subsitemap.find(f"{SITEMAP_XML_NAMESPACE}lastmod")
+            assert last_modified_el is not None and last_modified_el.text is not None
+            last_modified = datetime.date.fromisoformat(last_modified_el.text)
             location = subsitemap.find(f"{SITEMAP_XML_NAMESPACE}loc")
-            assert location is not None
+            assert location is not None and location.text is not None
             sub_url = location.text
+
+            if self.incremental:
+                recorded_last_mod = self.state.subsitemap_last_modified.get(sub_url)
+                if recorded_last_mod == last_modified:
+                    # The entire subsitemap hasn't changed, so none of its
+                    # projects have either; skip it.
+                    continue
+
+            self.state.subsitemap_last_modified[sub_url] = last_modified

             subsitemap_contents = self.page_request(sub_url, {}).text
             subtree = ElementTree.fromstring(subsitemap_contents)
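
For reference, `url_match` above is what turns stored origin URLs back into `(namespace, project)` cache keys: subprojects keep their slash in the project part, while the tool mount point is dropped. A standalone check, using origin URLs taken from the test data:

```python
import re

url_match = re.compile(r".*\.code\.sf\.net/(?P<namespace>[^/]+)/(?P<project>.+)/.*")

m = url_match.match("git.code.sf.net/p/mramm/git")
assert m is not None
assert (m["namespace"], m["project"]) == ("p", "mramm")

# Subproject: "website" stays in the key, the mount point "code" is dropped.
m = url_match.match("svn.code.sf.net/p/backapps/website/code")
assert m is not None
assert (m["namespace"], m["project"]) == ("p", "backapps/website")
```
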
@@ -151,7 +247,7 @@
     def _get_pages_from_subsitemap(
         self, subtree: ElementTree.Element
     ) -> Iterator[SourceForgeListerPage]:
-        projects: Set[str] = set()
+        projects: Set[ProjectName] = set()
         for project_block in subtree.iterfind(f"{SITEMAP_XML_NAMESPACE}url"):
             last_modified_block = project_block.find(f"{SITEMAP_XML_NAMESPACE}lastmod")
             assert last_modified_block is not None
@@ -197,6 +293,28 @@
         self, namespace, project, last_modified
     ) -> SourceForgeListerPage:
         endpoint = PROJECT_API_URL_FORMAT.format(namespace=namespace, project=project)
+        empty_project_last_modified = self.state.empty_projects.get(endpoint)
+        if empty_project_last_modified is not None:
+            if last_modified == empty_project_last_modified.isoformat():
+                # Project has not changed, so it is still empty, meaning it has
+                # no VCS attached that we can archive.
+                logger.debug(f"Project {namespace}/{project} is still empty")
+                return []
+
+        if self.incremental:
+            expected = self.projects_last_modified().get((namespace, project))
+
+            if expected is not None:
+                if expected.isoformat() == last_modified:
+                    # Project has not changed
+                    logger.debug(f"Project {namespace}/{project} has not changed")
+                    return []
+                else:
+                    logger.debug(f"Project {namespace}/{project} was updated")
+            else:
+                msg = "New project during an incremental run: %s/%s"
+                logger.debug(msg, namespace, project)
+
         res = self.page_request(endpoint, {}).json()

         tools = res.get("tools")
@@ -221,4 +339,10 @@
             )
             hits.append(entry)

+        if not hits:
+            date = datetime.date.fromisoformat(last_modified)
+            self.state.empty_projects[endpoint] = date
+        else:
+            self.state.empty_projects.pop(endpoint, None)
+
         return hits
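
With the lister side complete, here is a usage sketch (assuming `scheduler` is a configured `SchedulerInterface` instance; everything else comes from the patch):

```python
from swh.lister.sourceforge.lister import SourceForgeLister

# Full run: lists everything, but still records state for later incremental runs.
lister = SourceForgeLister(scheduler=scheduler)
lister.run()

# Incremental run: unchanged subsitemaps, unchanged projects, and known-empty
# projects are skipped based on the persisted state and the listed origins.
lister = SourceForgeLister(scheduler=scheduler, incremental=True)
stats = lister.run()
print(stats.pages, stats.origins)
```
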
"svn.code.sf.net/p/os3dmodels/svn": ("svn", "2017-03-31"), + "git.code.sf.net/p/mramm/files": ("git", "2019-04-04"), + "git.code.sf.net/p/mramm/git": ("git", "2019-04-04"), + "svn.code.sf.net/p/mramm/svn": ("svn", "2019-04-04"), + "git.code.sf.net/p/mojunk/git": ("git", "2017-12-31"), + "git.code.sf.net/p/mojunk/git2": ("git", "2017-12-31"), + "svn.code.sf.net/p/mojunk/svn": ("svn", "2017-12-31"), + } + + def test_sourceforge_lister_full(swh_scheduler, requests_mock, datadir): """ Simulate a full listing of an artificially restricted sourceforge. @@ -96,20 +117,157 @@ # adobe and backapps itself have no repos. assert stats.pages == 4 assert stats.origins == 9 + expected_state = { + "subsitemap_last_modified": { + "https://sourceforge.net/allura_sitemap/sitemap-0.xml": "2021-03-18", + "https://sourceforge.net/allura_sitemap/sitemap-1.xml": "2021-03-18", + }, + "empty_projects": { + "https://sourceforge.net/rest/p/backapps": "2021-02-11", + "https://sourceforge.net/rest/adobe/adobexmp": "2017-10-17", + }, + } + assert lister.state_to_dict(lister.state) == expected_state - scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results - res = {o.url: (o.visit_type, str(o.last_update.date())) for o in scheduler_origins} - assert res == { - "svn.code.sf.net/p/backapps/website/code": ("svn", "2021-02-11"), - "git.code.sf.net/p/os3dmodels/git": ("git", "2017-03-31"), - "svn.code.sf.net/p/os3dmodels/svn": ("svn", "2017-03-31"), - "git.code.sf.net/p/mramm/files": ("git", "2019-04-04"), - "git.code.sf.net/p/mramm/git": ("git", "2019-04-04"), - "svn.code.sf.net/p/mramm/svn": ("svn", "2019-04-04"), - "git.code.sf.net/p/mojunk/git": ("git", "2017-12-31"), - "git.code.sf.net/p/mojunk/git2": ("git", "2017-12-31"), - "svn.code.sf.net/p/mojunk/svn": ("svn", "2017-12-31"), + _check_listed_origins(lister, swh_scheduler) + + +def test_sourceforge_lister_incremental(swh_scheduler, requests_mock, datadir, mocker): + """ + Simulate an incremental listing of an artificially restricted sourceforge. + Same dataset as the full run, because it's enough to validate the different cases. 
+ """ + lister = SourceForgeLister(scheduler=swh_scheduler, incremental=True) + + requests_mock.get( + MAIN_SITEMAP_URL, + text=get_main_sitemap(datadir), + additional_matcher=_check_request_headers, + ) + + def not_called(request, *args, **kwargs): + raise AssertionError(f"Should not have been called: '{request.url}'") + + requests_mock.get( + "https://sourceforge.net/allura_sitemap/sitemap-0.xml", + text=get_subsitemap_0(datadir), + additional_matcher=_check_request_headers, + ) + requests_mock.get( + "https://sourceforge.net/allura_sitemap/sitemap-1.xml", + text=not_called, + additional_matcher=_check_request_headers, + ) + + def filtered_get_project_json(request, context): + # These projects should not be requested again + assert URLS_MATCHER[request.url] not in {"adobe", "mojunk"} + return get_project_json(datadir, request, context) + + requests_mock.get( + re.compile("https://sourceforge.net/rest/.*"), + json=filtered_get_project_json, + additional_matcher=_check_request_headers, + ) + + faked_listed_origins = [ + # mramm: changed + ListedOrigin( + lister_id=lister.lister_obj.id, + visit_type="git", + url="git.code.sf.net/p/mramm/files", + last_update=iso8601.parse_date("2019-01-01"), + ), + ListedOrigin( + lister_id=lister.lister_obj.id, + visit_type="git", + url="git.code.sf.net/p/mramm/git", + last_update=iso8601.parse_date("2019-01-01"), + ), + ListedOrigin( + lister_id=lister.lister_obj.id, + visit_type="svn", + url="svn.code.sf.net/p/mramm/svn", + last_update=iso8601.parse_date("2019-01-01"), + ), + # stayed the same, even though its subsitemap has changed + ListedOrigin( + lister_id=lister.lister_obj.id, + visit_type="git", + url="git.code.sf.net/p/os3dmodels/git", + last_update=iso8601.parse_date("2017-03-31"), + ), + ListedOrigin( + lister_id=lister.lister_obj.id, + visit_type="svn", + url="svn.code.sf.net/p/os3dmodels/svn", + last_update=iso8601.parse_date("2017-03-31"), + ), + # others: stayed the same, should be skipped + ListedOrigin( + lister_id=lister.lister_obj.id, + visit_type="git", + url="git.code.sf.net/p/mojunk/git", + last_update=iso8601.parse_date("2017-12-31"), + ), + ListedOrigin( + lister_id=lister.lister_obj.id, + visit_type="git", + url="git.code.sf.net/p/mojunk/git2", + last_update=iso8601.parse_date("2017-12-31"), + ), + ListedOrigin( + lister_id=lister.lister_obj.id, + visit_type="svn", + url="svn.code.sf.net/p/mojunk/svn", + last_update=iso8601.parse_date("2017-12-31"), + ), + ListedOrigin( + lister_id=lister.lister_obj.id, + visit_type="svn", + url="svn.code.sf.net/p/backapps/website/code", + last_update=iso8601.parse_date("2021-02-11"), + ), + ] + swh_scheduler.record_listed_origins(faked_listed_origins) + + to_date = datetime.date.fromisoformat + faked_state = SourceForgeListerState( + subsitemap_last_modified={ + # changed + "https://sourceforge.net/allura_sitemap/sitemap-0.xml": to_date( + "2021-02-18" + ), + # stayed the same + "https://sourceforge.net/allura_sitemap/sitemap-1.xml": to_date( + "2021-03-18" + ), + }, + empty_projects={ + "https://sourceforge.net/rest/p/backapps": to_date("2020-02-11"), + "https://sourceforge.net/rest/adobe/adobexmp": to_date("2017-10-17"), + }, + ) + lister.state = faked_state + + stats = lister.run() + # - mramm (3 repos), # changed + assert stats.pages == 1 + assert stats.origins == 3 + expected_state = { + "subsitemap_last_modified": { + "https://sourceforge.net/allura_sitemap/sitemap-0.xml": "2021-03-18", + "https://sourceforge.net/allura_sitemap/sitemap-1.xml": "2021-03-18", + }, + "empty_projects": { + 
"https://sourceforge.net/rest/p/backapps": "2021-02-11", # changed + "https://sourceforge.net/rest/adobe/adobexmp": "2017-10-17", + }, } + assert lister.state_to_dict(lister.state) == expected_state + + # origins have been updated + _check_listed_origins(lister, swh_scheduler) def test_sourceforge_lister_retry(swh_scheduler, requests_mock, mocker, datadir):