diff --git a/swh/lister/sourceforge/lister.py b/swh/lister/sourceforge/lister.py
--- a/swh/lister/sourceforge/lister.py
+++ b/swh/lister/sourceforge/lister.py
@@ -2,24 +2,25 @@
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-from dataclasses import dataclass
+from dataclasses import dataclass, field
import datetime
from enum import Enum
import logging
import re
-from typing import Iterator, List, Set
+from typing import Any, Dict, Iterator, List, Optional, Set
from xml.etree import ElementTree
import iso8601
import requests
from tenacity.before_sleep import before_sleep_log
+from swh.core.api.classes import stream_results
from swh.lister.utils import throttling_retry
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
from .. import USER_AGENT
-from ..pattern import StatelessLister
+from ..pattern import Lister
logger = logging.getLogger(__name__)
@@ -45,6 +46,28 @@
last_modified: datetime.date
+SubSitemapName = str
+ProjectName = str
+LastModified = datetime.date
+
+
+@dataclass
+class SourceForgeListerState:
+ """Current state of the SourceForge lister in incremental runs"""
+
+ # If the subsitemap does not exist, we assume a full run of this subsitemap
+ # is needed. If the date is the same, we skip the subsitemap, otherwise we
+ # request the subsitemap and look up every project's "last modified" date
+ # to compare against `ListedOrigins` from the database.
+ subsitemap_last_modified: Dict[SubSitemapName, LastModified] = field(
+ default_factory=dict
+ )
+    # Some projects (not the majority, but still meaningful) have no VCS for
+    # us to archive. We need to remember their "last modified" date so we
+    # don't keep querying them uselessly every time.
+ empty_projects: Dict[ProjectName, LastModified] = field(default_factory=dict)
+
+
SourceForgeListerPage = List[SourceForgeListerEntry]
MAIN_SITEMAP_URL = "https://sourceforge.net/allura_sitemap/sitemap.xml"
@@ -72,7 +95,7 @@
)
-class SourceForgeLister(StatelessLister[SourceForgeListerPage]):
+class SourceForgeLister(Lister[SourceForgeListerState, SourceForgeListerPage]):
"""List origins from the "SourceForge" forge.
"""
@@ -80,16 +103,57 @@
# Part of the lister API, that identifies this lister
LISTER_NAME = "sourceforge"
- def __init__(self, scheduler: SchedulerInterface):
+ def __init__(self, scheduler: SchedulerInterface, incremental: bool = False):
super().__init__(
scheduler=scheduler, url="https://sourceforge.net", instance="main"
)
+ # Will hold the currently saved listed origins to compare against our
+ # requests.
+ self._listed_origins: Optional[List[ListedOrigin]] = None
self.session = requests.Session()
# Declare the USER_AGENT is more sysadm-friendly for the forge we list
self.session.headers.update(
{"Accept": "application/json", "User-Agent": USER_AGENT}
)
+ self.incremental = incremental
+
+ def state_from_dict(self, d: Dict[str, Dict[str, Any]]) -> SourceForgeListerState:
+ subsitemaps = {
+ k: datetime.date.fromisoformat(v)
+ for k, v in d.get("subsitemap_last_modified", {}).items()
+ }
+ empty_projects = {
+ k: datetime.date.fromisoformat(v)
+ for k, v in d.get("empty_projects", {}).items()
+ }
+ return SourceForgeListerState(
+ subsitemap_last_modified=subsitemaps, empty_projects=empty_projects
+ )
+
+ def state_to_dict(self, state: SourceForgeListerState) -> Dict[str, Any]:
+ return {
+ "subsitemap_last_modified": {
+ k: v.isoformat() for k, v in state.subsitemap_last_modified.items()
+ },
+ "empty_projects": {
+ k: v.isoformat() for k, v in state.empty_projects.items()
+ },
+ }
+
+ def listed_origins(self) -> List[ListedOrigin]:
+ if not self.incremental:
+ # No point in loading the previous results if we're doing a full run
+ return []
+ if self._listed_origins is not None:
+ return self._listed_origins
+ # We know there will be at least that many origins
+ stream = stream_results(
+ self.scheduler.get_listed_origins, self.lister_obj.id, limit=300_000
+ )
+ # Sort the results to help branch prediction when looking for urls
+ self._listed_origins = list(sorted(stream, key=lambda o: o.url))
+ return self._listed_origins
@throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
def page_request(self, url, params) -> requests.Response:
@@ -126,11 +190,20 @@
tree = ElementTree.fromstring(sitemap_contents)
for subsitemap in tree.iterfind(f"{SITEMAP_XML_NAMESPACE}sitemap"):
- # TODO use when adding incremental support
- # last_modified = sub_sitemap.find(f"{SITEMAP_XML_NAMESPACE}lastmod")
+ last_modified_el = subsitemap.find(f"{SITEMAP_XML_NAMESPACE}lastmod")
+ assert last_modified_el is not None and last_modified_el.text is not None
+ last_modified = datetime.date.fromisoformat(last_modified_el.text)
location = subsitemap.find(f"{SITEMAP_XML_NAMESPACE}loc")
- assert location is not None
+ assert location is not None and location.text is not None
sub_url = location.text
+
+ recorded_last_mod = self.state.subsitemap_last_modified.get(sub_url)
+ if recorded_last_mod == last_modified:
+ # The entire subsitemap hasn't changed, so none of its projects
+ # have either, skip it.
+ continue
+
+ self.state.subsitemap_last_modified[sub_url] = last_modified
subsitemap_contents = self.page_request(sub_url, {}).text
subtree = ElementTree.fromstring(subsitemap_contents)
@@ -151,7 +224,7 @@
def _get_pages_from_subsitemap(
self, subtree: ElementTree.Element
) -> Iterator[SourceForgeListerPage]:
- projects: Set[str] = set()
+ projects: Set[ProjectName] = set()
for project_block in subtree.iterfind(f"{SITEMAP_XML_NAMESPACE}url"):
last_modified_block = project_block.find(f"{SITEMAP_XML_NAMESPACE}lastmod")
assert last_modified_block is not None
@@ -197,6 +270,30 @@
self, namespace, project, last_modified
) -> SourceForgeListerPage:
endpoint = PROJECT_API_URL_FORMAT.format(namespace=namespace, project=project)
+ empty_project_last_modified = self.state.empty_projects.get(endpoint)
+ if empty_project_last_modified is not None:
+ if last_modified == empty_project_last_modified.isoformat():
+ # Project has not changed, so is still empty
+ logger.debug(f"Project {namespace}/{project} is still empty")
+ return []
+
+ url_match = f".code.sf.net/{namespace}/{project}/"
+ origins = [o for o in self.listed_origins() if url_match in o.url]
+
+ if origins:
+ expected_last_modified = origins[0].last_update
+ # Sanity check
+ assert all(o.last_update == expected_last_modified for o in origins)
+ if expected_last_modified == last_modified:
+ # Project has not changed
+ logger.debug(f"Project {namespace}/{project} has not changed")
+ return []
+ else:
+ logger.debug(f"Project {namespace}/{project} was updated")
+ elif self.incremental:
+ msg = "Project %s/%s has no listed origins, but is not stored as empty"
+ logger.warning(msg, namespace, project)
+
res = self.page_request(endpoint, {}).json()
tools = res.get("tools")
@@ -221,4 +318,8 @@
)
hits.append(entry)
+ if not hits:
+ date = datetime.date.fromisoformat(last_modified)
+ self.state.empty_projects[endpoint] = date
+
return hits
diff --git a/swh/lister/sourceforge/tests/data/subsitemap-0.xml b/swh/lister/sourceforge/tests/data/subsitemap-0.xml
--- a/swh/lister/sourceforge/tests/data/subsitemap-0.xml
+++ b/swh/lister/sourceforge/tests/data/subsitemap-0.xml
@@ -55,4 +55,15 @@
2017-10-17
daily
+
+
+ https://sourceforge.net/projects/backapps/files/
+ 2021-02-11
+ daily
+
+
+ https://sourceforge.net/p/backapps/tickets/
+ 2021-02-11
+ daily
+
diff --git a/swh/lister/sourceforge/tests/test_lister.py b/swh/lister/sourceforge/tests/test_lister.py
--- a/swh/lister/sourceforge/tests/test_lister.py
+++ b/swh/lister/sourceforge/tests/test_lister.py
@@ -2,11 +2,13 @@
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+import datetime
import functools
import json
from pathlib import Path
import re
+from iso8601 import iso8601
import pytest
from requests.exceptions import HTTPError
@@ -15,9 +17,12 @@
MAIN_SITEMAP_URL,
PROJECT_API_URL_FORMAT,
SourceForgeLister,
+ SourceForgeListerState,
)
# Mapping of project name to namespace
+from swh.scheduler.model import ListedOrigin
+
TEST_PROJECTS = {
"adobexmp": "adobe",
"backapps": "p",
@@ -57,6 +62,22 @@
return request.headers.get("User-Agent") == USER_AGENT
+def _check_listed_origins(lister, swh_scheduler):
+ scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
+ res = {o.url: (o.visit_type, str(o.last_update.date())) for o in scheduler_origins}
+ assert res == {
+ "svn.code.sf.net/p/backapps/website/code": ("svn", "2021-02-11"),
+ "git.code.sf.net/p/os3dmodels/git": ("git", "2017-03-31"),
+ "svn.code.sf.net/p/os3dmodels/svn": ("svn", "2017-03-31"),
+ "git.code.sf.net/p/mramm/files": ("git", "2019-04-04"),
+ "git.code.sf.net/p/mramm/git": ("git", "2019-04-04"),
+ "svn.code.sf.net/p/mramm/svn": ("svn", "2019-04-04"),
+ "git.code.sf.net/p/mojunk/git": ("git", "2017-12-31"),
+ "git.code.sf.net/p/mojunk/git2": ("git", "2017-12-31"),
+ "svn.code.sf.net/p/mojunk/svn": ("svn", "2017-12-31"),
+ }
+
+
def test_sourceforge_lister_full(swh_scheduler, requests_mock, datadir):
"""
Simulate a full listing of an artificially restricted sourceforge.
@@ -96,20 +117,158 @@
# adobe and backapps itself have no repos.
assert stats.pages == 4
assert stats.origins == 9
+ expected_state = {
+ "subsitemap_last_modified": {
+ "https://sourceforge.net/allura_sitemap/sitemap-0.xml": "2021-03-18",
+ "https://sourceforge.net/allura_sitemap/sitemap-1.xml": "2021-03-18",
+ },
+ "empty_projects": {
+ "https://sourceforge.net/rest/p/backapps": "2021-02-11",
+ "https://sourceforge.net/rest/adobe/adobexmp": "2017-10-17",
+ },
+ }
+ assert lister.state_to_dict(lister.state) == expected_state
- scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
- res = {o.url: (o.visit_type, str(o.last_update.date())) for o in scheduler_origins}
- assert res == {
- "svn.code.sf.net/p/backapps/website/code": ("svn", "2021-02-11"),
- "git.code.sf.net/p/os3dmodels/git": ("git", "2017-03-31"),
- "svn.code.sf.net/p/os3dmodels/svn": ("svn", "2017-03-31"),
- "git.code.sf.net/p/mramm/files": ("git", "2019-04-04"),
- "git.code.sf.net/p/mramm/git": ("git", "2019-04-04"),
- "svn.code.sf.net/p/mramm/svn": ("svn", "2019-04-04"),
- "git.code.sf.net/p/mojunk/git": ("git", "2017-12-31"),
- "git.code.sf.net/p/mojunk/git2": ("git", "2017-12-31"),
- "svn.code.sf.net/p/mojunk/svn": ("svn", "2017-12-31"),
+ _check_listed_origins(lister, swh_scheduler)
+
+
+def test_sourceforge_lister_incremental(swh_scheduler, requests_mock, datadir, mocker):
+ """
+ Simulate an incremental listing of an artificially restricted sourceforge.
+ Same dataset as the full run, because it's enough to validate the different cases.
+ """
+ lister = SourceForgeLister(scheduler=swh_scheduler)
+
+ requests_mock.get(
+ MAIN_SITEMAP_URL,
+ text=get_main_sitemap(datadir),
+ additional_matcher=_check_request_headers,
+ )
+
+ def not_called(request, *args, **kwargs):
+ raise AssertionError(f"Should not have been called: '{request.url}'")
+
+ requests_mock.get(
+ "https://sourceforge.net/allura_sitemap/sitemap-0.xml",
+ text=get_subsitemap_0(datadir),
+ additional_matcher=_check_request_headers,
+ )
+ requests_mock.get(
+ "https://sourceforge.net/allura_sitemap/sitemap-1.xml",
+ text=not_called,
+ additional_matcher=_check_request_headers,
+ )
+
+ def filtered_get_project_json(request, context):
+ # These projects should not be requested again
+ assert URLS_MATCHER[request.url] not in {"adobe", "mojunk"}
+ return get_project_json(datadir, request, context)
+
+ requests_mock.get(
+ re.compile("https://sourceforge.net/rest/.*"),
+ json=filtered_get_project_json,
+ additional_matcher=_check_request_headers,
+ )
+
+ faked_listed_origins = [
+        # os3dmodels: changed
+ ListedOrigin(
+ lister_id=lister.lister_obj.id,
+ visit_type="git",
+ url="git.code.sf.net/p/os3dmodels/git",
+ last_update=iso8601.parse_date("2017-01-01"),
+ ),
+ ListedOrigin(
+ lister_id=lister.lister_obj.id,
+ visit_type="svn",
+ url="svn.code.sf.net/p/os3dmodels/svn",
+ last_update=iso8601.parse_date("2017-01-01"),
+ ),
+ # mramm: changed
+ ListedOrigin(
+ lister_id=lister.lister_obj.id,
+ visit_type="git",
+ url="git.code.sf.net/p/mramm/files",
+ last_update=iso8601.parse_date("2019-01-01"),
+ ),
+ ListedOrigin(
+ lister_id=lister.lister_obj.id,
+ visit_type="git",
+ url="git.code.sf.net/p/mramm/git",
+ last_update=iso8601.parse_date("2019-01-01"),
+ ),
+ ListedOrigin(
+ lister_id=lister.lister_obj.id,
+ visit_type="svn",
+ url="svn.code.sf.net/p/mramm/svn",
+ last_update=iso8601.parse_date("2019-01-01"),
+ ),
+ # others: stayed the same, should be skipped
+ ListedOrigin(
+ lister_id=lister.lister_obj.id,
+ visit_type="git",
+ url="git.code.sf.net/p/mojunk/git",
+ last_update=iso8601.parse_date("2017-12-31"),
+ ),
+ ListedOrigin(
+ lister_id=lister.lister_obj.id,
+ visit_type="git",
+ url="git.code.sf.net/p/mojunk/git2",
+ last_update=iso8601.parse_date("2017-12-31"),
+ ),
+ ListedOrigin(
+ lister_id=lister.lister_obj.id,
+ visit_type="svn",
+ url="svn.code.sf.net/p/mojunk/svn",
+ last_update=iso8601.parse_date("2017-12-31"),
+ ),
+ ListedOrigin(
+ lister_id=lister.lister_obj.id,
+ visit_type="svn",
+ url="svn.code.sf.net/p/backapps/website/code",
+ last_update=iso8601.parse_date("2021-02-11"),
+ ),
+ ]
+ swh_scheduler.record_listed_origins(faked_listed_origins)
+
+ to_date = datetime.date.fromisoformat
+ faked_state = SourceForgeListerState(
+ subsitemap_last_modified={
+ # changed
+ "https://sourceforge.net/allura_sitemap/sitemap-0.xml": to_date(
+ "2021-02-18"
+ ),
+ # stayed the same
+ "https://sourceforge.net/allura_sitemap/sitemap-1.xml": to_date(
+ "2021-03-18"
+ ),
+ },
+ empty_projects={
+ "https://sourceforge.net/rest/p/backapps": to_date("2020-02-11"),
+ "https://sourceforge.net/rest/adobe/adobexmp": to_date("2017-10-17"),
+ },
+ )
+ lister.state = faked_state
+
+ stats = lister.run()
+    # - os3dmodels (2 repos): changed
+    # - mramm (3 repos): changed
+ assert stats.pages == 2
+ assert stats.origins == 5
+ expected_state = {
+ "subsitemap_last_modified": {
+ "https://sourceforge.net/allura_sitemap/sitemap-0.xml": "2021-03-18",
+ "https://sourceforge.net/allura_sitemap/sitemap-1.xml": "2021-03-18",
+ },
+ "empty_projects": {
+ "https://sourceforge.net/rest/p/backapps": "2021-02-11", # changed
+ "https://sourceforge.net/rest/adobe/adobexmp": "2017-10-17",
+ },
}
+ assert lister.state_to_dict(lister.state) == expected_state
+
+ # origins have been updated
+ _check_listed_origins(lister, swh_scheduler)
def test_sourceforge_lister_retry(swh_scheduler, requests_mock, mocker, datadir):