diff --git a/swh/lister/gitlab/lister.py b/swh/lister/gitlab/lister.py --- a/swh/lister/gitlab/lister.py +++ b/swh/lister/gitlab/lister.py @@ -4,6 +4,7 @@ # See top-level LICENSE file for more information from dataclasses import asdict, dataclass +from datetime import datetime, timezone import logging import random from typing import Any, Dict, Iterator, Optional, Tuple @@ -31,9 +32,8 @@ class GitLabListerState: """State of the GitLabLister""" - last_seen_next_link: Optional[str] = None - """Last link header (not visited yet) during an incremental pass - + last_listing_date: Optional[str] = None + """Last date when listing started during an incremental pass """ @@ -117,6 +117,7 @@ self.incremental = incremental self.last_page: Optional[str] = None self.per_page = 100 + self.listing_date = datetime.now(timezone.utc).isoformat() self.session = requests.Session() self.session.headers.update( @@ -133,7 +134,7 @@ self.session.headers["Authorization"] = f"Bearer {api_token}" def state_from_dict(self, d: Dict[str, Any]) -> GitLabListerState: - return GitLabListerState(**d) + return GitLabListerState(last_listing_date=d.get("last_listing_date")) def state_to_dict(self, state: GitLabListerState) -> Dict[str, Any]: return asdict(state) @@ -188,14 +189,12 @@ } if id_after is not None: parameters["id_after"] = str(id_after) + if self.incremental and self.state and self.state.last_listing_date: + parameters["last_activity_after"] = self.state.last_listing_date return f"{self.url}/projects?{urlencode(parameters)}" def get_pages(self) -> Iterator[PageResult]: - next_page: Optional[str] - if self.incremental and self.state and self.state.last_seen_next_link: - next_page = self.state.last_seen_next_link - else: - next_page = self.page_url() + next_page = self.page_url() while next_page: self.last_page = next_page @@ -217,49 +216,12 @@ last_update=iso8601.parse_date(repo["last_activity_at"]), ) - def commit_page(self, page_result: PageResult) -> None: - """Update currently stored state using the latest listed "next" page if relevant. - - Relevancy is determined by the next_page link whose 'page' id must be strictly - superior to the currently stored one. - - Note: this is a noop for full listing mode - - """ - if self.incremental: - # link: https://${project-api}/?...&page=2x... - next_page = page_result.next_page - if not next_page and self.last_page: - next_page = self.last_page - - if next_page: - id_after = _parse_id_after(next_page) - previous_next_page = self.state.last_seen_next_link - previous_id_after = _parse_id_after(previous_next_page) - - if previous_next_page is None or ( - previous_id_after and id_after and previous_id_after < id_after - ): - self.state.last_seen_next_link = next_page - def finalize(self) -> None: """finalize the lister state when relevant (see `fn:commit_page` for details) Note: this is a noop for full listing mode """ - next_page = self.state.last_seen_next_link - if self.incremental and next_page: - # link: https://${project-api}/?...&page=2x... - next_id_after = _parse_id_after(next_page) - scheduler_state = self.get_state_from_scheduler() - previous_next_id_after = _parse_id_after( - scheduler_state.last_seen_next_link - ) - - if (not previous_next_id_after and next_id_after) or ( - previous_next_id_after - and next_id_after - and previous_next_id_after < next_id_after - ): - self.updated = True + if self.incremental: + self.state.last_listing_date = self.listing_date + self.updated = True diff --git a/swh/lister/gitlab/tests/data/https_gite.lirmm.fr/api_response_page1.json b/swh/lister/gitlab/tests/data/https_gite.lirmm.fr/api_response_page1.json --- a/swh/lister/gitlab/tests/data/https_gite.lirmm.fr/api_response_page1.json +++ b/swh/lister/gitlab/tests/data/https_gite.lirmm.fr/api_response_page1.json @@ -6,7 +6,7 @@ "description": "Miscellaneous utils that are commonly used in multiple projects.", "forks_count": 0, "http_url_to_repo": "https://gite.lirmm.fr/yuquan/roboticsutils.git", - "id": 4456, + "id": 1, "last_activity_at": "2021-01-14T11:32:50.672Z", "name": "RoboticsUtils", "name_with_namespace": "Wang Yuquan / RoboticsUtils", @@ -26,7 +26,7 @@ "description": "", "forks_count": 0, "http_url_to_repo": "https://gite.lirmm.fr/constraint-acquisition-team/pacq.git", - "id": 4444, + "id": 2, "last_activity_at": "2020-12-15T19:43:53.678Z", "name": "PACQ", "name_with_namespace": "Constraint Acquisition Team / PACQ", @@ -39,4 +39,4 @@ "tag_list": [], "web_url": "https://gite.lirmm.fr/constraint-acquisition-team/pacq" } -] +] \ No newline at end of file diff --git a/swh/lister/gitlab/tests/data/https_gite.lirmm.fr/api_response_page2.json b/swh/lister/gitlab/tests/data/https_gite.lirmm.fr/api_response_page2.json --- a/swh/lister/gitlab/tests/data/https_gite.lirmm.fr/api_response_page2.json +++ b/swh/lister/gitlab/tests/data/https_gite.lirmm.fr/api_response_page2.json @@ -6,7 +6,7 @@ "description": "", "forks_count": 0, "http_url_to_repo": "https://gite.lirmm.fr/mgardeisen/citest.git", - "id": 4440, + "id": 3, "last_activity_at": "2021-01-21T14:37:31.022Z", "name": "CItest", "name_with_namespace": "Marine Gardeisen / CItest", @@ -26,7 +26,7 @@ "description": "Can be used to enforce the conventional commits specification on a package, generate a changelog recommend the next version to release\r\nSee see https://conventionalcommits.org", "forks_count": 0, "http_url_to_repo": "https://gite.lirmm.fr/pid/environments/conventional_commits.git", - "id": 4428, + "id": 4, "last_activity_at": "2021-01-08T11:11:54.178Z", "name": "conventional_commits", "name_with_namespace": "pid / environments / conventional_commits", diff --git a/swh/lister/gitlab/tests/data/https_gite.lirmm.fr/api_response_page3.json b/swh/lister/gitlab/tests/data/https_gite.lirmm.fr/api_response_page3.json --- a/swh/lister/gitlab/tests/data/https_gite.lirmm.fr/api_response_page3.json +++ b/swh/lister/gitlab/tests/data/https_gite.lirmm.fr/api_response_page3.json @@ -6,7 +6,7 @@ "description": "PID WRapper for urdfdom and urdfdom_headers project, provided by ROS but independent from ROS.", "forks_count": 0, "http_url_to_repo": "https://gite.lirmm.fr/rob-miscellaneous/wrappers/urdfdom.git", - "id": 4363, + "id": 5, "last_activity_at": "2020-11-19T08:56:18.573Z", "name": "urdfdom", "name_with_namespace": "rob-miscellaneous / wrappers / urdfdom", @@ -19,4 +19,4 @@ "tag_list": [], "web_url": "https://gite.lirmm.fr/rob-miscellaneous/wrappers/urdfdom" } -] +] \ No newline at end of file diff --git a/swh/lister/gitlab/tests/test_lister.py b/swh/lister/gitlab/tests/test_lister.py --- a/swh/lister/gitlab/tests/test_lister.py +++ b/swh/lister/gitlab/tests/test_lister.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2021 The Software Heritage developers +# Copyright (C) 2017-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -7,6 +7,7 @@ import logging from pathlib import Path from typing import Dict, List +from urllib.parse import quote import pytest from requests.status_codes import codes @@ -92,9 +93,9 @@ assert listed_origin.last_update is not None -def gitlab_page_response(datadir, instance: str, id_after: int) -> List[Dict]: +def gitlab_page_response(datadir, instance: str, page_num: int) -> List[Dict]: """Return list of repositories (out of test dataset)""" - datapath = Path(datadir, f"https_{instance}", f"api_response_page{id_after}.json") + datapath = Path(datadir, f"https_{instance}", f"api_response_page{page_num}.json") return json.loads(datapath.read_text()) if datapath.exists else [] @@ -142,9 +143,9 @@ url_page1 = lister.page_url() response1 = gitlab_page_response(datadir, instance, 1) - url_page2 = lister.page_url(2) + url_page2 = lister.page_url(response1[-1]["id"]) response2 = gitlab_page_response(datadir, instance, 2) - url_page3 = lister.page_url(3) + url_page3 = lister.page_url(response2[-1]["id"]) response3 = gitlab_page_response(datadir, instance, 3) requests_mock.get( @@ -162,11 +163,25 @@ expected_nb_origins = len(response1) + len(response2) assert listed_result == ListerStats(pages=2, origins=expected_nb_origins) - assert lister.state.last_seen_next_link == url_page2 + assert lister.state.last_listing_date == lister.listing_date lister2 = GitLabLister(swh_scheduler, url=url, instance=instance, incremental=True) - # Lister will start back at the last stop + url_page2 = lister2.page_url() + response2 = gitlab_page_response(datadir, instance, 2) + url_page3 = lister2.page_url(response2[-1]["id"]) + response3 = gitlab_page_response(datadir, instance, 3) + + # in a real world scenario, incremental lister will list repositories whose + # have been modified since last listing date + + last_activity_param = ( + f"last_activity_after={quote(lister2.state.last_listing_date)}" + ) + + assert last_activity_param in url_page2 + assert last_activity_param in url_page3 + requests_mock.get( url_page2, [{"json": response2, "headers": {"Link": f"<{url_page3}>; rel=next"}}], @@ -183,7 +198,7 @@ assert listed_result2 == ListerStats( pages=2, origins=len(response2) + len(response3) ) - assert lister2.state.last_seen_next_link == url_page3 + assert lister2.state.last_listing_date == lister2.listing_date assert lister.lister_obj.id == lister2.lister_obj.id scheduler_origins = lister2.scheduler.get_listed_origins(