diff --git a/swh/lister/cgit/lister.py b/swh/lister/cgit/lister.py --- a/swh/lister/cgit/lister.py +++ b/swh/lister/cgit/lister.py @@ -28,12 +28,19 @@ This lister will retrieve the list of published git repositories by parsing the HTML page(s) of the index retrieved at `url`. - For each found git repository, a query is made at the given url found - in this index to gather published "Clone" URLs to be used as origin - URL for that git repo. + The lister currently defines 2 listing behaviors: + + - If the `base_git_url` is provided, the listed origin urls are computed from the + base git url link and the one listed in the main listing page (resulting in fewer + HTTP queries than the 2nd behavior below). This is expected to be the main + deployed behavior. + + - Otherwise (with no `base_git_url`), for each found git repository listed, one + extra HTTP query is made at the given url found in the main listing page to gather + published "Clone" URLs to be used as origin URL for that git repo. If several + "Clone" urls are provided, prefer the http/https one, if any, otherwise fall back + to the first one. - If several "Clone" urls are provided, prefer the http/https one, if - any, otherwise fallback to the first one. """ LISTER_NAME = "cgit" @@ -44,14 +51,17 @@ url: str, instance: Optional[str] = None, credentials: Optional[CredentialsType] = None, + base_git_url: Optional[str] = None, ): """Lister class for CGit repositories. Args: - url (str): main URL of the CGit instance, i.e. url of the index + url: main URL of the CGit instance, i.e. url of the index of published git repositories on this instance. - instance (str): Name of cgit instance. Defaults to url's hostname + instance: Name of cgit instance. Defaults to url's hostname if unset. + base_git_url: Optional base git url from which the origin urls + are computed. 
""" if not instance: @@ -66,6 +76,7 @@ self.session.headers.update( {"Accept": "application/html", "User-Agent": USER_AGENT} ) + self.base_git_url = base_git_url def _get_and_parse(self, url: str) -> BeautifulSoup: """Get the given url and parse the retrieved HTML using BeautifulSoup""" @@ -87,7 +98,19 @@ for tr in bs_idx.find("div", {"class": "content"}).find_all( "tr", {"class": ""} ): - url = urljoin(self.url, tr.find("a")["href"]) + repository_link = tr.find("a")["href"] + repo_url = None + git_url = None + + base_url = urljoin(self.url, repository_link) + if self.base_git_url: # mapping provided + # computing git url + git_url = base_url.replace(self.url, self.base_git_url) + else: + # we compute the git detailed page url from which we will retrieve + # the git url (cf. self.get_origins_from_page) + repo_url = base_url + span = tr.find("span", {"class": re.compile("age-")}) if span: last_updated_date = span["title"] @@ -95,7 +118,11 @@ last_updated_date = None page_results.append( - {"url": url, "last_updated_date": last_updated_date} + { + "url": repo_url, + "git_url": git_url, + "last_updated_date": last_updated_date, + } ) yield page_results @@ -117,8 +144,10 @@ """Convert a page of cgit repositories into a list of ListedOrigins.""" assert self.lister_obj.id is not None - for repository in repositories: - origin_url = self._get_origin_from_repository_url(repository["url"]) + for repo in repositories: + origin_url = repo["git_url"] or self._get_origin_from_repository_url( + repo["url"] + ) if origin_url is None: continue @@ -126,7 +155,7 @@ lister_id=self.lister_obj.id, url=origin_url, visit_type="git", - last_update=_parse_last_updated_date(repository), + last_update=_parse_last_updated_date(repo), ) def _get_origin_from_repository_url(self, repository_url: str) -> Optional[str]: diff --git a/swh/lister/cgit/tasks.py b/swh/lister/cgit/tasks.py --- a/swh/lister/cgit/tasks.py +++ b/swh/lister/cgit/tasks.py @@ -10,9 +10,13 @@ @shared_task(name=__name__ + 
".CGitListerTask") -def list_cgit(url: str, instance: Optional[str] = None,) -> Dict[str, str]: +def list_cgit( + url: str, instance: Optional[str] = None, base_git_url: Optional[str] = None +) -> Dict[str, str]: """Lister task for CGit instances""" - lister = CGitLister.from_configfile(url=url, instance=instance) + lister = CGitLister.from_configfile( + url=url, instance=instance, base_git_url=base_git_url + ) return lister.run().dict() diff --git a/swh/lister/cgit/tests/data/https_git.baserock.org/cgit b/swh/lister/cgit/tests/data/https_git.baserock.org/cgit new file mode 100644 --- /dev/null +++ b/swh/lister/cgit/tests/data/https_git.baserock.org/cgit @@ -0,0 +1,33 @@ + + + + Lorry Depot + + + + + + +
+ + + + +
+ index
+ + +
+
+ + + + +
NameDescriptionIdleLinks
baserock/baserock/baserock-chroot.gitTools for working with Baserock chroots + 6 yearssummarylogtree
baserock/baserock/bsp-support.gitPrograms and configuration needed to support specific devices + 5 yearssummarylogtree
baserock/baserock/definitions.gitgitlab.com: baserock/definitions.git + 21 monthssummarylogtree
+
+
+ + diff --git a/swh/lister/cgit/tests/data/https_git.eclipse.org/c b/swh/lister/cgit/tests/data/https_git.eclipse.org/c new file mode 100644 --- /dev/null +++ b/swh/lister/cgit/tests/data/https_git.eclipse.org/c @@ -0,0 +1,42 @@ + + + + Eclipse Git repositories + + + + + + + + + +
+
+ +
+

+ Back to the top +

+ + diff --git a/swh/lister/cgit/tests/data/https_jff.email/cgit b/swh/lister/cgit/tests/data/https_jff.email/cgit new file mode 100644 --- /dev/null +++ b/swh/lister/cgit/tests/data/https_jff.email/cgit @@ -0,0 +1,33 @@ + + + + Jörgs Debian Repository + + + + + + +
+ + + + +
+ index
+ + +
+
+ + + + + + + +
NameDescriptionOwnerIdle
gnome-pie.gitDebian repo for gnome-piedebian@jff.email8 months
gs5.gitClone of gs5jff@jff.email2 years
libunistring.gitDebian repo for libunistringdebian@jff.email8 months
mailgraph.gitDebian repo for mailgraphdebian@jff.email2 years
mwc.gitDebian repo for mwcdebian@jff.email19 months
xtrkcad.gitDebian repo for xtrkcaddebian@jff.email5 months
+ +
+ + diff --git a/swh/lister/cgit/tests/test_lister.py b/swh/lister/cgit/tests/test_lister.py --- a/swh/lister/cgit/tests/test_lister.py +++ b/swh/lister/cgit/tests/test_lister.py @@ -196,3 +196,38 @@ lister = CGitLister.from_configfile() assert lister.scheduler is not None assert lister.credentials is not None + + +@pytest.mark.parametrize( + "url,base_git_url,expected_nb_origins", + [ + ("https://git.eclipse.org/c", "https://eclipse.org/r", 5), + ("https://git.baserock.org/cgit/", "https://git.baserock.org/git/", 3), + ("https://jff.email/cgit/", "git://jff.email/opt/git/", 6), + ], +) +def test_lister_cgit_with_base_git_url( + url, base_git_url, expected_nb_origins, requests_mock_datadir, swh_scheduler +): + """With base git url provided, listed urls should be the computed origin urls + + """ + lister_cgit = CGitLister(swh_scheduler, url=url, base_git_url=base_git_url,) + + stats = lister_cgit.run() + + assert stats == ListerStats(pages=1, origins=expected_nb_origins) + + # test page parsing + scheduler_origins = swh_scheduler.get_listed_origins( + lister_cgit.lister_obj.id + ).results + assert len(scheduler_origins) == expected_nb_origins + + # test listed repositories + for listed_origin in scheduler_origins: + assert listed_origin.visit_type == "git" + assert listed_origin.url.startswith(base_git_url) + assert ( + listed_origin.url.startswith(url) is False + ), f"url should be mapped to {base_git_url}" diff --git a/swh/lister/cgit/tests/test_tasks.py b/swh/lister/cgit/tests/test_tasks.py --- a/swh/lister/cgit/tests/test_tasks.py +++ b/swh/lister/cgit/tests/test_tasks.py @@ -22,7 +22,7 @@ lister.from_configfile.return_value = lister lister.run.return_value = ListerStats(pages=10, origins=500) - kwargs = dict(url="https://git.kernel.org/", instance="kernel") + kwargs = dict(url="https://git.kernel.org/", instance="kernel", base_git_url=None) res = swh_scheduler_celery_app.send_task( "swh.lister.cgit.tasks.CGitListerTask", kwargs=kwargs,