diff --git a/swh/lister/cgit/lister.py b/swh/lister/cgit/lister.py
--- a/swh/lister/cgit/lister.py
+++ b/swh/lister/cgit/lister.py
@@ -28,12 +28,18 @@
This lister will retrieve the list of published git repositories by
parsing the HTML page(s) of the index retrieved at `url`.
- For each found git repository, a query is made at the given url found
- in this index to gather published "Clone" URLs to be used as origin
- URL for that git repo.
+ The lister currently defines 2 listing behaviors:
+
+ - If the `base_git_url` is provided, the listed origin urls are computed from the
+ base git url and the link listed in the main index page (resulting in fewer
+ queries than the 2nd behavior below). This is expected to be the main behavior
+ used.
+
+ - Otherwise, with no `base_git_url`, for each found git repository, a query is made
+ at the given url found in this index to gather published "Clone" URLs to be used
+ as origin URL for that git repo. If several "Clone" urls are provided, prefer the
+ http/https one, if any, otherwise fallback to the first one.
- If several "Clone" urls are provided, prefer the http/https one, if
- any, otherwise fallback to the first one.
"""
LISTER_NAME = "cgit"
@@ -44,14 +50,17 @@
url: str,
instance: Optional[str] = None,
credentials: Optional[CredentialsType] = None,
+ base_git_url: Optional[str] = None,
):
"""Lister class for CGit repositories.
Args:
- url (str): main URL of the CGit instance, i.e. url of the index
+ url: main URL of the CGit instance, i.e. url of the index
of published git repositories on this instance.
- instance (str): Name of cgit instance. Defaults to url's hostname
+ instance: Name of cgit instance. Defaults to url's hostname
if unset.
+ base_git_url: Optional base git url from which origin urls are
+ computed, instead of querying each repository page.
"""
if not instance:
@@ -66,6 +75,7 @@
self.session.headers.update(
{"Accept": "application/html", "User-Agent": USER_AGENT}
)
+ self.base_git_url = base_git_url
def _get_and_parse(self, url: str) -> BeautifulSoup:
"""Get the given url and parse the retrieved HTML using BeautifulSoup"""
@@ -87,7 +97,19 @@
for tr in bs_idx.find("div", {"class": "content"}).find_all(
"tr", {"class": ""}
):
- url = urljoin(self.url, tr.find("a")["href"])
+ repository_link = tr.find("a")["href"]
+ repo_url = None
+ git_url = None
+
+ if self.base_git_url: # mapping provided
+ repo_url = urljoin(self.url, repository_link)
+ # derive the git url: swap the cgit url prefix for base_git_url
+ git_url = repo_url.replace(self.url, self.base_git_url)
+ else:
+ # we compute the git detailed page url from which we will retrieve
+ # the git url (cf. self.get_origins_from_page)
+ repo_url = urljoin(self.url, repository_link)
+
span = tr.find("span", {"class": re.compile("age-")})
if span:
last_updated_date = span["title"]
@@ -95,7 +117,11 @@
last_updated_date = None
page_results.append(
- {"url": url, "last_updated_date": last_updated_date}
+ {
+ "url": repo_url,
+ "git_url": git_url,
+ "last_updated_date": last_updated_date,
+ }
)
yield page_results
@@ -117,8 +143,10 @@
"""Convert a page of cgit repositories into a list of ListedOrigins."""
assert self.lister_obj.id is not None
- for repository in repositories:
- origin_url = self._get_origin_from_repository_url(repository["url"])
+ for repo in repositories:
+ origin_url = repo["git_url"] or self._get_origin_from_repository_url(
+ repo["url"]
+ )
if origin_url is None:
continue
@@ -126,7 +154,7 @@
lister_id=self.lister_obj.id,
url=origin_url,
visit_type="git",
- last_update=_parse_last_updated_date(repository),
+ last_update=_parse_last_updated_date(repo),
)
def _get_origin_from_repository_url(self, repository_url: str) -> Optional[str]:
diff --git a/swh/lister/cgit/tasks.py b/swh/lister/cgit/tasks.py
--- a/swh/lister/cgit/tasks.py
+++ b/swh/lister/cgit/tasks.py
@@ -10,9 +10,13 @@
@shared_task(name=__name__ + ".CGitListerTask")
-def list_cgit(url: str, instance: Optional[str] = None,) -> Dict[str, str]:
+def list_cgit(
+ url: str, instance: Optional[str] = None, base_git_url: Optional[str] = None
+) -> Dict[str, str]:
"""Lister task for CGit instances"""
- lister = CGitLister.from_configfile(url=url, instance=instance)
+ lister = CGitLister.from_configfile(
+ url=url, instance=instance, base_git_url=base_git_url
+ )
return lister.run().dict()
diff --git a/swh/lister/cgit/tests/data/https_git.eclipse.org/c b/swh/lister/cgit/tests/data/https_git.eclipse.org/c
new file mode 100644
--- /dev/null
+++ b/swh/lister/cgit/tests/data/https_git.eclipse.org/c
@@ -0,0 +1,42 @@
+
+
+
+ Eclipse Git repositories
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Back to the top
+
+
+
diff --git a/swh/lister/cgit/tests/test_lister.py b/swh/lister/cgit/tests/test_lister.py
--- a/swh/lister/cgit/tests/test_lister.py
+++ b/swh/lister/cgit/tests/test_lister.py
@@ -196,3 +196,31 @@
lister = CGitLister.from_configfile()
assert lister.scheduler is not None
assert lister.credentials is not None
+
+
+def test_lister_cgit_with_base_git_url(requests_mock_datadir, swh_scheduler):
+ """With base git url provided, listed urls should be the computed origin urls
+
+ """
+ url = "https://git.eclipse.org/c"
+ base_git_url = "https://eclipse.org/r"
+ lister_cgit = CGitLister(swh_scheduler, url=url, base_git_url=base_git_url,)
+
+ stats = lister_cgit.run()
+
+ expected_nb_origins = 5 # 5 "sublevel-section" class in html
+ assert stats == ListerStats(pages=1, origins=expected_nb_origins)
+
+ # test page parsing
+ scheduler_origins = swh_scheduler.get_listed_origins(
+ lister_cgit.lister_obj.id
+ ).results
+ assert len(scheduler_origins) == expected_nb_origins
+
+ # test listed repositories
+ for listed_origin in scheduler_origins:
+ assert listed_origin.visit_type == "git"
+ assert listed_origin.url.startswith(base_git_url)
+ assert (
+ listed_origin.url.startswith(url) is False
+ ), f"url should be mapped to {base_git_url}"
diff --git a/swh/lister/cgit/tests/test_tasks.py b/swh/lister/cgit/tests/test_tasks.py
--- a/swh/lister/cgit/tests/test_tasks.py
+++ b/swh/lister/cgit/tests/test_tasks.py
@@ -22,7 +22,7 @@
lister.from_configfile.return_value = lister
lister.run.return_value = ListerStats(pages=10, origins=500)
- kwargs = dict(url="https://git.kernel.org/", instance="kernel")
+ kwargs = dict(url="https://git.kernel.org/", instance="kernel", base_git_url=None)
res = swh_scheduler_celery_app.send_task(
"swh.lister.cgit.tasks.CGitListerTask", kwargs=kwargs,