diff --git a/swh/lister/cgit/lister.py b/swh/lister/cgit/lister.py
index d0c35b5..1c57306 100644
--- a/swh/lister/cgit/lister.py
+++ b/swh/lister/cgit/lister.py
@@ -1,136 +1,173 @@
# Copyright (C) 2019-2021 The Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+from datetime import datetime, timezone
import logging
-from typing import Iterator, List, Optional
+import re
+from typing import Any, Dict, Iterator, List, Optional
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
import requests
from swh.lister import USER_AGENT
from swh.lister.pattern import StatelessLister
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
logger = logging.getLogger(__name__)
-Repositories = List[str]
+Repositories = List[Dict[str, Any]]
class CGitLister(StatelessLister[Repositories]):
"""Lister class for CGit repositories.
This lister will retrieve the list of published git repositories by
parsing the HTML page(s) of the index retrieved at `url`.
For each found git repository, a query is made at the given url found
in this index to gather published "Clone" URLs to be used as origin
URL for that git repo.
If several "Clone" urls are provided, prefer the http/https one, if
any, otherwise fallback to the first one.
"""
LISTER_NAME = "cgit"
def __init__(
self, scheduler: SchedulerInterface, url: str, instance: Optional[str] = None
):
"""Lister class for CGit repositories.
Args:
url (str): main URL of the CGit instance, i.e. url of the index
of published git repositories on this instance.
instance (str): Name of cgit instance. Defaults to url's hostname
if unset.
"""
if not instance:
instance = urlparse(url).hostname
assert instance is not None # Make mypy happy
super().__init__(
scheduler=scheduler, credentials=None, url=url, instance=instance,
)
self.session = requests.Session()
self.session.headers.update(
{"Accept": "application/html", "User-Agent": USER_AGENT}
)
def _get_and_parse(self, url: str) -> BeautifulSoup:
"""Get the given url and parse the retrieved HTML using BeautifulSoup"""
response = self.session.get(url)
response.raise_for_status()
return BeautifulSoup(response.text, features="html.parser")
def get_pages(self) -> Iterator[Repositories]:
"""Generate git 'project' URLs found on the current CGit server
-
+ The last_update date is retrieved from the repositories list page, to avoid
+ computing it from the repository details page, which only gives one date per branch
"""
next_page: Optional[str] = self.url
while next_page:
bs_idx = self._get_and_parse(next_page)
page_results = []
for tr in bs_idx.find("div", {"class": "content"}).find_all(
"tr", {"class": ""}
):
- page_results.append(urljoin(self.url, tr.find("a")["href"]))
+ url = urljoin(self.url, tr.find("a")["href"])
+ span = tr.find("span", {"class": re.compile("age-")})
+ if span:
+ last_updated_date = span["title"]
+ else:
+ last_updated_date = None
+
+ page_results.append(
+ {"url": url, "last_updated_date": last_updated_date}
+ )
yield page_results
try:
pager = bs_idx.find("ul", {"class": "pager"})
current_page = pager.find("a", {"class": "current"})
if current_page:
next_page = current_page.parent.next_sibling.a["href"]
next_page = urljoin(self.url, next_page)
except (AttributeError, KeyError):
# no pager, or no next page
next_page = None
def get_origins_from_page(
self, repositories: Repositories
) -> Iterator[ListedOrigin]:
"""Convert a page of cgit repositories into a list of ListedOrigins."""
assert self.lister_obj.id is not None
- for repository_url in repositories:
- origin_url = self._get_origin_from_repository_url(repository_url)
+ for repository in repositories:
+ origin_url = self._get_origin_from_repository_url(repository["url"])
if not origin_url:
continue
yield ListedOrigin(
lister_id=self.lister_obj.id,
url=origin_url,
visit_type="git",
- last_update=None,
+ last_update=_parse_last_updated_date(repository),
)
def _get_origin_from_repository_url(self, repository_url: str) -> Optional[str]:
"""Extract the git url from the repository page"""
bs = self._get_and_parse(repository_url)
# origin urls are listed on the repository page
# TODO check if forcing https is better or not ?
#
#
#
urls = [x["href"] for x in bs.find_all("a", {"rel": "vcs-git"})]
if not urls:
return None
# look for the http/https url, if any, and use it as origin_url
for url in urls:
if urlparse(url).scheme in ("http", "https"):
origin_url = url
break
else:
# otherwise, choose the first one
origin_url = urls[0]
return origin_url
+
+
+def _parse_last_updated_date(repository: Dict[str, Any]) -> Optional[datetime]:
+ """Parse a repository's last updated date into an aware datetime, or None."""
+ date = repository.get("last_updated_date")
+ if not date:
+ return None
+
+ parsed_date = None
+ for date_format in ("%Y-%m-%d %H:%M:%S %z", "%Y-%m-%d %H:%M:%S (%Z)"):
+ try:
+ parsed_date = datetime.strptime(date, date_format)
+ # force UTC to avoid naive datetime
+ if not parsed_date.tzinfo:
+ parsed_date = parsed_date.replace(tzinfo=timezone.utc)
+ break
+ except Exception:
+ pass
+
+ if not parsed_date:
+ logger.warning(
+ "Could not parse %s last_updated date: %s", repository["url"], date,
+ )
+
+ return parsed_date
diff --git a/swh/lister/cgit/tests/test_lister.py b/swh/lister/cgit/tests/test_lister.py
index 313eaec..75f7941 100644
--- a/swh/lister/cgit/tests/test_lister.py
+++ b/swh/lister/cgit/tests/test_lister.py
@@ -1,68 +1,144 @@
# Copyright (C) 2019-2021 The Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+from datetime import datetime, timedelta, timezone
from typing import List
+import pytest
+
from swh.lister import __version__
-from swh.lister.cgit.lister import CGitLister
+from swh.lister.cgit.lister import CGitLister, _parse_last_updated_date
from swh.lister.pattern import ListerStats
def test_lister_cgit_get_pages_one_page(requests_mock_datadir, swh_scheduler):
url = "https://git.savannah.gnu.org/cgit/"
lister_cgit = CGitLister(swh_scheduler, url=url)
repos: List[List[str]] = list(lister_cgit.get_pages())
flattened_repos = sum(repos, [])
assert len(flattened_repos) == 977
- assert flattened_repos[0] == "https://git.savannah.gnu.org/cgit/elisp-es.git/"
+ assert (
+ flattened_repos[0]["url"] == "https://git.savannah.gnu.org/cgit/elisp-es.git/"
+ )
# note the url below is NOT a subpath of /cgit/
assert (
- flattened_repos[-1] == "https://git.savannah.gnu.org/path/to/yetris.git/"
+ flattened_repos[-1]["url"] == "https://git.savannah.gnu.org/path/to/yetris.git/"
) # noqa
# note the url below is NOT on the same server
- assert flattened_repos[-2] == "http://example.org/cgit/xstarcastle.git/"
+ assert flattened_repos[-2]["url"] == "http://example.org/cgit/xstarcastle.git/"
def test_lister_cgit_get_pages_with_pages(requests_mock_datadir, swh_scheduler):
url = "https://git.tizen/cgit/"
lister_cgit = CGitLister(swh_scheduler, url=url)
repos: List[List[str]] = list(lister_cgit.get_pages())
flattened_repos = sum(repos, [])
# we should have 16 repos (listed on 3 pages)
assert len(repos) == 3
assert len(flattened_repos) == 16
-def test_lister_cgit_run(requests_mock_datadir, swh_scheduler):
+def test_lister_cgit_run_with_page(requests_mock_datadir, swh_scheduler):
"""cgit lister supports pagination"""
url = "https://git.tizen/cgit/"
lister_cgit = CGitLister(swh_scheduler, url=url)
stats = lister_cgit.run()
expected_nb_origins = 16
assert stats == ListerStats(pages=3, origins=expected_nb_origins)
# test page parsing
scheduler_origins = swh_scheduler.get_listed_origins(
lister_cgit.lister_obj.id
).results
assert len(scheduler_origins) == expected_nb_origins
# test listed repositories
for listed_origin in scheduler_origins:
assert listed_origin.visit_type == "git"
assert listed_origin.url.startswith("https://git.tizen")
# test user agent content
assert len(requests_mock_datadir.request_history) != 0
for request in requests_mock_datadir.request_history:
assert "User-Agent" in request.headers
user_agent = request.headers["User-Agent"]
assert "Software Heritage Lister" in user_agent
assert __version__ in user_agent
+
+
+def test_lister_cgit_run_populates_last_update(requests_mock_datadir, swh_scheduler):
+ """cgit lister returns last updated date"""
+
+ url = "https://git.tizen/cgit"
+
+ urls_without_date = [
+ f"https://git.tizen.org/cgit/{suffix_url}"
+ for suffix_url in ["All-Projects", "All-Users", "Lock-Projects",]
+ ]
+
+ lister_cgit = CGitLister(swh_scheduler, url=url)
+
+ stats = lister_cgit.run()
+
+ expected_nb_origins = 16
+ assert stats == ListerStats(pages=3, origins=expected_nb_origins)
+
+ # test page parsing
+ scheduler_origins = swh_scheduler.get_listed_origins(
+ lister_cgit.lister_obj.id
+ ).results
+ assert len(scheduler_origins) == expected_nb_origins
+
+ # test listed repositories
+ for listed_origin in scheduler_origins:
+ if listed_origin.url in urls_without_date:
+ assert listed_origin.last_update is None
+ else:
+ assert listed_origin.last_update is not None
+
+
+@pytest.mark.parametrize(
+ "date_str,expected_date",
+ [
+ ({}, None),
+ ("unexpected date", None),
+ ("2020-0140-10 10:10:10 (GMT)", None),
+ (
+ "2020-01-10 10:10:10 (GMT)",
+ datetime(
+ year=2020,
+ month=1,
+ day=10,
+ hour=10,
+ minute=10,
+ second=10,
+ tzinfo=timezone.utc,
+ ),
+ ),
+ (
+ "2019-08-04 05:10:41 +0100",
+ datetime(
+ year=2019,
+ month=8,
+ day=4,
+ hour=5,
+ minute=10,
+ second=41,
+ tzinfo=timezone(timedelta(hours=1)),
+ ),
+ ),
+ ],
+)
+def test_lister_cgit_date_parsing(date_str, expected_date):
+ """test cgit lister date parsing"""
+
+ repository = {"url": "url", "last_updated_date": date_str}
+
+ assert _parse_last_updated_date(repository) == expected_date