diff --git a/swh/lister/cran/lister.py b/swh/lister/cran/lister.py --- a/swh/lister/cran/lister.py +++ b/swh/lister/cran/lister.py @@ -4,6 +4,7 @@ from datetime import datetime, timezone import json +import locale import logging import subprocess from typing import Dict, Iterator, List, Optional, Tuple @@ -45,9 +46,18 @@ def get_origins_from_page(self, page: PageType) -> Iterator[ListedOrigin]: assert self.lister_obj.id is not None + + origin_urls = set() for package_info in page: origin_url, artifact_url = compute_origin_urls(package_info) + if origin_url in origin_urls: + # prevent multiple listing of an origin, + # most recent version will be listed first + continue + + origin_urls.add(origin_url) + yield ListedOrigin( lister_id=self.lister_obj.id, url=origin_url, @@ -105,25 +115,32 @@ def parse_packaged_date(package_info: Dict[str, str]) -> Optional[datetime]: + locale.setlocale(locale.LC_TIME, "en_US") packaged_at_str = package_info.get("Packaged", "") packaged_at = None if packaged_at_str: - try: - # Packaged field format: "%Y-%m-%d %H:%M:%S UTC; ", - packaged_at = datetime.strptime( - packaged_at_str.split(" UTC;")[0], "%Y-%m-%d %H:%M:%S", - ).replace(tzinfo=timezone.utc) - except Exception: + packaged_at_str = packaged_at_str.replace(" UTC", "") + # Packaged field possible formats: + # - "%Y-%m-%d %H:%M:%S[.%f] UTC; ", + # - "%a %b %d %H:%M:%S %Y; " + for date_format in ( + "%Y-%m-%d %H:%M:%S", + "%Y-%m-%d %H:%M:%S.%f", + "%a %b %d %H:%M:%S %Y", + ): try: - # Some old packages have a different date format: - # "%a %b %d %H:%M:%S %Y; " packaged_at = datetime.strptime( - packaged_at_str.split(";")[0], "%a %b %d %H:%M:%S %Y", + packaged_at_str.split(";")[0], date_format, ).replace(tzinfo=timezone.utc) + break except Exception: - logger.debug( - "Could not parse %s package release date: %s", - package_info["Package"], - packaged_at_str, - ) + continue + + if packaged_at is None: + logger.debug( + "Could not parse %s package release date: %s", + package_info["Package"], + packaged_at_str, + ) + return packaged_at diff --git a/swh/lister/cran/tests/test_lister.py b/swh/lister/cran/tests/test_lister.py --- a/swh/lister/cran/tests/test_lister.py +++ b/swh/lister/cran/tests/test_lister.py @@ -40,6 +40,20 @@ assert parse_packaged_date(common_date_format) == datetime( year=2017, month=4, day=26, hour=11, minute=36, second=15, tzinfo=timezone.utc ) + common_date_format = { + "Package": "test", + "Packaged": "2017-04-26 11:36:15.123456 UTC; Jonathan", + } + assert parse_packaged_date(common_date_format) == datetime( + year=2017, + month=4, + day=26, + hour=11, + minute=36, + second=15, + microsecond=123456, + tzinfo=timezone.utc, + ) old_date_format = { "Package": "test", "Packaged": "Thu Mar 30 10:48:35 2006; hornik", @@ -91,6 +105,22 @@ filtered_origins[0].last_update == parse_packaged_date(package_info) +def test_cran_lister_duplicated_origins(datadir, swh_scheduler, mocker): + with open(path.join(datadir, "list-r-packages.json")) as f: + cran_data = json.loads(f.read()) + + lister = CRANLister(swh_scheduler) + + mock_cran = mocker.patch("swh.lister.cran.lister.read_cran_data") + + mock_cran.return_value = cran_data + cran_data + + stats = lister.run() + + assert stats.pages == 1 + assert stats.origins == len(cran_data) + + @pytest.mark.parametrize( "credentials, expected_credentials", [