diff --git a/swh/lister/cran/lister.py b/swh/lister/cran/lister.py index d843c2f..635a7a6 100644 --- a/swh/lister/cran/lister.py +++ b/swh/lister/cran/lister.py @@ -1,135 +1,144 @@ # Copyright (C) 2019-2021 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from datetime import datetime, timezone import json import logging import subprocess from typing import Dict, Iterator, List, Optional, Tuple import pkg_resources from swh.lister.pattern import CredentialsType, StatelessLister from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin logger = logging.getLogger(__name__) CRAN_MIRROR = "https://cran.r-project.org" PageType = List[Dict[str, str]] class CRANLister(StatelessLister[PageType]): """ List all packages hosted on The Comprehensive R Archive Network. """ LISTER_NAME = "CRAN" def __init__( self, scheduler: SchedulerInterface, credentials: Optional[CredentialsType] = None, ): super().__init__( scheduler, url=CRAN_MIRROR, instance="cran", credentials=credentials ) def get_pages(self) -> Iterator[PageType]: """ Yields a single page containing all CRAN packages info. """ yield read_cran_data() def get_origins_from_page(self, page: PageType) -> Iterator[ListedOrigin]: assert self.lister_obj.id is not None + + seen_urls = set() for package_info in page: origin_url, artifact_url = compute_origin_urls(package_info) + if origin_url in seen_urls: + # prevent multiple listing of an origin, + # most recent version will be listed first + continue + + seen_urls.add(origin_url) + yield ListedOrigin( lister_id=self.lister_obj.id, url=origin_url, visit_type="tar", last_update=parse_packaged_date(package_info), extra_loader_arguments={ "artifacts": [ {"url": artifact_url, "version": package_info["Version"]} ] }, ) def read_cran_data() -> List[Dict[str, str]]: """ Runs R script which uses inbuilt API to return a json response containing data about the R packages. Returns: List of Dict about R packages. For example:: [ { 'Package': 'A3', 'Version': '1.0.0' }, { 'Package': 'abbyyR', 'Version': '0.5.4' }, ... ] """ filepath = pkg_resources.resource_filename("swh.lister.cran", "list_all_packages.R") logger.debug("Executing R script %s", filepath) response = subprocess.run(filepath, stdout=subprocess.PIPE, shell=False) return json.loads(response.stdout.decode("utf-8")) def compute_origin_urls(package_info: Dict[str, str]) -> Tuple[str, str]: """Compute the package url from the repo dict. Args: repo: dict with key 'Package', 'Version' Returns: the tuple project url, artifact url """ package = package_info["Package"] version = package_info["Version"] origin_url = f"{CRAN_MIRROR}/package={package}" artifact_url = f"{CRAN_MIRROR}/src/contrib/{package}_{version}.tar.gz" return origin_url, artifact_url def parse_packaged_date(package_info: Dict[str, str]) -> Optional[datetime]: packaged_at_str = package_info.get("Packaged", "") packaged_at = None if packaged_at_str: packaged_at_str = packaged_at_str.replace(" UTC", "") # Packaged field possible formats: # - "%Y-%m-%d %H:%M:%S[.%f] UTC; ", # - "%a %b %d %H:%M:%S %Y; " for date_format in ( "%Y-%m-%d %H:%M:%S", "%Y-%m-%d %H:%M:%S.%f", "%a %b %d %H:%M:%S %Y", ): try: packaged_at = datetime.strptime( packaged_at_str.split(";")[0], date_format, ).replace(tzinfo=timezone.utc) break except Exception: continue if packaged_at is None: logger.debug( "Could not parse %s package release date: %s", package_info["Package"], packaged_at_str, ) return packaged_at diff --git a/swh/lister/cran/tests/test_lister.py b/swh/lister/cran/tests/test_lister.py index b8822ec..f8707d1 100644 --- a/swh/lister/cran/tests/test_lister.py +++ b/swh/lister/cran/tests/test_lister.py @@ -1,136 +1,152 @@ # Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from datetime import datetime, timezone import json from os import path import pytest from swh.lister.cran.lister import ( CRAN_MIRROR, CRANLister, compute_origin_urls, parse_packaged_date, ) def test_cran_compute_origin_urls(): pack = "something" vers = "0.0.1" origin_url, artifact_url = compute_origin_urls({"Package": pack, "Version": vers,}) assert origin_url == f"{CRAN_MIRROR}/package={pack}" assert artifact_url == f"{CRAN_MIRROR}/src/contrib/{pack}_{vers}.tar.gz" def test_cran_compute_origin_urls_failure(): for incomplete_repo in [{"Version": "0.0.1"}, {"Package": "package"}, {}]: with pytest.raises(KeyError): compute_origin_urls(incomplete_repo) def test_parse_packaged_date(): common_date_format = { "Package": "test", "Packaged": "2017-04-26 11:36:15 UTC; Jonathan", } assert parse_packaged_date(common_date_format) == datetime( year=2017, month=4, day=26, hour=11, minute=36, second=15, tzinfo=timezone.utc ) common_date_format = { "Package": "test", "Packaged": "2017-04-26 11:36:15.123456 UTC; Jonathan", } assert parse_packaged_date(common_date_format) == datetime( year=2017, month=4, day=26, hour=11, minute=36, second=15, microsecond=123456, tzinfo=timezone.utc, ) old_date_format = { "Package": "test", "Packaged": "Thu Mar 30 10:48:35 2006; hornik", } assert parse_packaged_date(old_date_format) == datetime( year=2006, month=3, day=30, hour=10, minute=48, second=35, tzinfo=timezone.utc ) invalid_date_format = { "Package": "test", "Packaged": "foo", } assert parse_packaged_date(invalid_date_format) is None missing_date = { "Package": "test", } assert parse_packaged_date(missing_date) is None def test_cran_lister_cran(datadir, swh_scheduler, mocker): with open(path.join(datadir, "list-r-packages.json")) as f: cran_data = json.loads(f.read()) lister = CRANLister(swh_scheduler) mock_cran = mocker.patch("swh.lister.cran.lister.read_cran_data") mock_cran.return_value = cran_data stats = lister.run() assert stats.pages == 1 assert stats.origins == len(cran_data) scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results assert len(scheduler_origins) == len(cran_data) for package_info in cran_data: origin_url, artifact_url = compute_origin_urls(package_info) filtered_origins = [o for o in scheduler_origins if o.url == origin_url] assert len(filtered_origins) == 1 assert filtered_origins[0].extra_loader_arguments == { "artifacts": [{"url": artifact_url, "version": package_info["Version"]}] } filtered_origins[0].last_update == parse_packaged_date(package_info) +def test_cran_lister_duplicated_origins(datadir, swh_scheduler, mocker): + with open(path.join(datadir, "list-r-packages.json")) as f: + cran_data = json.loads(f.read()) + + lister = CRANLister(swh_scheduler) + + mock_cran = mocker.patch("swh.lister.cran.lister.read_cran_data") + + mock_cran.return_value = cran_data + cran_data + + stats = lister.run() + + assert stats.pages == 1 + assert stats.origins == len(cran_data) + + @pytest.mark.parametrize( "credentials, expected_credentials", [ (None, []), ({"key": "value"}, []), ( {"CRAN": {"cran": [{"username": "user", "password": "pass"}]}}, [{"username": "user", "password": "pass"}], ), ], ) def test_lister_cran_instantiation_with_credentials( credentials, expected_credentials, swh_scheduler ): lister = CRANLister(swh_scheduler, credentials=credentials) # Credentials are allowed in constructor assert lister.credentials == expected_credentials def test_lister_cran_from_configfile(swh_scheduler_config, mocker): load_from_envvar = mocker.patch("swh.lister.pattern.load_from_envvar") load_from_envvar.return_value = { "scheduler": {"cls": "local", **swh_scheduler_config}, "credentials": {}, } lister = CRANLister.from_configfile() assert lister.scheduler is not None assert lister.credentials is not None