# This R script calls the built-in API to get the list of
# all the packages of R and their description, then converts the API
# response to a JSON string and prints it
def read_cran_data() -> List[Dict[str, str]]:
    """Run the bundled R script and return its JSON output as Python data.

    The ``list_all_packages.R`` script shipped in the ``swh.lister.cran``
    package is executed directly (no shell). Its standard output is decoded
    as UTF-8 and parsed as JSON.

    Returns:
        List of Dict about R packages. For example::

            [
                {
                    'Package': 'SeleMix',
                    'Version': '1.0.2',
                    'Packaged': '2020-11-28 22:16:43 UTC; Teresa'
                },
                {
                    'Package': 'plink',
                    'Version': '1.5-1',
                    'Packaged': '2017-04-26 11:36:15 UTC; Jonathan'
                },
                ...
            ]

    Raises:
        subprocess.CalledProcessError: if the R script exits with a non-zero
            status.
        json.JSONDecodeError: if the script output is not valid JSON.
    """
    filepath = pkg_resources.resource_filename("swh.lister.cran", "list_all_packages.R")
    logger.debug("Executing R script %s", filepath)
    # check=True: fail loudly on a broken R run instead of handing truncated
    # or empty output to the JSON parser.
    response = subprocess.run(
        filepath, stdout=subprocess.PIPE, shell=False, check=True
    )
    return json.loads(response.stdout.decode("utf-8"))
def parse_packaged_date(package_info: Dict[str, str]) -> Optional[datetime]:
    """Parse the ``Packaged`` field of a CRAN package entry.

    Two timestamp layouts are recognised, both optionally followed by
    ``"; <packager>"``:

    - ``"%Y-%m-%d %H:%M:%S UTC"`` (the common one);
    - ``"%a %b %d %H:%M:%S %Y"`` (found on some old packages).

    Args:
        package_info: dict describing a package; the optional ``Packaged``
            key holds the raw date string and ``Package`` is only used in
            the debug log message.

    Returns:
        A timezone-aware datetime (UTC), or None when the field is missing,
        empty, or in an unrecognised format.
    """
    packaged_at_str = package_info.get("Packaged")
    if not packaged_at_str:
        return None

    # Best-effort: a non-string value is logged and ignored, not raised.
    if isinstance(packaged_at_str, str):
        # Keep only the timestamp: drop the "; <packager>" suffix and the
        # optional trailing "UTC" marker.
        date_str = packaged_at_str.split(";")[0].strip()
        if date_str.endswith(" UTC"):
            date_str = date_str[: -len(" UTC")]

        for date_format in ("%Y-%m-%d %H:%M:%S", "%a %b %d %H:%M:%S %Y"):
            try:
                parsed = datetime.strptime(date_str, date_format)
            except ValueError:
                # Layout mismatch; try the next known format.
                continue
            return parsed.replace(tzinfo=timezone.utc)

    logger.debug(
        "Could not parse %s package release date: %s",
        package_info.get("Package"),
        packaged_at_str,
    )
    return None
def test_cran_compute_origin_urls():
    """Both URLs are derived from the package name and version."""
    name, release = "something", "0.0.1"
    package_info = {"Package": name, "Version": release}

    origin_url, artifact_url = compute_origin_urls(package_info)

    expected_origin = f"{CRAN_MIRROR}/package={name}"
    expected_artifact = f"{CRAN_MIRROR}/src/contrib/{name}_{release}.tar.gz"
    assert origin_url == expected_origin
    assert artifact_url == expected_artifact
def test_cran_lister_cran(datadir, swh_scheduler, mocker):
    """Run the lister on canned CRAN data and check every recorded origin.

    ``read_cran_data`` is patched so no R script is executed; the lister is
    fed the content of ``list-r-packages.json`` instead.
    """
    with open(path.join(datadir, "list-r-packages.json")) as f:
        cran_data = json.loads(f.read())

    lister = CRANLister(swh_scheduler)

    mock_cran = mocker.patch("swh.lister.cran.lister.read_cran_data")
    mock_cran.return_value = cran_data

    stats = lister.run()

    # All packages come back in a single page.
    assert stats.pages == 1
    assert stats.origins == len(cran_data)

    scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results

    assert len(scheduler_origins) == len(cran_data)

    for package_info in cran_data:
        origin_url, artifact_url = compute_origin_urls(package_info)

        filtered_origins = [o for o in scheduler_origins if o.url == origin_url]

        assert len(filtered_origins) == 1

        assert filtered_origins[0].extra_loader_arguments == {
            "artifacts": [{"url": artifact_url, "version": package_info["Version"]}]
        }

        # This comparison used to be a bare expression, so it never checked
        # anything; it must be asserted.
        assert filtered_origins[0].last_update == parse_packaged_date(package_info)