diff --git a/PKG-INFO b/PKG-INFO index d6a0a8c..950c53c 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,126 +1,126 @@ Metadata-Version: 2.1 Name: swh.lister -Version: 2.2.0 +Version: 2.3.0 Summary: Software Heritage lister Home-page: https://forge.softwareheritage.org/diffusion/DLSGH/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-lister Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-lister/ Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing License-File: LICENSE swh-lister ========== This component from the Software Heritage stack aims to produce listings of software origins and their urls hosted on various public developer platforms or package managers. As these operations are quite similar, it provides a set of Python modules abstracting common software origins listing behaviors. It also provides several lister implementations, contained in the following Python modules: - `swh.lister.bitbucket` - `swh.lister.cgit` - `swh.lister.cran` - `swh.lister.debian` - `swh.lister.gitea` - `swh.lister.github` - `swh.lister.gitlab` - `swh.lister.gnu` - `swh.lister.launchpad` - `swh.lister.npm` - `swh.lister.packagist` - `swh.lister.phabricator` - `swh.lister.pypi` - `swh.lister.tuleap` Dependencies ------------ All required dependencies can be found in the `requirements*.txt` files located at the root of the repository. 
Local deployment ---------------- ## lister configuration Each lister implemented so far by Software Heritage (`bitbucket`, `cgit`, `cran`, `debian`, `gitea`, `github`, `gitlab`, `gnu`, `launchpad`, `npm`, `packagist`, `phabricator`, `pypi`, `tuleap`) must be configured by following the instructions below (please note that you have to replace `<lister_name>` by one of the lister name introduced above). ### Preparation steps 1. `mkdir ~/.config/swh/` 2. create configuration file `~/.config/swh/listers.yml` ### Configuration file sample Minimalistic configuration shared by all listers to add in file `~/.config/swh/listers.yml`: ```lang=yml scheduler: cls: 'remote' args: url: 'http://localhost:5008/' credentials: {} ``` Note: This expects scheduler (5008) service to run locally ## Executing a lister Once configured, a lister can be executed by using the `swh` CLI tool with the following options and commands: ``` $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister <lister_name> [lister_parameters] ``` Examples: ``` $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister bitbucket $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister cran $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister gitea url=https://codeberg.org/api/v1/ $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister gitlab url=https://salsa.debian.org/api/v4/ $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister npm $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister pypi ``` Licensing --------- This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. See top-level LICENSE file for the full text of the GNU General Public License along with this program. diff --git a/swh.lister.egg-info/PKG-INFO b/swh.lister.egg-info/PKG-INFO index d6a0a8c..950c53c 100644 --- a/swh.lister.egg-info/PKG-INFO +++ b/swh.lister.egg-info/PKG-INFO @@ -1,126 +1,126 @@ Metadata-Version: 2.1 Name: swh.lister -Version: 2.2.0 +Version: 2.3.0 Summary: Software Heritage lister Home-page: https://forge.softwareheritage.org/diffusion/DLSGH/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-lister Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-lister/ Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing License-File: LICENSE swh-lister ========== This component from the Software Heritage stack aims to produce listings of software origins and their urls hosted on various public developer platforms or package managers. As these operations are quite similar, it provides a set of Python modules abstracting common software origins listing behaviors. 
It also provides several lister implementations, contained in the following Python modules: - `swh.lister.bitbucket` - `swh.lister.cgit` - `swh.lister.cran` - `swh.lister.debian` - `swh.lister.gitea` - `swh.lister.github` - `swh.lister.gitlab` - `swh.lister.gnu` - `swh.lister.launchpad` - `swh.lister.npm` - `swh.lister.packagist` - `swh.lister.phabricator` - `swh.lister.pypi` - `swh.lister.tuleap` Dependencies ------------ All required dependencies can be found in the `requirements*.txt` files located at the root of the repository. Local deployment ---------------- ## lister configuration Each lister implemented so far by Software Heritage (`bitbucket`, `cgit`, `cran`, `debian`, `gitea`, `github`, `gitlab`, `gnu`, `launchpad`, `npm`, `packagist`, `phabricator`, `pypi`, `tuleap`) must be configured by following the instructions below (please note that you have to replace `<lister_name>` by one of the lister name introduced above). ### Preparation steps 1. `mkdir ~/.config/swh/` 2. create configuration file `~/.config/swh/listers.yml` ### Configuration file sample Minimalistic configuration shared by all listers to add in file `~/.config/swh/listers.yml`: ```lang=yml scheduler: cls: 'remote' args: url: 'http://localhost:5008/' credentials: {} ``` Note: This expects scheduler (5008) service to run locally ## Executing a lister Once configured, a lister can be executed by using the `swh` CLI tool with the following options and commands: ``` $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister <lister_name> [lister_parameters] ``` Examples: ``` $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister bitbucket $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister cran $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister gitea url=https://codeberg.org/api/v1/ $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister gitlab url=https://salsa.debian.org/api/v4/ $ swh --log-level DEBUG lister -C
~/.config/swh/listers.yml run --lister npm $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister pypi ``` Licensing --------- This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. See top-level LICENSE file for the full text of the GNU General Public License along with this program. diff --git a/swh/lister/cran/lister.py b/swh/lister/cran/lister.py index abd7b1f..97d24cf 100644 --- a/swh/lister/cran/lister.py +++ b/swh/lister/cran/lister.py @@ -1,144 +1,148 @@ # Copyright (C) 2019-2021 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from datetime import datetime, timezone import json import logging import subprocess from typing import Dict, Iterator, List, Optional, Tuple import pkg_resources from swh.lister.pattern import CredentialsType, StatelessLister from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin logger = logging.getLogger(__name__) CRAN_MIRROR = "https://cran.r-project.org" PageType = List[Dict[str, str]] class CRANLister(StatelessLister[PageType]): """ List all packages hosted on The Comprehensive R Archive Network. """ LISTER_NAME = "CRAN" def __init__( self, scheduler: SchedulerInterface, credentials: Optional[CredentialsType] = None, ): super().__init__( scheduler, url=CRAN_MIRROR, instance="cran", credentials=credentials ) def get_pages(self) -> Iterator[PageType]: """ Yields a single page containing all CRAN packages info. 
""" yield read_cran_data() def get_origins_from_page(self, page: PageType) -> Iterator[ListedOrigin]: assert self.lister_obj.id is not None seen_urls = set() for package_info in page: origin_url, artifact_url = compute_origin_urls(package_info) if origin_url in seen_urls: # prevent multiple listing of an origin, # most recent version will be listed first continue seen_urls.add(origin_url) yield ListedOrigin( lister_id=self.lister_obj.id, url=origin_url, visit_type="cran", last_update=parse_packaged_date(package_info), extra_loader_arguments={ "artifacts": [ - {"url": artifact_url, "version": package_info["Version"]} + { + "url": artifact_url, + "version": package_info["Version"], + "package": package_info["Package"], + } ] }, ) def read_cran_data() -> List[Dict[str, str]]: """ Runs R script which uses inbuilt API to return a json response containing data about the R packages. Returns: List of Dict about R packages. For example:: [ { 'Package': 'A3', 'Version': '1.0.0' }, { 'Package': 'abbyyR', 'Version': '0.5.4' }, ... ] """ filepath = pkg_resources.resource_filename("swh.lister.cran", "list_all_packages.R") logger.debug("Executing R script %s", filepath) response = subprocess.run(filepath, stdout=subprocess.PIPE, shell=False) return json.loads(response.stdout.decode("utf-8")) def compute_origin_urls(package_info: Dict[str, str]) -> Tuple[str, str]: """Compute the package url from the repo dict. 
Args: repo: dict with key 'Package', 'Version' Returns: the tuple project url, artifact url """ package = package_info["Package"] version = package_info["Version"] origin_url = f"{CRAN_MIRROR}/package={package}" artifact_url = f"{CRAN_MIRROR}/src/contrib/{package}_{version}.tar.gz" return origin_url, artifact_url def parse_packaged_date(package_info: Dict[str, str]) -> Optional[datetime]: packaged_at_str = package_info.get("Packaged", "") packaged_at = None if packaged_at_str: packaged_at_str = packaged_at_str.replace(" UTC", "") # Packaged field possible formats: # - "%Y-%m-%d %H:%M:%S[.%f] UTC; ", # - "%a %b %d %H:%M:%S %Y; " for date_format in ( "%Y-%m-%d %H:%M:%S", "%Y-%m-%d %H:%M:%S.%f", "%a %b %d %H:%M:%S %Y", ): try: packaged_at = datetime.strptime( packaged_at_str.split(";")[0], date_format, ).replace(tzinfo=timezone.utc) break except Exception: continue if packaged_at is None: logger.debug( "Could not parse %s package release date: %s", package_info["Package"], packaged_at_str, ) return packaged_at diff --git a/swh/lister/cran/tests/test_lister.py b/swh/lister/cran/tests/test_lister.py index f8707d1..fa0b463 100644 --- a/swh/lister/cran/tests/test_lister.py +++ b/swh/lister/cran/tests/test_lister.py @@ -1,152 +1,158 @@ # Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from datetime import datetime, timezone import json from os import path import pytest from swh.lister.cran.lister import ( CRAN_MIRROR, CRANLister, compute_origin_urls, parse_packaged_date, ) def test_cran_compute_origin_urls(): pack = "something" vers = "0.0.1" origin_url, artifact_url = compute_origin_urls({"Package": pack, "Version": vers,}) assert origin_url == f"{CRAN_MIRROR}/package={pack}" assert artifact_url == f"{CRAN_MIRROR}/src/contrib/{pack}_{vers}.tar.gz" def 
test_cran_compute_origin_urls_failure(): for incomplete_repo in [{"Version": "0.0.1"}, {"Package": "package"}, {}]: with pytest.raises(KeyError): compute_origin_urls(incomplete_repo) def test_parse_packaged_date(): common_date_format = { "Package": "test", "Packaged": "2017-04-26 11:36:15 UTC; Jonathan", } assert parse_packaged_date(common_date_format) == datetime( year=2017, month=4, day=26, hour=11, minute=36, second=15, tzinfo=timezone.utc ) common_date_format = { "Package": "test", "Packaged": "2017-04-26 11:36:15.123456 UTC; Jonathan", } assert parse_packaged_date(common_date_format) == datetime( year=2017, month=4, day=26, hour=11, minute=36, second=15, microsecond=123456, tzinfo=timezone.utc, ) old_date_format = { "Package": "test", "Packaged": "Thu Mar 30 10:48:35 2006; hornik", } assert parse_packaged_date(old_date_format) == datetime( year=2006, month=3, day=30, hour=10, minute=48, second=35, tzinfo=timezone.utc ) invalid_date_format = { "Package": "test", "Packaged": "foo", } assert parse_packaged_date(invalid_date_format) is None missing_date = { "Package": "test", } assert parse_packaged_date(missing_date) is None def test_cran_lister_cran(datadir, swh_scheduler, mocker): with open(path.join(datadir, "list-r-packages.json")) as f: cran_data = json.loads(f.read()) lister = CRANLister(swh_scheduler) mock_cran = mocker.patch("swh.lister.cran.lister.read_cran_data") mock_cran.return_value = cran_data stats = lister.run() assert stats.pages == 1 assert stats.origins == len(cran_data) scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results assert len(scheduler_origins) == len(cran_data) for package_info in cran_data: origin_url, artifact_url = compute_origin_urls(package_info) filtered_origins = [o for o in scheduler_origins if o.url == origin_url] assert len(filtered_origins) == 1 assert filtered_origins[0].extra_loader_arguments == { - "artifacts": [{"url": artifact_url, "version": package_info["Version"]}] + "artifacts": [ + { + 
"url": artifact_url, + "version": package_info["Version"], + "package": package_info["Package"], + } + ] } filtered_origins[0].last_update == parse_packaged_date(package_info) def test_cran_lister_duplicated_origins(datadir, swh_scheduler, mocker): with open(path.join(datadir, "list-r-packages.json")) as f: cran_data = json.loads(f.read()) lister = CRANLister(swh_scheduler) mock_cran = mocker.patch("swh.lister.cran.lister.read_cran_data") mock_cran.return_value = cran_data + cran_data stats = lister.run() assert stats.pages == 1 assert stats.origins == len(cran_data) @pytest.mark.parametrize( "credentials, expected_credentials", [ (None, []), ({"key": "value"}, []), ( {"CRAN": {"cran": [{"username": "user", "password": "pass"}]}}, [{"username": "user", "password": "pass"}], ), ], ) def test_lister_cran_instantiation_with_credentials( credentials, expected_credentials, swh_scheduler ): lister = CRANLister(swh_scheduler, credentials=credentials) # Credentials are allowed in constructor assert lister.credentials == expected_credentials def test_lister_cran_from_configfile(swh_scheduler_config, mocker): load_from_envvar = mocker.patch("swh.lister.pattern.load_from_envvar") load_from_envvar.return_value = { "scheduler": {"cls": "local", **swh_scheduler_config}, "credentials": {}, } lister = CRANLister.from_configfile() assert lister.scheduler is not None assert lister.credentials is not None