diff --git a/PKG-INFO b/PKG-INFO index d84f6a4..704fda5 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,126 +1,126 @@ Metadata-Version: 2.1 Name: swh.lister -Version: 1.8.0 +Version: 1.9.0 Summary: Software Heritage lister Home-page: https://forge.softwareheritage.org/diffusion/DLSGH/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-lister Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-lister/ Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing License-File: LICENSE swh-lister ========== This component from the Software Heritage stack aims to produce listings of software origins and their urls hosted on various public developer platforms or package managers. As these operations are quite similar, it provides a set of Python modules abstracting common software origins listing behaviors. It also provides several lister implementations, contained in the following Python modules: - `swh.lister.bitbucket` - `swh.lister.cgit` - `swh.lister.cran` - `swh.lister.debian` - `swh.lister.gitea` - `swh.lister.github` - `swh.lister.gitlab` - `swh.lister.gnu` - `swh.lister.launchpad` - `swh.lister.npm` - `swh.lister.packagist` - `swh.lister.phabricator` - `swh.lister.pypi` - `swh.lister.tuleap` Dependencies ------------ All required dependencies can be found in the `requirements*.txt` files located at the root of the repository. 
Local deployment ---------------- ## lister configuration Each lister implemented so far by Software Heritage (`bitbucket`, `cgit`, `cran`, `debian`, `gitea`, `github`, `gitlab`, `gnu`, `launchpad`, `npm`, `packagist`, `phabricator`, `pypi`, `tuleap`) must be configured by following the instructions below (please note that you have to replace `<lister_name>` by one of the lister names introduced above). ### Preparation steps 1. `mkdir ~/.config/swh/` 2. create configuration file `~/.config/swh/listers.yml` ### Configuration file sample Minimalistic configuration shared by all listers to add in file `~/.config/swh/listers.yml`: ```lang=yml scheduler: cls: 'remote' args: url: 'http://localhost:5008/' credentials: {} ``` Note: This expects the scheduler service (port 5008) to be running locally. ## Executing a lister Once configured, a lister can be executed by using the `swh` CLI tool with the following options and commands: ``` $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister <lister_name> [lister_parameters] ``` Examples: ``` $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister bitbucket $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister cran $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister gitea url=https://codeberg.org/api/v1/ $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister gitlab url=https://salsa.debian.org/api/v4/ $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister npm $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister pypi ``` Licensing --------- This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. 
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. See top-level LICENSE file for the full text of the GNU General Public License along with this program. diff --git a/swh.lister.egg-info/PKG-INFO b/swh.lister.egg-info/PKG-INFO index d84f6a4..704fda5 100644 --- a/swh.lister.egg-info/PKG-INFO +++ b/swh.lister.egg-info/PKG-INFO @@ -1,126 +1,126 @@ Metadata-Version: 2.1 Name: swh.lister -Version: 1.8.0 +Version: 1.9.0 Summary: Software Heritage lister Home-page: https://forge.softwareheritage.org/diffusion/DLSGH/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-lister Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-lister/ Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing License-File: LICENSE swh-lister ========== This component from the Software Heritage stack aims to produce listings of software origins and their urls hosted on various public developer platforms or package managers. As these operations are quite similar, it provides a set of Python modules abstracting common software origins listing behaviors. 
It also provides several lister implementations, contained in the following Python modules: - `swh.lister.bitbucket` - `swh.lister.cgit` - `swh.lister.cran` - `swh.lister.debian` - `swh.lister.gitea` - `swh.lister.github` - `swh.lister.gitlab` - `swh.lister.gnu` - `swh.lister.launchpad` - `swh.lister.npm` - `swh.lister.packagist` - `swh.lister.phabricator` - `swh.lister.pypi` - `swh.lister.tuleap` Dependencies ------------ All required dependencies can be found in the `requirements*.txt` files located at the root of the repository. Local deployment ---------------- ## lister configuration Each lister implemented so far by Software Heritage (`bitbucket`, `cgit`, `cran`, `debian`, `gitea`, `github`, `gitlab`, `gnu`, `launchpad`, `npm`, `packagist`, `phabricator`, `pypi`, `tuleap`) must be configured by following the instructions below (please note that you have to replace `<lister_name>` by one of the lister names introduced above). ### Preparation steps 1. `mkdir ~/.config/swh/` 2. create configuration file `~/.config/swh/listers.yml` ### Configuration file sample Minimalistic configuration shared by all listers to add in file `~/.config/swh/listers.yml`: ```lang=yml scheduler: cls: 'remote' args: url: 'http://localhost:5008/' credentials: {} ``` Note: This expects the scheduler service (port 5008) to be running locally. ## Executing a lister Once configured, a lister can be executed by using the `swh` CLI tool with the following options and commands: ``` $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister <lister_name> [lister_parameters] ``` Examples: ``` $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister bitbucket $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister cran $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister gitea url=https://codeberg.org/api/v1/ $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister gitlab url=https://salsa.debian.org/api/v4/ $ swh --log-level DEBUG lister -C 
~/.config/swh/listers.yml run --lister npm $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister pypi ``` Licensing --------- This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. See top-level LICENSE file for the full text of the GNU General Public License along with this program. diff --git a/swh/lister/gnu/lister.py b/swh/lister/gnu/lister.py index 7d976ee..3d35829 100644 --- a/swh/lister/gnu/lister.py +++ b/swh/lister/gnu/lister.py @@ -1,68 +1,73 @@ # Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import logging -from typing import Any, Iterator, Mapping +from typing import Any, Iterator, Mapping, Optional import iso8601 from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from ..pattern import CredentialsType, StatelessLister from .tree import GNUTree logger = logging.getLogger(__name__) GNUPageType = Mapping[str, Any] class GNULister(StatelessLister[GNUPageType]): """ List all GNU projects and associated artifacts. 
""" LISTER_NAME = "GNU" GNU_FTP_URL = "https://ftp.gnu.org" def __init__( self, scheduler: SchedulerInterface, credentials: CredentialsType = None, ): super().__init__( scheduler=scheduler, url=self.GNU_FTP_URL, instance="GNU", credentials=credentials, ) - self.gnu_tree = GNUTree(f"{self.url}/tree.json.gz") + # no side-effect calls in constructor, if extra state is needed, as preconized + # by the pattern docstring, this must happen in the get_pages method. + self.gnu_tree: Optional[GNUTree] = None def get_pages(self) -> Iterator[GNUPageType]: """ Yield a single page listing all GNU projects. """ + # first fetch the manifest to parse + self.gnu_tree = GNUTree(f"{self.url}/tree.json.gz") yield self.gnu_tree.projects def get_origins_from_page(self, page: GNUPageType) -> Iterator[ListedOrigin]: """ Iterate on all GNU projects and yield ListedOrigin instances. """ assert self.lister_obj.id is not None + assert self.gnu_tree is not None artifacts = self.gnu_tree.artifacts for project_name, project_info in page.items(): origin_url = project_info["url"] last_update = iso8601.parse_date(project_info["time_modified"]) logger.debug("Found origin %s last updated on %s", origin_url, last_update) yield ListedOrigin( lister_id=self.lister_obj.id, url=origin_url, visit_type="tar", last_update=last_update, extra_loader_arguments={"artifacts": artifacts[project_name]}, ) diff --git a/swh/lister/opam/lister.py b/swh/lister/opam/lister.py index 1fbc7db..486bdc2 100644 --- a/swh/lister/opam/lister.py +++ b/swh/lister/opam/lister.py @@ -1,107 +1,111 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import io import logging import os from subprocess import PIPE, Popen, call -import tempfile -from typing import Iterator +from typing import Iterator, Optional from swh.lister.pattern import 
StatelessLister from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin from ..pattern import CredentialsType logger = logging.getLogger(__name__) PageType = str class OpamLister(StatelessLister[PageType]): """ List all repositories hosted on an opam repository. On initialisation, we create an opam root, with no ocaml compiler (no switch) as we won't need it and it's costly. In this opam root, we add a single opam repository (url) and give it a name (instance). Then, to get pages, we just ask opam to list all the packages for our opam repository in our opam root. Args: url: base URL of an opam repository (for instance https://opam.ocaml.org) instance: string identifier for the listed repository """ # Part of the lister API, that identifies this lister LISTER_NAME = "opam" def __init__( self, scheduler: SchedulerInterface, url: str, - instance: str, + instance: Optional[str] = None, credentials: CredentialsType = None, + opam_root: str = "/tmp/opam/", ): super().__init__( scheduler=scheduler, credentials=credentials, url=url, instance=instance, ) self.env = os.environ.copy() - self.opamroot = tempfile.mkdtemp(prefix="swh_opam_lister") + # Opam root folder is initialized in the :meth:`get_pages` method as no + # side-effect should happen in the constructor to ease instantiation + self.opamroot = os.path.join(opam_root, self.instance) + + def get_pages(self) -> Iterator[PageType]: + # Initialize the opam root directory with the opam instance data to list. 
call( [ "opam", "init", "--reinit", "--bare", "--no-setup", "--root", self.opamroot, - instance, - url, + self.instance, + self.url, ], env=self.env, ) - - def get_pages(self) -> Iterator[PageType]: + # Actually list opam instance data proc = Popen( [ "opam", "list", "--all", "--no-switch", "--repos", self.instance, "--root", self.opamroot, "--normalise", "--short", ], env=self.env, stdout=PIPE, ) if proc.stdout is not None: for line in io.TextIOWrapper(proc.stdout): yield line.rstrip("\n") def get_origins_from_page(self, page: PageType) -> Iterator[ListedOrigin]: """Convert a page of OpamLister repositories into a list of ListedOrigins""" assert self.lister_obj.id is not None # a page is just a package name url = f"opam+{self.url}/packages/{page}/" yield ListedOrigin( lister_id=self.lister_obj.id, visit_type="opam", url=url, last_update=None, extra_loader_arguments={ "opam_root": self.opamroot, "opam_instance": self.instance, "opam_url": self.url, "opam_package": page, }, ) diff --git a/swh/lister/opam/tests/test_lister.py b/swh/lister/opam/tests/test_lister.py index 763f491..ffa281a 100644 --- a/swh/lister/opam/tests/test_lister.py +++ b/swh/lister/opam/tests/test_lister.py @@ -1,65 +1,103 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import io +from tempfile import mkdtemp from unittest.mock import MagicMock -from swh.lister.opam.lister import OpamLister +import pytest +from swh.lister.opam.lister import OpamLister -def test_urls(swh_scheduler, mocker): +module_name = "swh.lister.opam.lister" - instance_url = "https://opam.ocaml.org" - lister = OpamLister(swh_scheduler, url=instance_url, instance="opam") +@pytest.fixture +def mock_opam(mocker): + """Fixture to bypass the actual opam calls within the test context. 
+ """ + # inhibits the real `subprocess.call` which prepares the required internal opam + # state + mock_init = mocker.patch(f"{module_name}.call", return_value=None) + # replaces the real Popen with a fake one (list origins command) mocked_popen = MagicMock() mocked_popen.stdout = io.BytesIO(b"bar\nbaz\nfoo\n") + mock_open = mocker.patch(f"{module_name}.Popen", return_value=mocked_popen) + return mock_init, mock_open + + +def test_lister_opam_optional_instance(swh_scheduler): + """Instance name should be optional and default to be built out of the netloc.""" + netloc = "opam.ocaml.org" + instance_url = f"https://{netloc}" + + lister = OpamLister(swh_scheduler, url=instance_url,) + assert lister.instance == netloc + assert lister.opamroot.endswith(lister.instance) - # replaces the real Popen with a fake one - mocker.patch("swh.lister.opam.lister.Popen", return_value=mocked_popen) + +def test_urls(swh_scheduler, mock_opam): + mock_init, mock_popen = mock_opam + + instance_url = "https://opam.ocaml.org" + + lister = OpamLister( + swh_scheduler, + url=instance_url, + instance="opam", + opam_root=mkdtemp(prefix="swh_opam_lister"), + ) + assert lister.instance == "opam" # call the lister and get all listed origins urls stats = lister.run() + assert mock_init.called + assert mock_popen.called + assert stats.pages == 3 assert stats.origins == 3 scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results expected_urls = [ f"opam+{instance_url}/packages/bar/", f"opam+{instance_url}/packages/baz/", f"opam+{instance_url}/packages/foo/", ] result_urls = [origin.url for origin in scheduler_origins] assert expected_urls == result_urls def test_opam_binary(datadir, swh_scheduler): - instance_url = f"file://{datadir}/fake_opam_repo" - lister = OpamLister(swh_scheduler, url=instance_url, instance="fake") + lister = OpamLister( + swh_scheduler, + url=instance_url, + instance="fake", + opam_root=mkdtemp(prefix="swh_opam_lister"), + ) stats = lister.run() assert 
stats.pages == 4 assert stats.origins == 4 scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results expected_urls = [ f"opam+{instance_url}/packages/agrid/", f"opam+{instance_url}/packages/calculon/", f"opam+{instance_url}/packages/directories/", f"opam+{instance_url}/packages/ocb/", ] result_urls = [origin.url for origin in scheduler_origins] assert expected_urls == result_urls