diff --git a/swh/lister/opam/lister.py b/swh/lister/opam/lister.py
index 1fbc7db..364d50d 100644
--- a/swh/lister/opam/lister.py
+++ b/swh/lister/opam/lister.py
@@ -1,107 +1,112 @@
 # Copyright (C) 2021 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import io
 import logging
 import os
 from subprocess import PIPE, Popen, call
 import tempfile
 from typing import Iterator
 
 from swh.lister.pattern import StatelessLister
 from swh.scheduler.interface import SchedulerInterface
 from swh.scheduler.model import ListedOrigin
 
 from ..pattern import CredentialsType
 
 logger = logging.getLogger(__name__)
 
 PageType = str
 
 
 class OpamLister(StatelessLister[PageType]):
     """
     List all repositories hosted on an opam repository.
 
     On initialisation, we create an opam root, with no ocaml compiler (no switch)
     as we won't need it and it's costly. In this opam root, we add a single opam
     repository (url) and give it a name (instance). Then, to get pages, we just ask
     opam to list all the packages for our opam repository in our opam root.
 
     Args:
         url: base URL of an opam repository
             (for instance https://opam.ocaml.org)
         instance: string identifier for the listed repository
 
     """
 
     # Part of the lister API, that identifies this lister
     LISTER_NAME = "opam"
 
     def __init__(
         self,
         scheduler: SchedulerInterface,
         url: str,
         instance: str,
         credentials: CredentialsType = None,
     ):
         super().__init__(
             scheduler=scheduler, credentials=credentials, url=url, instance=instance,
         )
         self.env = os.environ.copy()
+        # Only an empty temporary directory is created here; the opam root itself is
+        # initialized (`opam init`) in :meth:`get_pages` to ease instantiation
         self.opamroot = tempfile.mkdtemp(prefix="swh_opam_lister")
+
+    def get_pages(self) -> Iterator[PageType]:
+        """Yield one page per package name listed by opam for the instance."""
+        # Initialize the opam root directory with the opam instance data to list.
         call(
             [
                 "opam",
                 "init",
                 "--reinit",
                 "--bare",
                 "--no-setup",
                 "--root",
                 self.opamroot,
-                instance,
-                url,
+                self.instance,
+                self.url,
             ],
             env=self.env,
         )
-
-    def get_pages(self) -> Iterator[PageType]:
+        # Actually list opam instance data
         proc = Popen(
             [
                 "opam",
                 "list",
                 "--all",
                 "--no-switch",
                 "--repos",
                 self.instance,
                 "--root",
                 self.opamroot,
                 "--normalise",
                 "--short",
             ],
             env=self.env,
             stdout=PIPE,
         )
         if proc.stdout is not None:
             for line in io.TextIOWrapper(proc.stdout):
                 yield line.rstrip("\n")
 
     def get_origins_from_page(self, page: PageType) -> Iterator[ListedOrigin]:
         """Convert a page of OpamLister repositories into a list of ListedOrigins"""
         assert self.lister_obj.id is not None
         # a page is just a package name
         url = f"opam+{self.url}/packages/{page}/"
         yield ListedOrigin(
             lister_id=self.lister_obj.id,
             visit_type="opam",
             url=url,
             last_update=None,
             extra_loader_arguments={
                 "opam_root": self.opamroot,
                 "opam_instance": self.instance,
                 "opam_url": self.url,
                 "opam_package": page,
             },
         )
diff --git a/swh/lister/opam/tests/test_lister.py b/swh/lister/opam/tests/test_lister.py
index 763f491..ffc4439 100644
--- a/swh/lister/opam/tests/test_lister.py
+++ b/swh/lister/opam/tests/test_lister.py
@@ -1,65 +1,81 @@
 # Copyright (C) 2021 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import io
 from unittest.mock import MagicMock
 
-from swh.lister.opam.lister import OpamLister
+import pytest
 
+from swh.lister.opam.lister import OpamLister
 
-def test_urls(swh_scheduler, mocker):
+module_name = "swh.lister.opam.lister"
 
-    instance_url = "https://opam.ocaml.org"
 
-    lister = OpamLister(swh_scheduler, url=instance_url, instance="opam")
+@pytest.fixture
+def mock_opam(mocker):
+    """Fixture to bypass the actual opam calls within the test context.
 
+    """
+    # inhibits the real `subprocess.call` which prepares the required internal opam
+    # state
+    mock_init = mocker.patch(f"{module_name}.call", return_value=None)
+    # replaces the real Popen with a fake one (list origins command)
     mocked_popen = MagicMock()
     mocked_popen.stdout = io.BytesIO(b"bar\nbaz\nfoo\n")
+    mock_popen = mocker.patch(f"{module_name}.Popen", return_value=mocked_popen)
+    return mock_init, mock_popen
+
 
-    # replaces the real Popen with a fake one
-    mocker.patch("swh.lister.opam.lister.Popen", return_value=mocked_popen)
+def test_urls(swh_scheduler, mock_opam):
+    mock_init, mock_popen = mock_opam
+
+    instance_url = "https://opam.ocaml.org"
+
+    lister = OpamLister(swh_scheduler, url=instance_url, instance="opam")
 
     # call the lister and get all listed origins urls
     stats = lister.run()
 
+    assert mock_init.called
+    assert mock_popen.called
+
     assert stats.pages == 3
     assert stats.origins == 3
 
     scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
 
     expected_urls = [
         f"opam+{instance_url}/packages/bar/",
         f"opam+{instance_url}/packages/baz/",
         f"opam+{instance_url}/packages/foo/",
     ]
 
     result_urls = [origin.url for origin in scheduler_origins]
 
     assert expected_urls == result_urls
 
 
 def test_opam_binary(datadir, swh_scheduler):
-
     instance_url = f"file://{datadir}/fake_opam_repo"
 
     lister = OpamLister(swh_scheduler, url=instance_url, instance="fake")
 
     stats = lister.run()
 
     assert stats.pages == 4
     assert stats.origins == 4
 
     scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
 
     expected_urls = [
         f"opam+{instance_url}/packages/agrid/",
         f"opam+{instance_url}/packages/calculon/",
         f"opam+{instance_url}/packages/directories/",
         f"opam+{instance_url}/packages/ocb/",
     ]
 
     result_urls = [origin.url for origin in scheduler_origins]
 
     assert expected_urls == result_urls