# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import bz2
from collections import defaultdict
import datetime
import json
import logging
from typing import Any, Dict, Iterator, List, Optional, Tuple

import iso8601

from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin

from ..pattern import CredentialsType, StatelessLister

logger = logging.getLogger(__name__)

# Aliasing the page results returned by `get_pages` method from the lister.
# A page is an ``(arch, packages)`` pair where ``packages`` maps an archive
# filename to its metadata dict, as read from the "packages" key of repodata.
CondaListerPage = Tuple[str, Dict[str, Dict[str, Any]]]


class CondaLister(StatelessLister[CondaListerPage]):
    """List Conda (anaconda.com) origins.

    One page is fetched per architecture. Artifacts and dates are accumulated
    on the instance (``self.packages`` / ``self.package_dates``) across pages,
    so an origin yielded for a later architecture also carries the artifacts
    collected from the architectures processed before it.
    """

    LISTER_NAME = "conda"
    VISIT_TYPE = "conda"
    INSTANCE = "conda"
    BASE_REPO_URL = "https://repo.anaconda.com/pkgs"
    REPO_URL_PATTERN = "{url}/{channel}/{arch}/repodata.json.bz2"
    ORIGIN_URL_PATTERN = "https://anaconda.org/{channel}/{pkgname}"
    ARCHIVE_URL_PATTERN = "{url}/{channel}/{arch}/{filename}"

    def __init__(
        self,
        scheduler: SchedulerInterface,
        credentials: Optional[CredentialsType] = None,
        url: str = BASE_REPO_URL,
        channel: str = "",
        archs: Optional[List[str]] = None,
    ):
        """Initialize the lister.

        Args:
            scheduler: scheduler instance forwarded to the base lister
            credentials: optional credentials forwarded to the base lister
            url: base url of the repository holding the channels
            channel: name of the channel to list
            archs: architectures to list; ``None`` (the default) means none
        """
        super().__init__(
            scheduler=scheduler,
            credentials=credentials,
            instance=self.INSTANCE,
            url=url,
        )
        self.channel: str = channel
        # A ``None`` sentinel replaces the former ``[]`` default, which was a
        # single list object shared by every instance. The copy also keeps the
        # lister independent from later mutations of the caller's list.
        self.archs: List[str] = list(archs) if archs is not None else []
        # package name -> {version key -> artifact dict}
        self.packages: Dict[str, Any] = defaultdict(dict)
        # package name -> list of artifact dates seen so far
        self.package_dates: Dict[str, Any] = defaultdict(list)

    def get_pages(self) -> Iterator[CondaListerPage]:
        """Yield one ``(arch, packages)`` page per configured architecture.

        For each architecture the bz2-compressed ``repodata.json.bz2`` file is
        downloaded and decompressed, and its "packages" mapping is returned.
        """
        for arch in self.archs:
            repodata_url = self.REPO_URL_PATTERN.format(
                url=self.url, channel=self.channel, arch=arch
            )
            response = self.http_request(url=repodata_url)

            packages: Dict[str, Any] = json.loads(bz2.decompress(response.content))[
                "packages"
            ]
            yield (arch, packages)

    def get_origins_from_page(self, page: CondaListerPage) -> Iterator[ListedOrigin]:
        """Yield one ListedOrigin per distinct package name found in ``page``.

        Every archive of the page is first recorded as an artifact of its
        package; origins are only yielded afterwards so that each package of
        the page is emitted a single time, with all artifacts known so far.
        """
        assert self.lister_obj.id is not None
        arch, packages = page

        package_names = set()
        for filename, package_metadata in packages.items():
            package_name = package_metadata["name"]
            package_names.add(package_name)
            version_key = (
                f"{arch}/{package_metadata['version']}-{package_metadata['build']}"
            )

            artifact: Dict[str, Any] = {
                "filename": filename,
                "url": self.ARCHIVE_URL_PATTERN.format(
                    url=self.url,
                    channel=self.channel,
                    filename=filename,
                    arch=arch,
                ),
                "version": version_key,
                "checksums": {},
            }

            for checksum in ("md5", "sha256"):
                if checksum in package_metadata:
                    artifact["checksums"][checksum] = package_metadata[checksum]

            self.packages[package_name][version_key] = artifact

            package_date = None
            if "timestamp" in package_metadata:
                # The timestamp is divided by 1e3 before conversion, i.e. it
                # is handled as a number of milliseconds.
                package_date = datetime.datetime.fromtimestamp(
                    package_metadata["timestamp"] / 1e3, datetime.timezone.utc
                )
            elif "date" in package_metadata:
                package_date = iso8601.parse_date(package_metadata["date"])

            if package_date:
                artifact["date"] = package_date.isoformat()
                self.package_dates[package_name].append(package_date)

        for package_name in package_names:
            package_dates = self.package_dates[package_name]

            yield ListedOrigin(
                lister_id=self.lister_obj.id,
                visit_type=self.VISIT_TYPE,
                url=self.ORIGIN_URL_PATTERN.format(
                    channel=self.channel, pkgname=package_name
                ),
                # ``default=None`` covers packages without any known date.
                last_update=max(package_dates, default=None),
                extra_loader_arguments={
                    "artifacts": list(self.packages[package_name].values())
                },
            )
# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import pytest

from swh.lister.conda.lister import CondaLister


@pytest.fixture
def expected_origins():
    """Origins (url and artifacts) expected for conda-forge / linux-64."""
    return [
        {
            "url": "https://anaconda.org/conda-forge/21cmfast",
            "artifacts": [
                {
                    "url": "https://conda.anaconda.org/conda-forge/linux-64/21cmfast-3.0.2-py36h1af98f8_1.tar.bz2",  # noqa: B950
                    "date": "2020-11-11T16:04:49.658000+00:00",
                    "version": "linux-64/3.0.2-py36h1af98f8_1",
                    "filename": "21cmfast-3.0.2-py36h1af98f8_1.tar.bz2",
                    "checksums": {
                        "md5": "d65ab674acf3b7294ebacaec05fc5b54",
                        "sha256": "1154fceeb5c4ee9bb97d245713ac21eb1910237c724d2b7103747215663273c2",  # noqa: B950
                    },
                }
            ],
        },
        {
            "url": "https://anaconda.org/conda-forge/lifetimes",
            "artifacts": [
                {
                    "url": "https://conda.anaconda.org/conda-forge/linux-64/lifetimes-0.11.1-py36h9f0ad1d_1.tar.bz2",  # noqa: B950
                    "date": "2020-07-06T12:19:36.425000+00:00",
                    "version": "linux-64/0.11.1-py36h9f0ad1d_1",
                    "filename": "lifetimes-0.11.1-py36h9f0ad1d_1.tar.bz2",
                    "checksums": {
                        "md5": "faa398f7ba0d60ce44aa6eeded490cee",
                        "sha256": "f82a352dfae8abceeeaa538b220fd9c5e4aa4e59092a6a6cea70b9ec0581ea03",  # noqa: B950
                    },
                },
                {
                    "url": "https://conda.anaconda.org/conda-forge/linux-64/lifetimes-0.11.1-py36hc560c46_1.tar.bz2",  # noqa: B950
                    "date": "2020-07-06T12:19:37.032000+00:00",
                    "version": "linux-64/0.11.1-py36hc560c46_1",
                    "filename": "lifetimes-0.11.1-py36hc560c46_1.tar.bz2",
                    "checksums": {
                        "md5": "c53a689a4c5948e84211bdfc23e3fe68",
                        "sha256": "76146c2ebd6e3b65928bde53a2585287759d77beba785c0eeb889ee565c0035d",  # noqa: B950
                    },
                },
            ],
        },
    ]


def test_conda_lister_free_channel(datadir, requests_mock_datadir, swh_scheduler):
    """Listing the "free" channel over three archs gives 3 pages, 11 origins."""
    free_lister = CondaLister(
        scheduler=swh_scheduler,
        channel="free",
        archs=["linux-64", "osx-64", "win-64"],
    )
    stats = free_lister.run()

    assert stats.pages == 3
    assert stats.origins == 11


def test_conda_lister_conda_forge_channel(
    requests_mock_datadir, swh_scheduler, expected_origins
):
    """Origins recorded in the scheduler match the expected urls and artifacts."""
    lister = CondaLister(
        scheduler=swh_scheduler,
        url="https://conda.anaconda.org",
        channel="conda-forge",
        archs=["linux-64"],
    )
    stats = lister.run()

    assert stats.pages == 1
    assert stats.origins == 2

    recorded = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
    assert len(recorded) == len(expected_origins)

    # Compare both sides in url order so the check is order-independent.
    recorded_by_url = sorted(recorded, key=lambda origin: origin.url)
    expected_by_url = sorted(expected_origins, key=lambda origin: origin["url"])

    actual_rows = [
        (
            origin.visit_type,
            origin.url,
            origin.extra_loader_arguments["artifacts"],
        )
        for origin in recorded_by_url
    ]
    expected_rows = [
        (
            "conda",
            origin["url"],
            origin["artifacts"],
        )
        for origin in expected_by_url
    ]
    assert actual_rows == expected_rows


def test_conda_lister_number_of_yielded_origins(
    requests_mock_datadir, swh_scheduler, expected_origins
):
    """Check that each expected origin is yielded exactly once by the lister."""
    lister = CondaLister(
        scheduler=swh_scheduler,
        url="https://conda.anaconda.org",
        channel="conda-forge",
        archs=["linux-64"],
    )

    yielded_urls = []
    for page in lister.get_pages():
        page_origins = list(lister.get_origins_from_page(page))
        yielded_urls.extend(origin.url for origin in page_origins)

    expected_urls = [origin["url"] for origin in expected_origins]
    assert sorted(yielded_urls) == sorted(expected_urls)