# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import bz2
from collections import defaultdict
import datetime
import json
import logging
from typing import Any, Dict, Iterator, List, Optional, Tuple

import iso8601

from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin

from ..pattern import CredentialsType, StatelessLister

logger = logging.getLogger(__name__)

# Aliasing the page results returned by `get_pages` method from the lister.
CondaListerPage = Tuple[str, Dict[str, Dict[str, Any]]]


class CondaLister(StatelessLister[CondaListerPage]):
    """List Conda (anaconda.com) origins.

    One page is produced per architecture: the decompressed ``packages``
    mapping of that architecture's ``repodata.json.bz2``. Artifacts are
    accumulated per package name across pages in ``self.packages``.
    """

    LISTER_NAME = "conda"
    VISIT_TYPE = "conda"
    INSTANCE = "conda"
    BASE_REPO_URL = "https://repo.anaconda.com/pkgs"
    REPO_URL_PATTERN = "{url}/{channel}/{arch}/repodata.json.bz2"
    ORIGIN_URL_PATTERN = "https://anaconda.org/{channel}/{pkgname}"
    ARCHIVE_URL_PATTERN = "{url}/{channel}/{arch}/{filename}"

    def __init__(
        self,
        scheduler: SchedulerInterface,
        credentials: Optional[CredentialsType] = None,
        url: str = BASE_REPO_URL,
        channel: str = "",
        archs: Optional[List[str]] = None,
    ):
        """Set up the lister.

        Args:
            scheduler: scheduler instance forwarded to the base lister.
            credentials: optional credentials forwarded to the base lister.
            url: base URL of the repository hosting the channels.
            channel: channel name, substituted in the repodata, archive and
                origin URL patterns.
            archs: architectures to list (one page each); ``None`` means
                no architecture, i.e. nothing is listed.
        """
        super().__init__(
            scheduler=scheduler,
            credentials=credentials,
            instance=self.INSTANCE,
            url=url,
        )
        self.channel: str = channel
        # A ``None`` sentinel replaces the former ``[]`` default so that
        # instances never share one mutable list object.
        self.archs: List[str] = [] if archs is None else archs
        # package name -> {"<arch>/<version>-<build>": artifact dict}
        self.packages: Dict[str, Any] = defaultdict(dict)
        # package name -> every release date seen so far for that package
        self.package_dates: Dict[str, Any] = defaultdict(list)

    def get_pages(self) -> Iterator[CondaListerPage]:
        """Yield one ``(arch, packages)`` page per configured architecture.

        ``packages`` is the ``"packages"`` entry of the bz2-compressed JSON
        document fetched from ``REPO_URL_PATTERN``.
        """
        for arch in self.archs:
            repodata_url = self.REPO_URL_PATTERN.format(
                url=self.url, channel=self.channel, arch=arch
            )
            response = self.http_request(url=repodata_url)
            repodata = json.loads(bz2.decompress(response.content))
            packages: Dict[str, Any] = repodata["packages"]
            yield (arch, packages)

    def get_origins_from_page(self, page: CondaListerPage) -> Iterator[ListedOrigin]:
        """Yield a ListedOrigin for every package file of a single page.

        An origin is yielded once per package file; each yield carries, as
        the ``artifacts`` loader argument, the list of all artifacts
        accumulated so far for that package name (including those collected
        from previously processed pages).

        Args:
            page: an ``(arch, packages)`` tuple as produced by ``get_pages``,
                where ``packages`` maps an archive filename to its metadata.

        Yields:
            ListedOrigin whose ``last_update`` is the most recent release
            date seen so far for the package, or ``None`` when the current
            file carries neither a ``timestamp`` nor a ``date`` entry.
        """
        assert self.lister_obj.id is not None
        arch, packages = page

        for filename, package_metadata in packages.items():
            name = package_metadata["name"]
            # Unique key of the artifact within a package: architecture,
            # version and build string.
            version_key = (
                f"{arch}/{package_metadata['version']}-{package_metadata['build']}"
            )

            artifact: Dict[str, Any] = {
                "filename": filename,
                "url": self.ARCHIVE_URL_PATTERN.format(
                    url=self.url,
                    channel=self.channel,
                    filename=filename,
                    arch=arch,
                ),
                "version": version_key,
                "checksums": {
                    checksum: package_metadata[checksum]
                    for checksum in ("md5", "sha256")
                    if checksum in package_metadata
                },
            }
            self.packages[name][version_key] = artifact

            package_date = None
            if "timestamp" in package_metadata:
                # The timestamp is divided by 1e3 before conversion, i.e.
                # it is handled as milliseconds since the epoch.
                package_date = datetime.datetime.fromtimestamp(
                    package_metadata["timestamp"] / 1e3, datetime.timezone.utc
                )
            elif "date" in package_metadata:
                package_date = iso8601.parse_date(package_metadata["date"])

            last_update = None
            if package_date:
                # ``artifact`` is the same dict object already stored in
                # ``self.packages``, so the date shows up there as well.
                artifact["date"] = package_date.isoformat()
                self.package_dates[name].append(package_date)
                last_update = max(self.package_dates[name])

            yield ListedOrigin(
                lister_id=self.lister_obj.id,
                visit_type=self.VISIT_TYPE,
                url=self.ORIGIN_URL_PATTERN.format(
                    channel=self.channel, pkgname=name
                ),
                last_update=last_update,
                extra_loader_arguments={
                    # Snapshot (new list) of the artifacts known so far.
                    "artifacts": list(self.packages[name].values()),
                },
            )
# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

from swh.lister.conda.lister import CondaLister


def test_conda_lister_free_channel(datadir, requests_mock_datadir, swh_scheduler):
    """Listing three architectures of the "free" channel gives three pages."""
    stats = CondaLister(
        scheduler=swh_scheduler,
        channel="free",
        archs=["linux-64", "osx-64", "win-64"],
    ).run()

    assert stats.pages == 3
    assert stats.origins == 11


def test_conda_lister_conda_forge_channel(
    datadir, requests_mock_datadir, swh_scheduler
):
    """Check the origins and artifact lists recorded for conda-forge."""
    conda_forge_lister = CondaLister(
        scheduler=swh_scheduler,
        url="https://conda.anaconda.org",
        channel="conda-forge",
        archs=["linux-64"],
    )
    stats = conda_forge_lister.run()

    assert stats.pages == 1
    assert stats.origins == 2

    listed = swh_scheduler.get_listed_origins(conda_forge_lister.lister_obj.id).results

    # Artifacts are expected as a list, each one carrying its
    # "<arch>/<version>-<build>" key in the "version" field.
    artifacts_21cmfast = [
        {
            "url": "https://conda.anaconda.org/conda-forge/linux-64/21cmfast-3.0.2-py36h1af98f8_1.tar.bz2",  # noqa: B950
            "date": "2020-11-11T16:04:49.658000+00:00",
            "version": "linux-64/3.0.2-py36h1af98f8_1",
            "filename": "21cmfast-3.0.2-py36h1af98f8_1.tar.bz2",
            "checksums": {
                "md5": "d65ab674acf3b7294ebacaec05fc5b54",
                "sha256": "1154fceeb5c4ee9bb97d245713ac21eb1910237c724d2b7103747215663273c2",  # noqa: B950
            },
        }
    ]
    artifacts_lifetimes = [
        {
            "url": "https://conda.anaconda.org/conda-forge/linux-64/lifetimes-0.11.1-py36h9f0ad1d_1.tar.bz2",  # noqa: B950
            "date": "2020-07-06T12:19:36.425000+00:00",
            "version": "linux-64/0.11.1-py36h9f0ad1d_1",
            "filename": "lifetimes-0.11.1-py36h9f0ad1d_1.tar.bz2",
            "checksums": {
                "md5": "faa398f7ba0d60ce44aa6eeded490cee",
                "sha256": "f82a352dfae8abceeeaa538b220fd9c5e4aa4e59092a6a6cea70b9ec0581ea03",  # noqa: B950
            },
        },
        {
            "url": "https://conda.anaconda.org/conda-forge/linux-64/lifetimes-0.11.1-py36hc560c46_1.tar.bz2",  # noqa: B950
            "date": "2020-07-06T12:19:37.032000+00:00",
            "version": "linux-64/0.11.1-py36hc560c46_1",
            "filename": "lifetimes-0.11.1-py36hc560c46_1.tar.bz2",
            "checksums": {
                "md5": "c53a689a4c5948e84211bdfc23e3fe68",
                "sha256": "76146c2ebd6e3b65928bde53a2585287759d77beba785c0eeb889ee565c0035d",  # noqa: B950
            },
        },
    ]

    # (visit_type, url, artifacts) triples, ordered by origin URL.
    expected = sorted(
        [
            ("conda", "https://anaconda.org/conda-forge/21cmfast", artifacts_21cmfast),
            (
                "conda",
                "https://anaconda.org/conda-forge/lifetimes",
                artifacts_lifetimes,
            ),
        ],
        key=lambda triple: triple[1],
    )
    actual = [
        (origin.visit_type, origin.url, origin.extra_loader_arguments["artifacts"])
        for origin in sorted(listed, key=lambda origin: origin.url)
    ]

    assert len(listed) == len(expected)
    assert actual == expected