def parse_requirements(name=None):
    """Return the requirement specifiers listed in a requirements file.

    Args:
        name: optional suffix selecting ``requirements-<name>.txt``; when it is
            omitted (or falsy), ``requirements.txt`` is read instead.

    Returns:
        The stripped lines of the file, in file order, without blank lines and
        without lines starting with ``#``. An empty list is returned when the
        file does not exist.
    """
    reqf = "requirements-%s.txt" % name if name else "requirements.txt"
    if not path.exists(reqf):
        return []

    # Decode explicitly as UTF-8 (as this file does for README.md) so the result
    # does not depend on the locale of the machine running setup.py.
    with open(reqf, encoding="utf-8") as f:
        # Iterate the file lazily instead of loading every line up front.
        stripped_lines = (line.strip() for line in f)
        return [line for line in stripped_lines if line and not line.startswith("#")]
lister.gitea=swh.lister.gitea:register lister.github=swh.lister.github:register lister.gitlab=swh.lister.gitlab:register lister.gnu=swh.lister.gnu:register lister.golang=swh.lister.golang:register lister.launchpad=swh.lister.launchpad:register lister.npm=swh.lister.npm:register lister.opam=swh.lister.opam:register lister.packagist=swh.lister.packagist:register lister.phabricator=swh.lister.phabricator:register lister.pubdev=swh.lister.pubdev:register lister.pypi=swh.lister.pypi:register lister.sourceforge=swh.lister.sourceforge:register lister.tuleap=swh.lister.tuleap:register lister.maven=swh.lister.maven:register lister.gogs=swh.lister.gogs:register """, classifiers=[ "Programming Language :: Python :: 3", "Intended Audience :: Developers", "License :: OSI Approved :: GNU General Public License v3 (GPLv3)", "Operating System :: OS Independent", "Development Status :: 5 - Production/Stable", ], project_urls={ "Bug Reports": "https://forge.softwareheritage.org/maniphest", "Funding": "https://www.softwareheritage.org/donate", "Source": "https://forge.softwareheritage.org/source/swh-lister", "Documentation": "https://docs.softwareheritage.org/devel/swh-lister/", }, ) diff --git a/swh/lister/conda/__init__.py b/swh/lister/conda/__init__.py new file mode 100644 index 0000000..3cc6dd0 --- /dev/null +++ b/swh/lister/conda/__init__.py @@ -0,0 +1,124 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +""" +Conda lister +============ + +Anaconda is a package manager that provides tooling for datascience. + +The Conda lister list `packages`_ from Anaconda `repositories`_. +Those repositories host packages for several languages (Python, R) operating systems +and architecture. +Packages are grouped within free or commercial `channels`_. 
+ +To instantiate a conda lister we need to give some `channel` and `archs` arguments:: + + lister = CondaLister( + scheduler=swh_scheduler, channel="free", archs=["linux-64", "osx-64", "win-64"] + ) + +The default `url` value of the lister is `https://repo.anaconda.com/pkgs`. One can set +another repository url, for example:: + + lister = CondaLister( + scheduler=swh_scheduler, + url="https://conda.anaconda.org", + channel="conda-forge", + archs=["linux-64"], + ) + +Origins retrieving strategy +--------------------------- + +Each channel provides several `repodata.json`_ files that list available packages +and related versions. + +Given a channel and a list of systems and architectures, the lister downloads and +parses the corresponding repodata.json. + +We use the bz2-compressed version of repodata.json. See for example the +`main/linux-64`_ page to view available repodata files. + +Page listing +------------ + +The lister returns one page per channel / architecture that lists all available +package versions. + +Origins from page +----------------- + +Origin urls are built following this pattern `https://anaconda.org/{channel}/{pkgname}`. +Each origin is yielded with an `artifacts` entry in `extra_loader_arguments` that +lists artifact metadata for each archived package version. 
+ +Origin data example for one origin with two related versions.:: + + { + "url": "https://anaconda.org/conda-forge/lifetimes", + "artifacts": { + "linux-64/0.11.1-py36h9f0ad1d_1": { + "url": "https://conda.anaconda.org/conda-forge/linux-64/lifetimes-0.11.1-py36h9f0ad1d_1.tar.bz2", # noqa: B950 + "date": "2020-07-06T12:19:36.425000+00:00", + "version": "0.11.1", + "filename": "lifetimes-0.11.1-py36h9f0ad1d_1.tar.bz2", + "checksums": { + "md5": "faa398f7ba0d60ce44aa6eeded490cee", + "sha256": "f82a352dfae8abceeeaa538b220fd9c5e4aa4e59092a6a6cea70b9ec0581ea03", # noqa: B950 + }, + }, + "linux-64/0.11.1-py36hc560c46_1": { + "url": "https://conda.anaconda.org/conda-forge/linux-64/lifetimes-0.11.1-py36hc560c46_1.tar.bz2", # noqa: B950 + "date": "2020-07-06T12:19:37.032000+00:00", + "version": "0.11.1", + "filename": "lifetimes-0.11.1-py36hc560c46_1.tar.bz2", + "checksums": { + "md5": "c53a689a4c5948e84211bdfc23e3fe68", + "sha256": "76146c2ebd6e3b65928bde53a2585287759d77beba785c0eeb889ee565c0035d", # noqa: B950 + }, + }, + }, + } + +Running tests +------------- + +Activate the virtualenv and run from within swh-lister directory:: + + pytest -s -vv --log-cli-level=DEBUG swh/lister/conda/tests + +Testing with Docker +------------------- + +Change directory to swh/docker then launch the docker environment:: + + docker compose up -d + +Then schedule a conda listing task:: + + docker compose exec swh-scheduler swh scheduler task add -p oneshot list-conda channel="free" archs="[linux-64, osx-64, win-64]" # noqa: B950 + +You can follow lister execution by displaying logs of swh-lister service:: + + docker compose logs -f swh-lister + +.. _packages: https://docs.anaconda.com/anaconda/packages/pkg-docs/ +.. _Anaconda: https://anaconda.com/ +.. _repositories: https://repo.anaconda.com/pkgs/ +.. _channels: https://docs.anaconda.com/anaconda/user-guide/tasks/using-repositories/ +.. _main/linux-64: https://repo.anaconda.com/pkgs/main/linux-64/ +.. 
def register():
    """Describe the conda lister plugin.

    Returns:
        A dict with the lister class under ``lister`` and, under
        ``task_modules``, a one-element list naming the ``tasks`` module of
        this package.
    """
    # Imported lazily so that importing this package does not import the lister.
    from .lister import CondaLister

    tasks_module = f"{__name__}.tasks"
    return {"lister": CondaLister, "task_modules": [tasks_module]}
# A page is an (arch, packages) pair, where packages maps an archive filename to
# the metadata found for it in the "packages" section of a repodata.json file.
CondaListerPage = Tuple[str, Dict[str, Dict[str, Any]]]


class CondaLister(StatelessLister[CondaListerPage]):
    """List Conda (anaconda.com) origins.

    For each requested architecture, the lister downloads the bz2-compressed
    ``repodata.json`` of the configured channel and turns every package file it
    describes into an artifact attached to the origin of its package.

    Args:
        scheduler: scheduler interface, forwarded to the base lister
        credentials: optional credentials, forwarded to the base lister
        url: base url of the repository hosting the channels
        channel: name of the channel to list
        archs: names of the architecture sub-directories to list (for example
            ``linux-64``); no page is fetched when it is empty or omitted
    """

    LISTER_NAME = "conda"
    VISIT_TYPE = "conda"
    INSTANCE = "conda"
    BASE_REPO_URL = "https://repo.anaconda.com/pkgs"
    # Location of the compressed package index of one channel / architecture.
    REPO_URL_PATTERN = "{url}/{channel}/{arch}/repodata.json.bz2"
    # Url identifying a package (an origin), independently of the architecture.
    ORIGIN_URL_PATTERN = "https://anaconda.org/{channel}/{pkgname}"
    # Download url of one package archive.
    ARCHIVE_URL_PATTERN = "{url}/{channel}/{arch}/{filename}"

    def __init__(
        self,
        scheduler: SchedulerInterface,
        credentials: Optional[CredentialsType] = None,
        url: str = BASE_REPO_URL,
        channel: str = "",
        archs: Optional[List[str]] = None,
    ):
        super().__init__(
            scheduler=scheduler,
            credentials=credentials,
            instance=self.INSTANCE,
            url=url,
        )
        self.channel: str = channel
        # A fresh list per instance: a mutable default (or the caller's own
        # list) must not end up shared between lister instances.
        self.archs: List[str] = list(archs) if archs is not None else []
        # Package name -> {version key -> artifact}, accumulated over all pages.
        self.packages: Dict[str, Any] = defaultdict(dict)
        # Package name -> dates of the artifacts seen so far for that package.
        self.package_dates: Dict[str, Any] = defaultdict(list)

    def get_pages(self) -> Iterator[CondaListerPage]:
        """Yield one ``(arch, packages)`` page per configured architecture.

        Each page is built by fetching the channel's ``repodata.json.bz2`` for
        that architecture, decompressing it and keeping its ``packages`` entry.
        """
        for arch in self.archs:
            repodata_url = self.REPO_URL_PATTERN.format(
                url=self.url, channel=self.channel, arch=arch
            )
            response = self.http_request(url=repodata_url)
            packages = json.loads(bz2.decompress(response.content))["packages"]
            yield (arch, packages)

    def get_origins_from_page(self, page: CondaListerPage) -> Iterator[ListedOrigin]:
        """Yield a ListedOrigin for every package file described in ``page``.

        Artifacts are accumulated per package name on the lister instance, so
        the ``artifacts`` mapping attached to a yielded origin also holds the
        entries collected from previously processed files and pages.

        Raises:
            KeyError: if a package entry lacks ``name``, ``version`` or
                ``build``.
        """
        assert self.lister_obj.id is not None
        arch, packages = page

        for filename, package_metadata in packages.items():
            name = package_metadata["name"]
            version = package_metadata["version"]

            # Explicit annotation: the nested "checksums" dict is filled below.
            artifact: Dict[str, Any] = {
                "filename": filename,
                "url": self.ARCHIVE_URL_PATTERN.format(
                    url=self.url,
                    channel=self.channel,
                    filename=filename,
                    arch=arch,
                ),
                "version": version,
                "checksums": {},
            }

            # Only report the checksums actually present in the metadata.
            for checksum in ("md5", "sha256"):
                if checksum in package_metadata:
                    artifact["checksums"][checksum] = package_metadata[checksum]

            # The architecture is part of the key: the same version/build pair
            # can be listed for several architectures of one package.
            version_key = f"{arch}/{version}-{package_metadata['build']}"
            self.packages[name][version_key] = artifact

            package_date = None
            if "timestamp" in package_metadata:
                # The timestamp is divided by 1e3 before being read as a POSIX
                # timestamp in UTC.
                package_date = datetime.datetime.fromtimestamp(
                    package_metadata["timestamp"] / 1e3, datetime.timezone.utc
                )
            elif "date" in package_metadata:
                package_date = iso8601.parse_date(package_metadata["date"])

            # last_update stays None for files that carry no date information.
            last_update = None
            if package_date:
                artifact["date"] = package_date.isoformat()
                self.package_dates[name].append(package_date)
                last_update = max(self.package_dates[name])

            yield ListedOrigin(
                lister_id=self.lister_obj.id,
                visit_type=self.VISIT_TYPE,
                url=self.ORIGIN_URL_PATTERN.format(
                    channel=self.channel, pkgname=name
                ),
                last_update=last_update,
                extra_loader_arguments={
                    "artifacts": self.packages[name],
                },
            )
b/swh/lister/conda/tests/data/https_conda.anaconda.org/conda-forge_linux-64_repodata.json.bz2 new file mode 100644 index 0000000..253d200 Binary files /dev/null and b/swh/lister/conda/tests/data/https_conda.anaconda.org/conda-forge_linux-64_repodata.json.bz2 differ diff --git a/swh/lister/conda/tests/data/https_repo.anaconda.com/pkgs_free_linux-64_repodata.json.bz2 b/swh/lister/conda/tests/data/https_repo.anaconda.com/pkgs_free_linux-64_repodata.json.bz2 new file mode 100644 index 0000000..ecd16b0 Binary files /dev/null and b/swh/lister/conda/tests/data/https_repo.anaconda.com/pkgs_free_linux-64_repodata.json.bz2 differ diff --git a/swh/lister/conda/tests/data/https_repo.anaconda.com/pkgs_free_osx-64_repodata.json.bz2 b/swh/lister/conda/tests/data/https_repo.anaconda.com/pkgs_free_osx-64_repodata.json.bz2 new file mode 100644 index 0000000..e096fce Binary files /dev/null and b/swh/lister/conda/tests/data/https_repo.anaconda.com/pkgs_free_osx-64_repodata.json.bz2 differ diff --git a/swh/lister/conda/tests/data/https_repo.anaconda.com/pkgs_free_win-64_repodata.json.bz2 b/swh/lister/conda/tests/data/https_repo.anaconda.com/pkgs_free_win-64_repodata.json.bz2 new file mode 100644 index 0000000..868512b Binary files /dev/null and b/swh/lister/conda/tests/data/https_repo.anaconda.com/pkgs_free_win-64_repodata.json.bz2 differ diff --git a/swh/lister/conda/tests/data/https_repo.anaconda.com/pkgs_main_linux-64_repodata.json.bz2 b/swh/lister/conda/tests/data/https_repo.anaconda.com/pkgs_main_linux-64_repodata.json.bz2 new file mode 100644 index 0000000..42cb71a Binary files /dev/null and b/swh/lister/conda/tests/data/https_repo.anaconda.com/pkgs_main_linux-64_repodata.json.bz2 differ diff --git a/swh/lister/conda/tests/data/https_repo.anaconda.com/pkgs_pro_linux-64_repodata.json.bz2 b/swh/lister/conda/tests/data/https_repo.anaconda.com/pkgs_pro_linux-64_repodata.json.bz2 new file mode 100644 index 0000000..94bc540 Binary files /dev/null and 
def test_conda_lister_conda_forge_channel(
    datadir, requests_mock_datadir, swh_scheduler
):
    """List the linux-64 packages of the conda-forge channel.

    The lister is pointed at https://conda.anaconda.org and is expected to
    produce one page and two origins, whose artifacts are then compared with
    the expected metadata.
    """
    # NOTE(review): requests_mock_datadir presumably serves the repodata files
    # stored under tests/data - confirm against the fixture definition.
    conda_lister = CondaLister(
        scheduler=swh_scheduler,
        url="https://conda.anaconda.org",
        channel="conda-forge",
        archs=["linux-64"],
    )
    stats = conda_lister.run()

    assert stats.pages == 1
    assert stats.origins == 2

    listed_origins = swh_scheduler.get_listed_origins(
        conda_lister.lister_obj.id
    ).results

    expected_origins = [
        {
            "url": "https://anaconda.org/conda-forge/21cmfast",
            "artifacts": {
                "linux-64/3.0.2-py36h1af98f8_1": {
                    "url": "https://conda.anaconda.org/conda-forge/linux-64/21cmfast-3.0.2-py36h1af98f8_1.tar.bz2",  # noqa: B950
                    "date": "2020-11-11T16:04:49.658000+00:00",
                    "version": "3.0.2",
                    "filename": "21cmfast-3.0.2-py36h1af98f8_1.tar.bz2",
                    "checksums": {
                        "md5": "d65ab674acf3b7294ebacaec05fc5b54",
                        "sha256": "1154fceeb5c4ee9bb97d245713ac21eb1910237c724d2b7103747215663273c2",  # noqa: B950
                    },
                }
            },
        },
        {
            "url": "https://anaconda.org/conda-forge/lifetimes",
            "artifacts": {
                "linux-64/0.11.1-py36h9f0ad1d_1": {
                    "url": "https://conda.anaconda.org/conda-forge/linux-64/lifetimes-0.11.1-py36h9f0ad1d_1.tar.bz2",  # noqa: B950
                    "date": "2020-07-06T12:19:36.425000+00:00",
                    "version": "0.11.1",
                    "filename": "lifetimes-0.11.1-py36h9f0ad1d_1.tar.bz2",
                    "checksums": {
                        "md5": "faa398f7ba0d60ce44aa6eeded490cee",
                        "sha256": "f82a352dfae8abceeeaa538b220fd9c5e4aa4e59092a6a6cea70b9ec0581ea03",  # noqa: B950
                    },
                },
                "linux-64/0.11.1-py36hc560c46_1": {
                    "url": "https://conda.anaconda.org/conda-forge/linux-64/lifetimes-0.11.1-py36hc560c46_1.tar.bz2",  # noqa: B950
                    "date": "2020-07-06T12:19:37.032000+00:00",
                    "version": "0.11.1",
                    "filename": "lifetimes-0.11.1-py36hc560c46_1.tar.bz2",
                    "checksums": {
                        "md5": "c53a689a4c5948e84211bdfc23e3fe68",
                        "sha256": "76146c2ebd6e3b65928bde53a2585287759d77beba785c0eeb889ee565c0035d",  # noqa: B950
                    },
                },
            },
        },
    ]

    assert len(listed_origins) == len(expected_origins)

    # Compare both sides ordered by origin url, as (visit type, url, artifacts).
    actual = []
    for origin in sorted(listed_origins, key=lambda origin: origin.url):
        artifacts = origin.extra_loader_arguments["artifacts"]
        actual.append((origin.visit_type, origin.url, artifacts))

    wanted = []
    for entry in sorted(expected_origins, key=lambda entry: entry["url"]):
        wanted.append(("conda", entry["url"], entry["artifacts"]))

    assert actual == wanted
def test_conda_lister(swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker):
    """Run the CondaListerTask celery task against a mocked CondaLister.

    The task result must be the dict form of the stats returned by the mocked
    ``run()``, and both ``from_configfile`` and ``run`` must have been called
    exactly once, without arguments.
    """
    expected_stats = ListerStats(pages=42, origins=42)

    # Replace the lister class referenced by the tasks module; from_configfile()
    # returns the mock itself so that run() can be stubbed on the same object.
    mocked_lister = mocker.patch("swh.lister.conda.tasks.CondaLister")
    mocked_lister.from_configfile.return_value = mocked_lister
    mocked_lister.run.return_value = expected_stats

    task_result = swh_scheduler_celery_app.send_task(
        "swh.lister.conda.tasks.CondaListerTask"
    )
    assert task_result
    task_result.wait()
    assert task_result.successful()
    assert task_result.result == expected_stats.dict()

    mocked_lister.from_configfile.assert_called_once_with()
    mocked_lister.run.assert_called_once_with()