Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7163832
D8517.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
17 KB
Subscribers
None
D8517.diff
View Options
diff --git a/setup.py b/setup.py
--- a/setup.py
+++ b/setup.py
@@ -60,6 +60,7 @@
lister.bitbucket=swh.lister.bitbucket:register
lister.bower=swh.lister.bower:register
lister.cgit=swh.lister.cgit:register
+ lister.conda=swh.lister.conda:register
lister.cran=swh.lister.cran:register
lister.crates=swh.lister.crates:register
lister.debian=swh.lister.debian:register
diff --git a/swh/lister/conda/__init__.py b/swh/lister/conda/__init__.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/conda/__init__.py
@@ -0,0 +1,124 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+"""
+Conda lister
+============
+
+`Anaconda`_ is a package manager that provides tooling for data science.
+
+The Conda lister lists `packages`_ from Anaconda `repositories`_.
+Those repositories host packages for several languages (Python, R), operating systems
+and architectures.
+Packages are grouped within free or commercial `channels`_.
+
+To instantiate a conda lister we need to give some `channel` and `archs` arguments::
+
+ lister = CondaLister(
+ scheduler=swh_scheduler, channel="free", archs=["linux-64", "osx-64", "win-64"]
+ )
+
+The default `url` value of lister is `https://repo.anaconda.com/pkgs`. One can set another
+repository url, for example::
+
+ lister = CondaLister(
+ scheduler=swh_scheduler,
+ url="https://conda.anaconda.org",
+ channel="conda-forge",
+ archs=["linux-64"],
+ )
+
+Origins retrieving strategy
+---------------------------
+
+Each channel provides several `repodata.json`_ files that list available packages
+and related versions.
+
+Given a channel and a list of systems and architectures, the lister downloads and
+parses the corresponding repodata.json files.
+
+We use the bz2-compressed version of repodata.json. See for example the
+`main/linux-64`_ page to view available repodata files.
+
+Page listing
+------------
+
+The lister returns one page per channel / architecture that lists all available
+package versions.
+
+Origins from page
+-----------------
+
+Origins urls are built following this pattern `https://anaconda.org/{channel}/{pkgname}`.
+Each origin is yielded with an `artifacts` entry in `extra_loader_arguments` that
+lists artifact metadata for each archived package version.
+
+Origin data example for one origin with two related versions::
+
+ {
+ "url": "https://anaconda.org/conda-forge/lifetimes",
+ "artifacts": {
+ "linux-64/0.11.1-py36h9f0ad1d_1": {
+ "url": "https://conda.anaconda.org/conda-forge/linux-64/lifetimes-0.11.1-py36h9f0ad1d_1.tar.bz2", # noqa: B950
+ "date": "2020-07-06T12:19:36.425000+00:00",
+ "version": "0.11.1",
+ "filename": "lifetimes-0.11.1-py36h9f0ad1d_1.tar.bz2",
+ "checksums": {
+ "md5": "faa398f7ba0d60ce44aa6eeded490cee",
+ "sha256": "f82a352dfae8abceeeaa538b220fd9c5e4aa4e59092a6a6cea70b9ec0581ea03", # noqa: B950
+ },
+ },
+ "linux-64/0.11.1-py36hc560c46_1": {
+ "url": "https://conda.anaconda.org/conda-forge/linux-64/lifetimes-0.11.1-py36hc560c46_1.tar.bz2", # noqa: B950
+ "date": "2020-07-06T12:19:37.032000+00:00",
+ "version": "0.11.1",
+ "filename": "lifetimes-0.11.1-py36hc560c46_1.tar.bz2",
+ "checksums": {
+ "md5": "c53a689a4c5948e84211bdfc23e3fe68",
+ "sha256": "76146c2ebd6e3b65928bde53a2585287759d77beba785c0eeb889ee565c0035d", # noqa: B950
+ },
+ },
+ },
+ }
+
+Running tests
+-------------
+
+Activate the virtualenv and run from within swh-lister directory::
+
+ pytest -s -vv --log-cli-level=DEBUG swh/lister/conda/tests
+
+Testing with Docker
+-------------------
+
+Change directory to swh/docker then launch the docker environment::
+
+ docker compose up -d
+
+Then schedule a conda listing task::
+
+ docker compose exec swh-scheduler swh scheduler task add -p oneshot list-conda channel="free" archs="[linux-64, osx-64, win-64]" # noqa: B950
+
+You can follow lister execution by displaying logs of swh-lister service::
+
+ docker compose logs -f swh-lister
+
+.. _packages: https://docs.anaconda.com/anaconda/packages/pkg-docs/
+.. _Anaconda: https://anaconda.com/
+.. _repositories: https://repo.anaconda.com/pkgs/
+.. _channels: https://docs.anaconda.com/anaconda/user-guide/tasks/using-repositories/
+.. _main/linux-64: https://repo.anaconda.com/pkgs/main/linux-64/
+.. _repodata.json: https://repo.anaconda.com/pkgs/free/linux-64/repodata.json
+"""
+
+
+def register():
+ from .lister import CondaLister
+
+ return {
+ "lister": CondaLister,
+ "task_modules": ["%s.tasks" % __name__],
+ }
diff --git a/swh/lister/conda/lister.py b/swh/lister/conda/lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/conda/lister.py
@@ -0,0 +1,118 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import bz2
+from collections import defaultdict
+import datetime
+import json
+import logging
+from typing import Any, Dict, Iterator, List, Optional, Tuple
+
+import iso8601
+
+from swh.scheduler.interface import SchedulerInterface
+from swh.scheduler.model import ListedOrigin
+
+from ..pattern import CredentialsType, StatelessLister
+
+logger = logging.getLogger(__name__)
+
+# Aliasing the page results returned by `get_pages` method from the lister.
+CondaListerPage = Tuple[str, Dict[str, Dict[str, Any]]]
+
+
+class CondaLister(StatelessLister[CondaListerPage]):
+ """List Conda (anaconda.com) origins."""
+
+ LISTER_NAME = "conda"
+ VISIT_TYPE = "conda"
+ INSTANCE = "conda"
+ BASE_REPO_URL = "https://repo.anaconda.com/pkgs"
+ REPO_URL_PATTERN = "{url}/{channel}/{arch}/repodata.json.bz2"
+ ORIGIN_URL_PATTERN = "https://anaconda.org/{channel}/{pkgname}"
+ ARCHIVE_URL_PATTERN = "{url}/{channel}/{arch}/{filename}"
+
+ def __init__(
+ self,
+ scheduler: SchedulerInterface,
+ credentials: Optional[CredentialsType] = None,
+ url: str = BASE_REPO_URL,
+ channel: str = "",
+ archs: List = [],
+ ):
+ super().__init__(
+ scheduler=scheduler,
+ credentials=credentials,
+ instance=self.INSTANCE,
+ url=url,
+ )
+ self.channel: str = channel
+ self.archs: List[str] = archs
+ self.packages: Dict[str, Any] = defaultdict(dict)
+ self.package_dates: Dict[str, Any] = defaultdict(list)
+
+ def get_pages(self) -> Iterator[CondaListerPage]:
+ """Yield an iterator which returns 'page'"""
+
+ for arch in self.archs:
+ repodata_url = self.REPO_URL_PATTERN.format(
+ url=self.url, channel=self.channel, arch=arch
+ )
+ response = self.http_request(url=repodata_url)
+ packages = json.loads(bz2.decompress(response.content))["packages"]
+ yield (arch, packages)
+
+ def get_origins_from_page(self, page: CondaListerPage) -> Iterator[ListedOrigin]:
+ """Iterate on all pages and yield ListedOrigin instances."""
+ assert self.lister_obj.id is not None
+ arch, packages = page
+
+ for filename, package_metadata in packages.items():
+ artifact = {
+ "filename": filename,
+ "url": self.ARCHIVE_URL_PATTERN.format(
+ url=self.url,
+ channel=self.channel,
+ filename=filename,
+ arch=arch,
+ ),
+ "version": package_metadata["version"],
+ "checksums": {},
+ }
+
+ for checksum in ("md5", "sha256"):
+ if checksum in package_metadata:
+ artifact["checksums"][checksum] = package_metadata[checksum]
+
+ version_key = (
+ f"{arch}/{package_metadata['version']}-{package_metadata['build']}"
+ )
+ self.packages[package_metadata["name"]][version_key] = artifact
+
+ package_date = None
+ if "timestamp" in package_metadata:
+ package_date = datetime.datetime.fromtimestamp(
+ package_metadata["timestamp"] / 1e3, datetime.timezone.utc
+ )
+ elif "date" in package_metadata:
+ package_date = iso8601.parse_date(package_metadata["date"])
+
+ last_update = None
+ if package_date:
+ artifact["date"] = package_date.isoformat()
+ self.package_dates[package_metadata["name"]].append(package_date)
+ last_update = max(self.package_dates[package_metadata["name"]])
+
+ yield ListedOrigin(
+ lister_id=self.lister_obj.id,
+ visit_type=self.VISIT_TYPE,
+ url=self.ORIGIN_URL_PATTERN.format(
+ channel=self.channel, pkgname=package_metadata["name"]
+ ),
+ last_update=last_update,
+ extra_loader_arguments={
+ "artifacts": self.packages[package_metadata["name"]],
+ },
+ )
diff --git a/swh/lister/conda/tasks.py b/swh/lister/conda/tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/conda/tasks.py
@@ -0,0 +1,19 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from celery import shared_task
+
+from swh.lister.conda.lister import CondaLister
+
+
+@shared_task(name=__name__ + ".CondaListerTask")
+def list_conda(**lister_args):
+ """Lister task for Anaconda registry"""
+ return CondaLister.from_configfile(**lister_args).run().dict()
+
+
+@shared_task(name=__name__ + ".ping")
+def _ping():
+ return "OK"
diff --git a/swh/lister/conda/tests/__init__.py b/swh/lister/conda/tests/__init__.py
new file mode 100644
diff --git a/swh/lister/conda/tests/data/https_conda.anaconda.org/conda-forge_linux-64_repodata.json.bz2 b/swh/lister/conda/tests/data/https_conda.anaconda.org/conda-forge_linux-64_repodata.json.bz2
new file mode 100644
index 0000000000000000000000000000000000000000..0000000000000000000000000000000000000000
GIT binary patch
literal 0
Hc$@<O00001
literal 0
Hc$@<O00001
diff --git a/swh/lister/conda/tests/data/https_repo.anaconda.com/pkgs_free_linux-64_repodata.json.bz2 b/swh/lister/conda/tests/data/https_repo.anaconda.com/pkgs_free_linux-64_repodata.json.bz2
new file mode 100644
index 0000000000000000000000000000000000000000..0000000000000000000000000000000000000000
GIT binary patch
literal 0
Hc$@<O00001
literal 0
Hc$@<O00001
diff --git a/swh/lister/conda/tests/data/https_repo.anaconda.com/pkgs_free_osx-64_repodata.json.bz2 b/swh/lister/conda/tests/data/https_repo.anaconda.com/pkgs_free_osx-64_repodata.json.bz2
new file mode 100644
index 0000000000000000000000000000000000000000..0000000000000000000000000000000000000000
GIT binary patch
literal 0
Hc$@<O00001
literal 0
Hc$@<O00001
diff --git a/swh/lister/conda/tests/data/https_repo.anaconda.com/pkgs_free_win-64_repodata.json.bz2 b/swh/lister/conda/tests/data/https_repo.anaconda.com/pkgs_free_win-64_repodata.json.bz2
new file mode 100644
index 0000000000000000000000000000000000000000..0000000000000000000000000000000000000000
GIT binary patch
literal 0
Hc$@<O00001
literal 0
Hc$@<O00001
diff --git a/swh/lister/conda/tests/data/https_repo.anaconda.com/pkgs_main_linux-64_repodata.json.bz2 b/swh/lister/conda/tests/data/https_repo.anaconda.com/pkgs_main_linux-64_repodata.json.bz2
new file mode 100644
index 0000000000000000000000000000000000000000..0000000000000000000000000000000000000000
GIT binary patch
literal 0
Hc$@<O00001
literal 0
Hc$@<O00001
diff --git a/swh/lister/conda/tests/data/https_repo.anaconda.com/pkgs_pro_linux-64_repodata.json.bz2 b/swh/lister/conda/tests/data/https_repo.anaconda.com/pkgs_pro_linux-64_repodata.json.bz2
new file mode 100644
index 0000000000000000000000000000000000000000..0000000000000000000000000000000000000000
GIT binary patch
literal 0
Hc$@<O00001
literal 0
Hc$@<O00001
diff --git a/swh/lister/conda/tests/test_lister.py b/swh/lister/conda/tests/test_lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/conda/tests/test_lister.py
@@ -0,0 +1,94 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.lister.conda.lister import CondaLister
+
+
+def test_conda_lister_free_channel(datadir, requests_mock_datadir, swh_scheduler):
+ lister = CondaLister(
+ scheduler=swh_scheduler, channel="free", archs=["linux-64", "osx-64", "win-64"]
+ )
+ res = lister.run()
+
+ assert res.pages == 3
+ assert res.origins == 14
+
+
+def test_conda_lister_conda_forge_channel(
+ datadir, requests_mock_datadir, swh_scheduler
+):
+ lister = CondaLister(
+ scheduler=swh_scheduler,
+ url="https://conda.anaconda.org",
+ channel="conda-forge",
+ archs=["linux-64"],
+ )
+ res = lister.run()
+
+ assert res.pages == 1
+ assert res.origins == 2
+
+ scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
+
+ expected_origins = [
+ {
+ "url": "https://anaconda.org/conda-forge/21cmfast",
+ "artifacts": {
+ "linux-64/3.0.2-py36h1af98f8_1": {
+ "url": "https://conda.anaconda.org/conda-forge/linux-64/21cmfast-3.0.2-py36h1af98f8_1.tar.bz2", # noqa: B950
+ "date": "2020-11-11T16:04:49.658000+00:00",
+ "version": "3.0.2",
+ "filename": "21cmfast-3.0.2-py36h1af98f8_1.tar.bz2",
+ "checksums": {
+ "md5": "d65ab674acf3b7294ebacaec05fc5b54",
+ "sha256": "1154fceeb5c4ee9bb97d245713ac21eb1910237c724d2b7103747215663273c2", # noqa: B950
+ },
+ }
+ },
+ },
+ {
+ "url": "https://anaconda.org/conda-forge/lifetimes",
+ "artifacts": {
+ "linux-64/0.11.1-py36h9f0ad1d_1": {
+ "url": "https://conda.anaconda.org/conda-forge/linux-64/lifetimes-0.11.1-py36h9f0ad1d_1.tar.bz2", # noqa: B950
+ "date": "2020-07-06T12:19:36.425000+00:00",
+ "version": "0.11.1",
+ "filename": "lifetimes-0.11.1-py36h9f0ad1d_1.tar.bz2",
+ "checksums": {
+ "md5": "faa398f7ba0d60ce44aa6eeded490cee",
+ "sha256": "f82a352dfae8abceeeaa538b220fd9c5e4aa4e59092a6a6cea70b9ec0581ea03", # noqa: B950
+ },
+ },
+ "linux-64/0.11.1-py36hc560c46_1": {
+ "url": "https://conda.anaconda.org/conda-forge/linux-64/lifetimes-0.11.1-py36hc560c46_1.tar.bz2", # noqa: B950
+ "date": "2020-07-06T12:19:37.032000+00:00",
+ "version": "0.11.1",
+ "filename": "lifetimes-0.11.1-py36hc560c46_1.tar.bz2",
+ "checksums": {
+ "md5": "c53a689a4c5948e84211bdfc23e3fe68",
+ "sha256": "76146c2ebd6e3b65928bde53a2585287759d77beba785c0eeb889ee565c0035d", # noqa: B950
+ },
+ },
+ },
+ },
+ ]
+
+ assert len(scheduler_origins) == len(expected_origins)
+
+ assert [
+ (
+ scheduled.visit_type,
+ scheduled.url,
+ scheduled.extra_loader_arguments["artifacts"],
+ )
+ for scheduled in sorted(scheduler_origins, key=lambda scheduled: scheduled.url)
+ ] == [
+ (
+ "conda",
+ expected["url"],
+ expected["artifacts"],
+ )
+ for expected in sorted(expected_origins, key=lambda expected: expected["url"])
+ ]
diff --git a/swh/lister/conda/tests/test_tasks.py b/swh/lister/conda/tests/test_tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/conda/tests/test_tasks.py
@@ -0,0 +1,31 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.lister.pattern import ListerStats
+
+
+def test_conda_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker):
+ res = swh_scheduler_celery_app.send_task("swh.lister.conda.tasks.ping")
+ assert res
+ res.wait()
+ assert res.successful()
+ assert res.result == "OK"
+
+
+def test_conda_lister(swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker):
+ # setup the mocked CondaLister
+ lister = mocker.patch("swh.lister.conda.tasks.CondaLister")
+ lister.from_configfile.return_value = lister
+ stats = ListerStats(pages=42, origins=42)
+ lister.run.return_value = stats
+
+ res = swh_scheduler_celery_app.send_task("swh.lister.conda.tasks.CondaListerTask")
+ assert res
+ res.wait()
+ assert res.successful()
+ assert res.result == stats.dict()
+
+ lister.from_configfile.assert_called_once_with()
+ lister.run.assert_called_once_with()
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Thu, Jan 30, 4:31 PM (2 h, 13 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3219742
Attached To
D8517: Conda: List origins from anaconda.com, the Package, dependency and environment management for any language
Event Timeline
Log In to Comment