Page MenuHomeSoftware Heritage

D8517.diff
No OneTemporary

D8517.diff

diff --git a/setup.py b/setup.py
--- a/setup.py
+++ b/setup.py
@@ -60,6 +60,7 @@
lister.bitbucket=swh.lister.bitbucket:register
lister.bower=swh.lister.bower:register
lister.cgit=swh.lister.cgit:register
+ lister.conda=swh.lister.conda:register
lister.cran=swh.lister.cran:register
lister.crates=swh.lister.crates:register
lister.debian=swh.lister.debian:register
diff --git a/swh/lister/conda/__init__.py b/swh/lister/conda/__init__.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/conda/__init__.py
@@ -0,0 +1,124 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+"""
+Conda lister
+============
+
+ `Anaconda`_ is a package manager that provides tooling for data science.
+
+ The Conda lister lists `packages`_ from Anaconda `repositories`_.
+ Those repositories host packages for several languages (Python, R), operating systems
+ and architectures.
+Packages are grouped within free or commercial `channels`_.
+
+ To instantiate a conda lister we need to give some `channel` and `archs` arguments::
+
+ lister = CondaLister(
+ scheduler=swh_scheduler, channel="free", archs=["linux-64", "osx-64", "win-64"]
+ )
+
+The default `url` value of lister is `https://repo.anaconda.com/pkgs`. One can set another
+repository url, for example::
+
+ lister = CondaLister(
+ scheduler=swh_scheduler,
+ url="https://conda.anaconda.org",
+ channel="conda-forge",
+ archs=["linux-64"],
+ )
+
+Origins retrieving strategy
+---------------------------
+
+Each channel provides several `repodata.json`_ files that list available packages
+and related versions.
+
+ Given a channel and a list of systems and architectures, the lister downloads and
+ parses the corresponding repodata.json files.
+
+ We use the bz2-compressed version of repodata.json. See for example `main/linux-64`_ page
+to view available repodata files.
+
+Page listing
+------------
+
+ The lister returns one page per channel / architecture that lists all available package
+ versions.
+
+Origins from page
+-----------------
+
+ Origin urls are built following this pattern `https://anaconda.org/{channel}/{pkgname}`.
+ Each origin is yielded with an `artifacts` entry in `extra_loader_arguments` that lists
+ artifact metadata for each archived package version.
+
+ Origin data example for one origin with two related versions::
+
+ {
+ "url": "https://anaconda.org/conda-forge/lifetimes",
+ "artifacts": {
+ "linux-64/0.11.1-py36h9f0ad1d_1": {
+ "url": "https://conda.anaconda.org/conda-forge/linux-64/lifetimes-0.11.1-py36h9f0ad1d_1.tar.bz2", # noqa: B950
+ "date": "2020-07-06T12:19:36.425000+00:00",
+ "version": "0.11.1",
+ "filename": "lifetimes-0.11.1-py36h9f0ad1d_1.tar.bz2",
+ "checksums": {
+ "md5": "faa398f7ba0d60ce44aa6eeded490cee",
+ "sha256": "f82a352dfae8abceeeaa538b220fd9c5e4aa4e59092a6a6cea70b9ec0581ea03", # noqa: B950
+ },
+ },
+ "linux-64/0.11.1-py36hc560c46_1": {
+ "url": "https://conda.anaconda.org/conda-forge/linux-64/lifetimes-0.11.1-py36hc560c46_1.tar.bz2", # noqa: B950
+ "date": "2020-07-06T12:19:37.032000+00:00",
+ "version": "0.11.1",
+ "filename": "lifetimes-0.11.1-py36hc560c46_1.tar.bz2",
+ "checksums": {
+ "md5": "c53a689a4c5948e84211bdfc23e3fe68",
+ "sha256": "76146c2ebd6e3b65928bde53a2585287759d77beba785c0eeb889ee565c0035d", # noqa: B950
+ },
+ },
+ },
+ }
+
+Running tests
+-------------
+
+Activate the virtualenv and run from within swh-lister directory::
+
+ pytest -s -vv --log-cli-level=DEBUG swh/lister/conda/tests
+
+Testing with Docker
+-------------------
+
+Change directory to swh/docker then launch the docker environment::
+
+ docker compose up -d
+
+Then schedule a conda listing task::
+
+ docker compose exec swh-scheduler swh scheduler task add -p oneshot list-conda channel="free" archs="[linux-64, osx-64, win-64]" # noqa: B950
+
+You can follow lister execution by displaying logs of swh-lister service::
+
+ docker compose logs -f swh-lister
+
+.. _packages: https://docs.anaconda.com/anaconda/packages/pkg-docs/
+.. _Anaconda: https://anaconda.com/
+.. _repositories: https://repo.anaconda.com/pkgs/
+.. _channels: https://docs.anaconda.com/anaconda/user-guide/tasks/using-repositories/
+.. _main/linux-64: https://repo.anaconda.com/pkgs/main/linux-64/
+.. _repodata.json: https://repo.anaconda.com/pkgs/free/linux-64/repodata.json
+"""
+
+
+def register():
+ from .lister import CondaLister
+
+ return {
+ "lister": CondaLister,
+ "task_modules": ["%s.tasks" % __name__],
+ }
diff --git a/swh/lister/conda/lister.py b/swh/lister/conda/lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/conda/lister.py
@@ -0,0 +1,118 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import bz2
+from collections import defaultdict
+import datetime
+import json
+import logging
+from typing import Any, Dict, Iterator, List, Optional, Tuple
+
+import iso8601
+
+from swh.scheduler.interface import SchedulerInterface
+from swh.scheduler.model import ListedOrigin
+
+from ..pattern import CredentialsType, StatelessLister
+
+logger = logging.getLogger(__name__)
+
+# Aliasing the page results returned by `get_pages` method from the lister.
+CondaListerPage = Tuple[str, Dict[str, Dict[str, Any]]]
+
+
+class CondaLister(StatelessLister[CondaListerPage]):
+ """List Conda (anaconda.com) origins."""
+
+ LISTER_NAME = "conda"
+ VISIT_TYPE = "conda"
+ INSTANCE = "conda"
+ BASE_REPO_URL = "https://repo.anaconda.com/pkgs"
+ REPO_URL_PATTERN = "{url}/{channel}/{arch}/repodata.json.bz2"
+ ORIGIN_URL_PATTERN = "https://anaconda.org/{channel}/{pkgname}"
+ ARCHIVE_URL_PATTERN = "{url}/{channel}/{arch}/{filename}"
+
+ def __init__(
+ self,
+ scheduler: SchedulerInterface,
+ credentials: Optional[CredentialsType] = None,
+ url: str = BASE_REPO_URL,
+ channel: str = "",
+ archs: List = [],
+ ):
+ super().__init__(
+ scheduler=scheduler,
+ credentials=credentials,
+ instance=self.INSTANCE,
+ url=url,
+ )
+ self.channel: str = channel
+ self.archs: List[str] = archs
+ self.packages: Dict[str, Any] = defaultdict(dict)
+ self.package_dates: Dict[str, Any] = defaultdict(list)
+
+ def get_pages(self) -> Iterator[CondaListerPage]:
+ """Yield an iterator which returns 'page'"""
+
+ for arch in self.archs:
+ repodata_url = self.REPO_URL_PATTERN.format(
+ url=self.url, channel=self.channel, arch=arch
+ )
+ response = self.http_request(url=repodata_url)
+ packages = json.loads(bz2.decompress(response.content))["packages"]
+ yield (arch, packages)
+
+ def get_origins_from_page(self, page: CondaListerPage) -> Iterator[ListedOrigin]:
+ """Iterate on all pages and yield ListedOrigin instances."""
+ assert self.lister_obj.id is not None
+ arch, packages = page
+
+ for filename, package_metadata in packages.items():
+ artifact = {
+ "filename": filename,
+ "url": self.ARCHIVE_URL_PATTERN.format(
+ url=self.url,
+ channel=self.channel,
+ filename=filename,
+ arch=arch,
+ ),
+ "version": package_metadata["version"],
+ "checksums": {},
+ }
+
+ for checksum in ("md5", "sha256"):
+ if checksum in package_metadata:
+ artifact["checksums"][checksum] = package_metadata[checksum]
+
+ version_key = (
+ f"{arch}/{package_metadata['version']}-{package_metadata['build']}"
+ )
+ self.packages[package_metadata["name"]][version_key] = artifact
+
+ package_date = None
+ if "timestamp" in package_metadata:
+ package_date = datetime.datetime.fromtimestamp(
+ package_metadata["timestamp"] / 1e3, datetime.timezone.utc
+ )
+ elif "date" in package_metadata:
+ package_date = iso8601.parse_date(package_metadata["date"])
+
+ last_update = None
+ if package_date:
+ artifact["date"] = package_date.isoformat()
+ self.package_dates[package_metadata["name"]].append(package_date)
+ last_update = max(self.package_dates[package_metadata["name"]])
+
+ yield ListedOrigin(
+ lister_id=self.lister_obj.id,
+ visit_type=self.VISIT_TYPE,
+ url=self.ORIGIN_URL_PATTERN.format(
+ channel=self.channel, pkgname=package_metadata["name"]
+ ),
+ last_update=last_update,
+ extra_loader_arguments={
+ "artifacts": self.packages[package_metadata["name"]],
+ },
+ )
diff --git a/swh/lister/conda/tasks.py b/swh/lister/conda/tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/conda/tasks.py
@@ -0,0 +1,19 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from celery import shared_task
+
+from swh.lister.conda.lister import CondaLister
+
+
+@shared_task(name=__name__ + ".CondaListerTask")
+def list_conda(**lister_args):
+ """Lister task for Anaconda registry"""
+ return CondaLister.from_configfile(**lister_args).run().dict()
+
+
+@shared_task(name=__name__ + ".ping")
+def _ping():
+ return "OK"
diff --git a/swh/lister/conda/tests/__init__.py b/swh/lister/conda/tests/__init__.py
new file mode 100644
diff --git a/swh/lister/conda/tests/data/https_conda.anaconda.org/conda-forge_linux-64_repodata.json.bz2 b/swh/lister/conda/tests/data/https_conda.anaconda.org/conda-forge_linux-64_repodata.json.bz2
new file mode 100644
index 0000000000000000000000000000000000000000..0000000000000000000000000000000000000000
GIT binary patch
literal 0
Hc$@<O00001
literal 0
Hc$@<O00001
diff --git a/swh/lister/conda/tests/data/https_repo.anaconda.com/pkgs_free_linux-64_repodata.json.bz2 b/swh/lister/conda/tests/data/https_repo.anaconda.com/pkgs_free_linux-64_repodata.json.bz2
new file mode 100644
index 0000000000000000000000000000000000000000..0000000000000000000000000000000000000000
GIT binary patch
literal 0
Hc$@<O00001
literal 0
Hc$@<O00001
diff --git a/swh/lister/conda/tests/data/https_repo.anaconda.com/pkgs_free_osx-64_repodata.json.bz2 b/swh/lister/conda/tests/data/https_repo.anaconda.com/pkgs_free_osx-64_repodata.json.bz2
new file mode 100644
index 0000000000000000000000000000000000000000..0000000000000000000000000000000000000000
GIT binary patch
literal 0
Hc$@<O00001
literal 0
Hc$@<O00001
diff --git a/swh/lister/conda/tests/data/https_repo.anaconda.com/pkgs_free_win-64_repodata.json.bz2 b/swh/lister/conda/tests/data/https_repo.anaconda.com/pkgs_free_win-64_repodata.json.bz2
new file mode 100644
index 0000000000000000000000000000000000000000..0000000000000000000000000000000000000000
GIT binary patch
literal 0
Hc$@<O00001
literal 0
Hc$@<O00001
diff --git a/swh/lister/conda/tests/data/https_repo.anaconda.com/pkgs_main_linux-64_repodata.json.bz2 b/swh/lister/conda/tests/data/https_repo.anaconda.com/pkgs_main_linux-64_repodata.json.bz2
new file mode 100644
index 0000000000000000000000000000000000000000..0000000000000000000000000000000000000000
GIT binary patch
literal 0
Hc$@<O00001
literal 0
Hc$@<O00001
diff --git a/swh/lister/conda/tests/data/https_repo.anaconda.com/pkgs_pro_linux-64_repodata.json.bz2 b/swh/lister/conda/tests/data/https_repo.anaconda.com/pkgs_pro_linux-64_repodata.json.bz2
new file mode 100644
index 0000000000000000000000000000000000000000..0000000000000000000000000000000000000000
GIT binary patch
literal 0
Hc$@<O00001
literal 0
Hc$@<O00001
diff --git a/swh/lister/conda/tests/test_lister.py b/swh/lister/conda/tests/test_lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/conda/tests/test_lister.py
@@ -0,0 +1,94 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.lister.conda.lister import CondaLister
+
+
+def test_conda_lister_free_channel(datadir, requests_mock_datadir, swh_scheduler):
+ lister = CondaLister(
+ scheduler=swh_scheduler, channel="free", archs=["linux-64", "osx-64", "win-64"]
+ )
+ res = lister.run()
+
+ assert res.pages == 3
+ assert res.origins == 14
+
+
+def test_conda_lister_conda_forge_channel(
+ datadir, requests_mock_datadir, swh_scheduler
+):
+ lister = CondaLister(
+ scheduler=swh_scheduler,
+ url="https://conda.anaconda.org",
+ channel="conda-forge",
+ archs=["linux-64"],
+ )
+ res = lister.run()
+
+ assert res.pages == 1
+ assert res.origins == 2
+
+ scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
+
+ expected_origins = [
+ {
+ "url": "https://anaconda.org/conda-forge/21cmfast",
+ "artifacts": {
+ "linux-64/3.0.2-py36h1af98f8_1": {
+ "url": "https://conda.anaconda.org/conda-forge/linux-64/21cmfast-3.0.2-py36h1af98f8_1.tar.bz2", # noqa: B950
+ "date": "2020-11-11T16:04:49.658000+00:00",
+ "version": "3.0.2",
+ "filename": "21cmfast-3.0.2-py36h1af98f8_1.tar.bz2",
+ "checksums": {
+ "md5": "d65ab674acf3b7294ebacaec05fc5b54",
+ "sha256": "1154fceeb5c4ee9bb97d245713ac21eb1910237c724d2b7103747215663273c2", # noqa: B950
+ },
+ }
+ },
+ },
+ {
+ "url": "https://anaconda.org/conda-forge/lifetimes",
+ "artifacts": {
+ "linux-64/0.11.1-py36h9f0ad1d_1": {
+ "url": "https://conda.anaconda.org/conda-forge/linux-64/lifetimes-0.11.1-py36h9f0ad1d_1.tar.bz2", # noqa: B950
+ "date": "2020-07-06T12:19:36.425000+00:00",
+ "version": "0.11.1",
+ "filename": "lifetimes-0.11.1-py36h9f0ad1d_1.tar.bz2",
+ "checksums": {
+ "md5": "faa398f7ba0d60ce44aa6eeded490cee",
+ "sha256": "f82a352dfae8abceeeaa538b220fd9c5e4aa4e59092a6a6cea70b9ec0581ea03", # noqa: B950
+ },
+ },
+ "linux-64/0.11.1-py36hc560c46_1": {
+ "url": "https://conda.anaconda.org/conda-forge/linux-64/lifetimes-0.11.1-py36hc560c46_1.tar.bz2", # noqa: B950
+ "date": "2020-07-06T12:19:37.032000+00:00",
+ "version": "0.11.1",
+ "filename": "lifetimes-0.11.1-py36hc560c46_1.tar.bz2",
+ "checksums": {
+ "md5": "c53a689a4c5948e84211bdfc23e3fe68",
+ "sha256": "76146c2ebd6e3b65928bde53a2585287759d77beba785c0eeb889ee565c0035d", # noqa: B950
+ },
+ },
+ },
+ },
+ ]
+
+ assert len(scheduler_origins) == len(expected_origins)
+
+ assert [
+ (
+ scheduled.visit_type,
+ scheduled.url,
+ scheduled.extra_loader_arguments["artifacts"],
+ )
+ for scheduled in sorted(scheduler_origins, key=lambda scheduled: scheduled.url)
+ ] == [
+ (
+ "conda",
+ expected["url"],
+ expected["artifacts"],
+ )
+ for expected in sorted(expected_origins, key=lambda expected: expected["url"])
+ ]
diff --git a/swh/lister/conda/tests/test_tasks.py b/swh/lister/conda/tests/test_tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/conda/tests/test_tasks.py
@@ -0,0 +1,31 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.lister.pattern import ListerStats
+
+
+def test_conda_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker):
+ res = swh_scheduler_celery_app.send_task("swh.lister.conda.tasks.ping")
+ assert res
+ res.wait()
+ assert res.successful()
+ assert res.result == "OK"
+
+
+def test_conda_lister(swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker):
+ # setup the mocked CondaLister
+ lister = mocker.patch("swh.lister.conda.tasks.CondaLister")
+ lister.from_configfile.return_value = lister
+ stats = ListerStats(pages=42, origins=42)
+ lister.run.return_value = stats
+
+ res = swh_scheduler_celery_app.send_task("swh.lister.conda.tasks.CondaListerTask")
+ assert res
+ res.wait()
+ assert res.successful()
+ assert res.result == stats.dict()
+
+ lister.from_configfile.assert_called_once_with()
+ lister.run.assert_called_once_with()

File Metadata

Mime Type
text/plain
Expires
Thu, Jan 30, 4:31 PM (2 h, 13 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3219742

Event Timeline