diff --git a/README.md b/README.md
--- a/README.md
+++ b/README.md
@@ -27,6 +27,7 @@
 - `swh.lister.tuleap`
 - `swh.lister.gogs`
 - `swh.liser.fedora`
+- `swh.lister.hex`
 
 Dependencies
 ------------
diff --git a/setup.py b/setup.py
--- a/setup.py
+++ b/setup.py
@@ -87,6 +87,7 @@
         lister.maven=swh.lister.maven:register
         lister.gogs=swh.lister.gogs:register
         lister.fedora=swh.lister.fedora:register
+        lister.hex=swh.lister.hex:register
     """,
     classifiers=[
         "Programming Language :: Python :: 3",
diff --git a/swh/lister/hex/__init__.py b/swh/lister/hex/__init__.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/hex/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+def register():
+    from .lister import HexLister
+
+    return {
+        "lister": HexLister,
+        "task_modules": [f"{__name__}.tasks"],
+    }
diff --git a/swh/lister/hex/lister.py b/swh/lister/hex/lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/hex/lister.py
@@ -0,0 +1,138 @@
+# Copyright (C) 2021-2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from dataclasses import asdict, dataclass
+import logging
+from typing import Any, Dict, Iterator, List
+from urllib.parse import urljoin
+
+import iso8601
+
+from swh.scheduler.interface import SchedulerInterface
+from swh.scheduler.model import ListedOrigin
+
+from ..pattern import CredentialsType, Lister
+
+logger = logging.getLogger(__name__)
+
+HexListerPage = List[Dict[str, Any]]
+
+
+def get_tar_url(pkg_name: str, release_version: str) -> str:
+    """Return the repo.hex.pm tarball URL of one release of a package."""
+    return f"https://repo.hex.pm/tarballs/{pkg_name}-{release_version}.tar"
+
+
+@dataclass
+class HexListerState:
+    """The HexLister instance state. This is used for incremental listing."""
+
+    last_page_id: int = 1
+    """Id of the page the next incremental pass starts from"""
+    last_pkg_name: str = ""
+    """Name of the last package of the most recently committed page"""
+
+
+class HexLister(Lister[HexListerState, HexListerPage]):
+    """List origins from the "Hex" forge."""
+
+    LISTER_NAME = "hex"
+    VISIT_TYPE = "hex"
+
+    HEX_API_URL = "https://hex.pm/api/"
+    PACKAGES_PATH = "packages/"
+
+    def __init__(
+        self,
+        scheduler: SchedulerInterface,
+        instance: str = "hex",
+        credentials: CredentialsType = None,
+    ):
+        super().__init__(
+            scheduler=scheduler,
+            credentials=credentials,
+            url=self.HEX_API_URL,
+            instance=instance,
+        )
+        # TODO: Add authentication support
+
+        self.session.headers.update({"Accept": "application/json"})
+
+    def state_from_dict(self, d: Dict[str, Any]) -> HexListerState:
+        return HexListerState(**d)
+
+    def state_to_dict(self, state: HexListerState) -> Dict[str, Any]:
+        return asdict(state)
+
+    def get_pages(self) -> Iterator[HexListerPage]:
+        """Yield package pages, starting from the page id stored in the state.
+
+        Every fetched page is yielded, including the final empty one that ends
+        the listing.
+        """
+        page_id = 1
+        if self.state.last_page_id is not None:
+            page_id = self.state.last_page_id
+
+        url = urljoin(self.url, self.PACKAGES_PATH)
+
+        while True:
+            body = self.http_request(
+                url,
+                params={
+                    "page": page_id,
+                    "sort": "name",
+                },  # sort=name is actually the default
+            ).json()
+
+            yield body
+
+            page_id += 1  # Consider stopping before yielding?
+
+            if not body:
+                break  # Consider stopping if number of items < 100?
+
+    def get_origins_from_page(self, page: HexListerPage) -> Iterator[ListedOrigin]:
+        """Yield one ListedOrigin per Hex package of the given page"""
+        assert self.lister_obj.id is not None
+
+        for pkg in page:
+
+            yield ListedOrigin(
+                lister_id=self.lister_obj.id,
+                visit_type=self.VISIT_TYPE,
+                url=pkg["html_url"],
+                last_update=iso8601.parse_date(pkg["updated_at"]),
+                extra_loader_arguments={
+                    "releases": {
+                        release["url"]: {
+                            "package": pkg["name"],
+                            "version": release["version"],
+                            "tar_url": get_tar_url(pkg["name"], release["version"]),
+                        }
+                        for release in pkg["releases"]
+                    }
+                },
+            )
+
+    def commit_page(self, page: HexListerPage) -> None:
+        """Advance the stored package name and page id past a non-empty page."""
+        if not page:
+            return
+
+        last_pkg_name = page[-1]["name"]
+
+        # incoming page should have alphabetically greater
+        # last package name than the one stored in the state
+        if last_pkg_name > self.state.last_pkg_name:
+            self.state.last_pkg_name = last_pkg_name
+            self.state.last_page_id += 1
+
+    def finalize(self) -> None:
+        """Set ``updated`` if the page id moved past the scheduler's stored one."""
+        scheduler_state = self.get_state_from_scheduler()
+
+        if self.state.last_page_id > scheduler_state.last_page_id:
+            self.updated = True
diff --git a/swh/lister/hex/tasks.py b/swh/lister/hex/tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/hex/tasks.py
@@ -0,0 +1,23 @@
+# Copyright (C) 2022 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from typing import Dict, Optional
+
+from celery import shared_task
+
+from .lister import HexLister
+
+
+@shared_task(name=__name__ + ".FullHexRelister")
+def list_hex_full(
+    instance: Optional[str] = None,
+) -> Dict[str, int]:
+    """Full update of a Hex.pm instance"""
+    lister = HexLister.from_configfile(instance=instance)
+    return lister.run().dict()
+
+
+@shared_task(name=__name__ + ".ping")
+def _ping() -> str:
+    return "OK"
diff --git a/swh/lister/hex/tests/__init__.py b/swh/lister/hex/tests/__init__.py
new file mode
100644 diff --git a/swh/lister/hex/tests/data/https_hex.pm/page1.json b/swh/lister/hex/tests/data/https_hex.pm/page1.json new file mode 100644 --- /dev/null +++ b/swh/lister/hex/tests/data/https_hex.pm/page1.json @@ -0,0 +1,190 @@ +[ + { + "configs": { + "erlang.mk": "dep_aadya = hex 0.1.0", + "mix.exs": "{:aadya, \"~> 0.1.0\"}", + "rebar.config": "{aadya, \"0.1.0\"}" + }, + "docs_html_url": "https://hexdocs.pm/aadya/", + "downloads": { + "all": 4199, + "recent": 2 + }, + "html_url": "https://hex.pm/packages/aadya", + "inserted_at": "2018-03-12T02:13:42.826404Z", + "latest_stable_version": "0.1.0", + "latest_version": "0.1.0", + "meta": { + "description": "CoAP framework", + "licenses": [ + "GNU Lesser General Public License v3.0" + ], + "links": { + "GitHub": "https://gitlab.com/ahamtech/coap/aadya.git" + }, + "maintainers": [ + "Anwesh Reddy", + "Mahesh Reddy", + "Malreddy Ankanna" + ] + }, + "name": "aadya", + "releases": [ + { + "has_docs": true, + "inserted_at": null, + "url": "https://hex.pm/api/packages/aadya/releases/0.1.0", + "version": "0.1.0" + } + ], + "repository": "hexpm", + "retirements": {}, + "updated_at": "2018-03-12T02:19:58.150334Z", + "url": "https://hex.pm/api/packages/aadya" + }, + { + "configs": { + "erlang.mk": "dep_active_job = hex 0.1.1", + "mix.exs": "{:active_job, \"~> 0.1.1\"}", + "rebar.config": "{active_job, \"0.1.1\"}" + }, + "docs_html_url": null, + "downloads": { + "all": 575, + "recent": 8 + }, + "html_url": "https://hex.pm/packages/active_job", + "inserted_at": "2022-05-04T05:07:26.204862Z", + "latest_stable_version": "0.1.1", + "latest_version": "0.1.1", + "meta": { + "description": "Declare job workers that can be run by a variety of queuing backends. 
This plugin is a port of the Rails ActiveJob gem", + "licenses": [ + "MIT" + ], + "links": { + "GitHub": "https://github.com/chaskiq/ex-rails/active_job" + }, + "maintainers": [] + }, + "name": "active_job", + "releases": [ + { + "has_docs": false, + "inserted_at": null, + "url": "https://hex.pm/api/packages/active_job/releases/0.1.1", + "version": "0.1.1" + }, + { + "has_docs": false, + "inserted_at": null, + "url": "https://hex.pm/api/packages/active_job/releases/0.1.0", + "version": "0.1.0" + } + ], + "repository": "hexpm", + "retirements": {}, + "updated_at": "2022-06-17T07:01:32.486546Z", + "url": "https://hex.pm/api/packages/active_job" + }, + { + "configs": { + "erlang.mk": "dep_active_jorb = hex 0.1.2", + "mix.exs": "{:active_jorb, \"~> 0.1.2\"}", + "rebar.config": "{active_jorb, \"0.1.2\"}" + }, + "docs_html_url": "https://hexdocs.pm/active_jorb/", + "downloads": { + "all": 7148, + "recent": 10 + }, + "html_url": "https://hex.pm/packages/active_jorb", + "inserted_at": "2018-04-10T17:35:34.698754Z", + "latest_stable_version": "0.1.2", + "latest_version": "0.1.2", + "meta": { + "description": "A library to enqueue jobs with your Active Job job processor. 
You may want\nthis when strangling your Rails project.", + "licenses": [ + "MIT" + ], + "links": { + "Github": "https://github.com/PrecisionNutrition/active_jorb" + }, + "maintainers": [ + "James Herdman" + ] + }, + "name": "active_jorb", + "releases": [ + { + "has_docs": true, + "inserted_at": null, + "url": "https://hex.pm/api/packages/active_jorb/releases/0.1.2", + "version": "0.1.2" + }, + { + "has_docs": true, + "inserted_at": null, + "url": "https://hex.pm/api/packages/active_jorb/releases/0.1.1", + "version": "0.1.1" + }, + { + "has_docs": true, + "inserted_at": null, + "url": "https://hex.pm/api/packages/active_jorb/releases/0.1.0", + "version": "0.1.0" + } + ], + "repository": "hexpm", + "retirements": {}, + "updated_at": "2018-04-24T17:42:25.744971Z", + "url": "https://hex.pm/api/packages/active_jorb" + }, + { + "configs": { + "erlang.mk": "dep_acx = hex 0.0.2", + "mix.exs": "{:acx, \"~> 0.0.2\"}", + "rebar.config": "{acx, \"0.0.2\"}" + }, + "docs_html_url": "https://hexdocs.pm/acx/", + "downloads": { + "all": 4790, + "recent": 8 + }, + "html_url": "https://hex.pm/packages/acx", + "inserted_at": "2018-01-22T06:52:21.027352Z", + "latest_stable_version": "0.0.2", + "latest_version": "0.0.2", + "meta": { + "description": "A Elixir wrap for API of Acx.io exchange.", + "licenses": [ + "MIT" + ], + "links": { + "Github": "https://github.com/2pd/acx-elixir" + }, + "maintainers": [ + "Liang Shi" + ] + }, + "name": "acx", + "releases": [ + { + "has_docs": true, + "inserted_at": null, + "url": "https://hex.pm/api/packages/acx/releases/0.0.2", + "version": "0.0.2" + }, + { + "has_docs": true, + "inserted_at": null, + "url": "https://hex.pm/api/packages/acx/releases/0.0.1", + "version": "0.0.1" + } + ], + "repository": "hexpm", + "retirements": {}, + "updated_at": "2018-01-30T04:56:03.053561Z", + "url": "https://hex.pm/api/packages/acx" + } +] \ No newline at end of file diff --git a/swh/lister/hex/tests/data/https_hex.pm/page2.json 
b/swh/lister/hex/tests/data/https_hex.pm/page2.json new file mode 100644 --- /dev/null +++ b/swh/lister/hex/tests/data/https_hex.pm/page2.json @@ -0,0 +1,223 @@ +[ + { + "configs": { + "erlang.mk": "dep_adam7 = hex 0.4.0", + "mix.exs": "{:adam7, \"~> 0.4.0\"}", + "rebar.config": "{adam7, \"0.4.0\"}" + }, + "docs_html_url": null, + "downloads": { + "all": 12746, + "recent": 27, + "week": 10 + }, + "html_url": "https://hex.pm/packages/adam7", + "inserted_at": "2015-10-10T05:09:04.399996Z", + "latest_stable_version": "0.4.0", + "latest_version": "0.4.0", + "meta": { + "description": "Adam7 interlacing library for Elixir.\nPrimarily used for interlacing and de-interlacing image data for PNGs.", + "licenses": [ + "MIT" + ], + "links": { + "github": "https://github.com/SenecaSystems/imagineer" + }, + "maintainers": [ + "Chris Maddox" + ] + }, + "name": "adam7", + "releases": [ + { + "has_docs": false, + "inserted_at": null, + "url": "https://hex.pm/api/packages/adam7/releases/0.4.0", + "version": "0.4.0" + }, + { + "has_docs": false, + "inserted_at": null, + "url": "https://hex.pm/api/packages/adam7/releases/0.3.0", + "version": "0.3.0" + }, + { + "has_docs": false, + "inserted_at": null, + "url": "https://hex.pm/api/packages/adam7/releases/0.2.0", + "version": "0.2.0" + }, + { + "has_docs": false, + "inserted_at": null, + "url": "https://hex.pm/api/packages/adam7/releases/0.1.1", + "version": "0.1.1" + }, + { + "has_docs": false, + "inserted_at": null, + "url": "https://hex.pm/api/packages/adam7/releases/0.1.0", + "version": "0.1.0" + } + ], + "repository": "hexpm", + "retirements": {}, + "updated_at": "2015-10-10T05:09:04.400005Z", + "url": "https://hex.pm/api/packages/adam7" + }, + { + "configs": { + "erlang.mk": "dep_addressBook = hex 0.1.1", + "mix.exs": "{:addressBook, \"~> 0.1.1\"}", + "rebar.config": "{addressBook, \"0.1.1\"}" + }, + "docs_html_url": "https://hexdocs.pm/addressBook/", + "downloads": { + "all": 4871, + "recent": 8, + "week": 4 + }, + "html_url": 
"https://hex.pm/packages/addressBook", + "inserted_at": "2017-06-05T19:59:12.978909Z", + "latest_stable_version": "0.1.1", + "latest_version": "0.1.1", + "meta": { + "description": "Simple package for managing address book.", + "licenses": [ + "Apache 2.0" + ], + "links": { + "GitHub": "https://github.com/maxiwoj/AddressBook" + }, + "maintainers": [ + "Maksymilian Wojczuk" + ] + }, + "name": "addressBook", + "releases": [ + { + "has_docs": true, + "inserted_at": null, + "url": "https://hex.pm/api/packages/addressBook/releases/0.1.1", + "version": "0.1.1" + }, + { + "has_docs": true, + "inserted_at": null, + "url": "https://hex.pm/api/packages/addressBook/releases/0.1.0", + "version": "0.1.0" + } + ], + "repository": "hexpm", + "retirements": {}, + "updated_at": "2017-06-05T21:06:42.788652Z", + "url": "https://hex.pm/api/packages/addressBook" + }, + { + "configs": { + "erlang.mk": "dep_address_us = hex 0.4.1", + "mix.exs": "{:address_us, \"~> 0.4.1\"}", + "rebar.config": "{address_us, \"0.4.1\"}" + }, + "docs_html_url": "https://hexdocs.pm/address_us/", + "downloads": { + "all": 55337, + "day": 2, + "recent": 7105, + "week": 194 + }, + "html_url": "https://hex.pm/packages/address_us", + "inserted_at": "2014-10-10T20:24:11.000000Z", + "latest_stable_version": "0.4.1", + "latest_version": "0.4.1", + "meta": { + "description": "Library for parsing US Addresses into their individual parts.", + "licenses": [ + "Apache 2.0" + ], + "links": { + "Docs": "https://smashedtoatoms.github.io/address_us", + "GitHub": "https://github.com/smashedtoatoms/address_us" + }, + "maintainers": [] + }, + "name": "address_us", + "releases": [ + { + "has_docs": true, + "inserted_at": null, + "url": "https://hex.pm/api/packages/address_us/releases/0.4.1", + "version": "0.4.1" + }, + { + "has_docs": true, + "inserted_at": null, + "url": "https://hex.pm/api/packages/address_us/releases/0.4.0", + "version": "0.4.0" + }, + { + "has_docs": true, + "inserted_at": null, + "url": 
"https://hex.pm/api/packages/address_us/releases/0.2.1", + "version": "0.2.1" + }, + { + "has_docs": true, + "inserted_at": null, + "url": "https://hex.pm/api/packages/address_us/releases/0.1.1", + "version": "0.1.1" + }, + { + "has_docs": false, + "inserted_at": null, + "url": "https://hex.pm/api/packages/address_us/releases/0.1.0", + "version": "0.1.0" + } + ], + "repository": "hexpm", + "retirements": {}, + "updated_at": "2020-12-11T05:07:11.118292Z", + "url": "https://hex.pm/api/packages/address_us" + }, + { + "configs": { + "erlang.mk": "dep_alchemy_vm = hex 0.8.1", + "mix.exs": "{:alchemy_vm, \"~> 0.8.1\"}", + "rebar.config": "{alchemy_vm, \"0.8.1\"}" + }, + "docs_html_url": "https://hexdocs.pm/alchemy_vm/", + "downloads": { + "all": 2368, + "recent": 3, + "week": 2 + }, + "html_url": "https://hex.pm/packages/alchemy_vm", + "inserted_at": "2019-03-27T00:32:40.709924Z", + "latest_stable_version": "0.8.1", + "latest_version": "0.8.1", + "meta": { + "description": "A WebAssembly Virtual Machine", + "licenses": [ + "MIT" + ], + "links": { + "Elixium Network Website": "https://www.elixiumnetwork.org", + "GitHub": "https://github.com/ElixiumNetwork/AlchemyVM" + }, + "maintainers": [] + }, + "name": "alchemy_vm", + "releases": [ + { + "has_docs": true, + "inserted_at": null, + "url": "https://hex.pm/api/packages/alchemy_vm/releases/0.8.1", + "version": "0.8.1" + } + ], + "repository": "hexpm", + "retirements": {}, + "updated_at": "2019-03-27T00:32:47.822901Z", + "url": "https://hex.pm/api/packages/alchemy_vm" + } +] \ No newline at end of file diff --git a/swh/lister/hex/tests/data/https_hex.pm/page3.json b/swh/lister/hex/tests/data/https_hex.pm/page3.json new file mode 100644 --- /dev/null +++ b/swh/lister/hex/tests/data/https_hex.pm/page3.json @@ -0,0 +1,108 @@ +[ + { + "configs": { + "erlang.mk": "dep_quagga_def = hex 0.4.0", + "mix.exs": "{:quagga_def, \"~> 0.4.0\"}", + "rebar.config": "{quagga_def, \"0.4.0\"}" + }, + "docs_html_url": 
"https://hexdocs.pm/quagga_def/", + "downloads": { + "all": 106, + "day": 12, + "recent": 106, + "week": 22 + }, + "html_url": "https://hex.pm/packages/quagga_def", + "inserted_at": "2022-10-12T07:03:48.666872Z", + "latest_stable_version": "0.4.0", + "latest_version": "0.4.0", + "meta": { + "description": "Quagga bamboo clump convention definitions and functions", + "licenses": [ + "MIT" + ], + "links": { + "GitHub": "https://github.com/mwmiller/quagga_def" + }, + "maintainers": [] + }, + "name": "quagga_def", + "releases": [ + { + "has_docs": true, + "inserted_at": null, + "url": "https://hex.pm/api/packages/quagga_def/releases/0.4.0", + "version": "0.4.0" + }, + { + "has_docs": true, + "inserted_at": null, + "url": "https://hex.pm/api/packages/quagga_def/releases/0.3.0", + "version": "0.3.0" + }, + { + "has_docs": true, + "inserted_at": null, + "url": "https://hex.pm/api/packages/quagga_def/releases/0.2.0", + "version": "0.2.0" + }, + { + "has_docs": true, + "inserted_at": null, + "url": "https://hex.pm/api/packages/quagga_def/releases/0.1.0", + "version": "0.1.0" + } + ], + "repository": "hexpm", + "retirements": {}, + "updated_at": "2022-11-29T11:41:15.862303Z", + "url": "https://hex.pm/api/packages/quagga_def" + }, + { + "configs": { + "erlang.mk": "dep_logger_dev = hex 0.1.1", + "mix.exs": "{:logger_dev, \"~> 0.1.1\"}", + "rebar.config": "{logger_dev, \"0.1.1\"}" + }, + "docs_html_url": "https://hexdocs.pm/logger_dev/", + "downloads": { + "all": 188, + "day": 4, + "recent": 188, + "week": 48 + }, + "html_url": "https://hex.pm/packages/logger_dev", + "inserted_at": "2022-09-08T21:37:20.359224Z", + "latest_stable_version": "0.1.1", + "latest_version": "0.1.1", + "meta": { + "description": "A more readable formatter for Logger.Backends.Console", + "licenses": [ + "MIT" + ], + "links": { + "GitHub": "https://github.com/protestContest/logger_dev" + }, + "maintainers": [] + }, + "name": "logger_dev", + "releases": [ + { + "has_docs": true, + "inserted_at": null, + 
"url": "https://hex.pm/api/packages/logger_dev/releases/0.1.1", + "version": "0.1.1" + }, + { + "has_docs": true, + "inserted_at": null, + "url": "https://hex.pm/api/packages/logger_dev/releases/0.1.0", + "version": "0.1.0" + } + ], + "repository": "hexpm", + "retirements": {}, + "updated_at": "2022-09-09T21:00:14.993273Z", + "url": "https://hex.pm/api/packages/logger_dev" + } +] \ No newline at end of file diff --git a/swh/lister/hex/tests/test_lister.py b/swh/lister/hex/tests/test_lister.py new file mode 100644 --- /dev/null +++ b/swh/lister/hex/tests/test_lister.py @@ -0,0 +1,141 @@ +import json +from pathlib import Path +from typing import List + +import pytest + +from swh.lister.hex.lister import HexLister, ListedOrigin +from swh.scheduler.interface import SchedulerInterface + + +@pytest.fixture +def hexpm_page(datadir): + def get_page(page_id: int): + # FIXME: Update the test data to match ?sort=name + text = Path(datadir, "https_hex.pm", f"page{page_id}.json").read_text() + page_result = json.loads(text) + origins = [origin["html_url"] for origin in page_result] + return origins, page_result + + return get_page + + +def check_listed_origins(lister_urls: List[str], scheduler_origins: List[ListedOrigin]): + """Asserts that the two collections have the same origin URLs.""" + assert set(lister_urls) == {origin.url for origin in scheduler_origins} + + +def test_full_lister_hex( + swh_scheduler: SchedulerInterface, + requests_mock, + hexpm_page, +): + """ + Simulate a full listing of packages for hex (erlang package manager) + """ + p1_origin_urls, p1_json = hexpm_page(1) + p2_origin_urls, p2_json = hexpm_page(2) + p3_origin_urls, p3_json = hexpm_page(3) + + requests_mock.get("https://hex.pm/api/packages/?page=1", json=p1_json) + requests_mock.get("https://hex.pm/api/packages/?page=2", json=p2_json) + requests_mock.get("https://hex.pm/api/packages/?page=3", json=p3_json) + requests_mock.get("https://hex.pm/api/packages/?page=4", json=[]) + + lister = 
HexLister(swh_scheduler)
+
+    stats = lister.run()
+    scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
+    lister_state = lister.get_state_from_scheduler()
+
+    assert stats.pages == 4
+    assert stats.origins == 10  # 4 + 4 + 2 + 0
+
+    check_listed_origins(
+        p1_origin_urls + p2_origin_urls + p3_origin_urls, scheduler_origins
+    )
+
+    assert lister_state.last_page_id == 4
+    assert lister_state.last_pkg_name == "logger_dev"
+    assert lister.updated
+
+
+def test_hex_incremental_lister(
+    swh_scheduler,
+    requests_mock,
+    hexpm_page,
+):
+    lister = HexLister(swh_scheduler)
+
+    # First run: P1 and P2 return 4 origins each and P3 returns 0
+    p1_origin_urls, p1_json = hexpm_page(1)
+    p2_origin_urls, p2_json = hexpm_page(2)
+
+    requests_mock.get("https://hex.pm/api/packages/?page=1", json=p1_json)
+    requests_mock.get("https://hex.pm/api/packages/?page=2", json=p2_json)
+    requests_mock.get("https://hex.pm/api/packages/?page=3", json=[])
+
+    stats = lister.run()
+
+    assert stats.pages == 3
+    assert stats.origins == 8
+
+    scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
+
+    lister_state = lister.get_state_from_scheduler()
+    assert lister_state.last_page_id == 3
+    assert lister.state.last_pkg_name == "alchemy_vm"
+    assert lister.updated
+
+    check_listed_origins(p1_origin_urls + p2_origin_urls, scheduler_origins)
+
+    lister.updated = False  # Reset the flag
+
+    # Second run: P3 isn't empty anymore
+    p3_origin_urls, p3_json = hexpm_page(3)
+
+    requests_mock.get("https://hex.pm/api/packages/?page=3", json=p3_json)
+    requests_mock.get(
+        "https://hex.pm/api/packages/?page=4", json=[]
+    )  # TODO: Try with 40x/50x here?
+
+    stats = lister.run()
+
+    assert stats.pages == 2
+    assert stats.origins == 2
+
+    scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
+
+    lister_state = lister.get_state_from_scheduler()
+    assert (
+        lister_state.last_page_id == 4
+    )  # TODO: Shouldn't this be 3 given that P4 is empty?
+    assert lister.state.last_pkg_name == "logger_dev"
+    assert lister.updated
+
+    check_listed_origins(
+        p1_origin_urls + p2_origin_urls + p3_origin_urls, scheduler_origins
+    )
+
+    lister.updated = False  # Reset the flag
+
+    # Third run: No new origins
+    # The lister resumes from the stored page id (P4), which is still empty
+
+    stats = lister.run()
+
+    assert stats.pages == 1
+    assert (
+        stats.origins == 0
+    )  # FIXME: inconsistent with Gogs lister. Either of them could be wrong
+
+    lister_state = lister.get_state_from_scheduler()
+    assert (
+        lister_state.last_page_id == 4
+    )  # TODO: Shouldn't this be 3 given that P4 is empty?
+    assert lister.state.last_pkg_name == "logger_dev"
+    assert lister.updated is False  # No new origins so state isn't updated
+
+    check_listed_origins(
+        p1_origin_urls + p2_origin_urls + p3_origin_urls, scheduler_origins
+    )
diff --git a/swh/lister/hex/tests/test_tasks.py b/swh/lister/hex/tests/test_tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/hex/tests/test_tasks.py
@@ -0,0 +1,56 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from unittest.mock import patch
+
+from swh.lister.pattern import ListerStats
+
+
+def test_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker):
+    res = swh_scheduler_celery_app.send_task("swh.lister.hex.tasks.ping")
+    assert res
+    res.wait()
+    assert res.successful()
+    assert res.result == "OK"
+
+
+@patch("swh.lister.hex.tasks.HexLister")
+def test_full_listing(lister, swh_scheduler_celery_app, swh_scheduler_celery_worker):
+    lister.from_configfile.return_value = lister
+    lister.run.return_value = ListerStats(pages=10, origins=500)
+
+    kwargs = dict()
+    res = swh_scheduler_celery_app.send_task(
+        "swh.lister.hex.tasks.FullHexRelister",
+        kwargs=kwargs,
+    )
+    assert res
+    res.wait()
+    assert res.successful()
+
+    actual_kwargs = dict(**kwargs, instance=None)
+
+    lister.from_configfile.assert_called_once_with(**actual_kwargs)
+    lister.run.assert_called_once_with()
+
+
+@patch("swh.lister.hex.tasks.HexLister")
+def test_full_listing_params(
+    lister, swh_scheduler_celery_app, swh_scheduler_celery_worker
+):
+    lister.from_configfile.return_value = lister
+    lister.run.return_value = ListerStats(pages=10, origins=500)
+
+    kwargs = dict(instance="hex.pm")
+    res = swh_scheduler_celery_app.send_task(
+        "swh.lister.hex.tasks.FullHexRelister",
+        kwargs=kwargs,
+    )
+    assert res
+    res.wait()
+    assert res.successful()
+
+    lister.from_configfile.assert_called_once_with(**kwargs)
+    lister.run.assert_called_once_with()
diff --git a/swh/lister/tests/test_cli.py b/swh/lister/tests/test_cli.py
--- a/swh/lister/tests/test_cli.py
+++ b/swh/lister/tests/test_cli.py
@@ -40,7 +40,7 @@
         "origin_upstream": "https://git.savannah.gnu.org/cgit/guix.git/",
     },
     "fedora": {
-        "url": "https://archives.fedoraproject.org/pub/archive/fedora/linux/releases//",
+        "url": "https://archives.fedoraproject.org/pub/archive/fedora/linux/releases/",
     },
 }
 