diff --git a/swh/lister/gnu/__init__.py b/swh/lister/gnu/__init__.py --- a/swh/lister/gnu/__init__.py +++ b/swh/lister/gnu/__init__.py @@ -1,14 +1,12 @@ -# Copyright (C) 2019 the Software Heritage developers +# Copyright (C) 2019-2021 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information def register(): from .lister import GNULister - from .models import GNUModel return { - "models": [GNUModel], "lister": GNULister, "task_modules": ["%s.tasks" % __name__], } diff --git a/swh/lister/gnu/lister.py b/swh/lister/gnu/lister.py --- a/swh/lister/gnu/lister.py +++ b/swh/lister/gnu/lister.py @@ -1,112 +1,68 @@ -# Copyright (C) 2019 the Software Heritage developers +# Copyright (C) 2019-2021 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import logging -from typing import Any, Dict, List +from typing import Any, Iterator, Mapping -from requests import Response +import iso8601 -from swh.lister.core.simple_lister import SimpleLister -from swh.lister.gnu.models import GNUModel -from swh.lister.gnu.tree import GNUTree -from swh.scheduler import utils +from swh.scheduler.interface import SchedulerInterface +from swh.scheduler.model import ListedOrigin -logger = logging.getLogger(__name__) - - -class GNULister(SimpleLister): - MODEL = GNUModel - LISTER_NAME = "gnu" - instance = "gnu" +from ..pattern import CredentialsType, StatelessLister +from .tree import GNUTree - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.gnu_tree = GNUTree("https://ftp.gnu.org/tree.json.gz") +logger = logging.getLogger(__name__) - def task_dict(self, origin_type, origin_url, **kwargs): - """Return task format dict +GNUPageType = Mapping[str, Any] - This is overridden from the lister_base as more information is - needed for the ingestion task creation. - This creates tasks with args and kwargs set, for example: +class GNULister(StatelessLister[GNUPageType]): + """ + List all GNU projects and associated artifacts. + """ - .. code-block:: python + LISTER_NAME = "GNU" + GNU_FTP_URL = "https://ftp.gnu.org" - args: - kwargs: { - 'url': 'https://ftp.gnu.org/gnu/3dldf/', - 'artifacts': [{ - 'url': 'https://...', - 'time': '2003-12-09T21:43:20+00:00', - 'length': 128, - 'version': '1.0.1', - 'filename': 'something-1.0.1.tar.gz', - }, - ... - ] - } + def __init__( + self, scheduler: SchedulerInterface, credentials: CredentialsType = None, + ): + super().__init__( + scheduler=scheduler, + url=self.GNU_FTP_URL, + instance="GNU", + credentials=credentials, + ) + self.gnu_tree = GNUTree(f"{self.url}/tree.json.gz") + def get_pages(self) -> Iterator[GNUPageType]: """ - artifacts = self.gnu_tree.artifacts[origin_url] - assert origin_type == "tar" - return utils.create_task_dict( - "load-archive-files", - kwargs.get("policy", "oneshot"), - url=origin_url, - artifacts=artifacts, - retries_left=3, - ) + Yield a single page listing all GNU projects. + """ + yield self.gnu_tree.projects - def safely_issue_request(self, identifier: int) -> None: - """Bypass the implementation. It's now the GNUTree which deals with - querying the gnu mirror. + def get_origins_from_page(self, page: GNUPageType) -> Iterator[ListedOrigin]: + """ + Iterate on all GNU projects and yield ListedOrigin instances. + """ + assert self.lister_obj.id is not None - As an implementation detail, we cannot change simply the base - SimpleLister as other implementation still uses it. This shall be part - of another refactoring pass. + artifacts = self.gnu_tree.artifacts - """ - return None - - def list_packages(self, response: Response) -> List[Dict[str, Any]]: - """List the actual gnu origins (package name) with their name, url and - associated tarballs. - - Args: - response: Unused - - Returns: - List of packages name, url, last modification time:: - - [ - { - 'name': '3dldf', - 'url': 'https://ftp.gnu.org/gnu/3dldf/', - 'time_modified': '2003-12-09T20:43:20+00:00' - }, - { - 'name': '8sync', - 'url': 'https://ftp.gnu.org/gnu/8sync/', - 'time_modified': '2016-12-06T02:37:10+00:00' - }, - ... - ] + for project_name, project_info in page.items(): - """ - return list(self.gnu_tree.projects.values()) + origin_url = project_info["url"] + last_update = iso8601.parse_date(project_info["time_modified"]) - def get_model_from_repo(self, repo: Dict[str, Any]) -> Dict[str, Any]: - """Transform from repository representation to model + logger.debug("Found origin %s last updated on %s", origin_url, last_update) - """ - return { - "uid": repo["url"], - "name": repo["name"], - "full_name": repo["name"], - "html_url": repo["url"], - "origin_url": repo["url"], - "time_last_updated": repo["time_modified"], - "origin_type": "tar", - } + yield ListedOrigin( + lister_id=self.lister_obj.id, + url=origin_url, + visit_type="tar", + last_update=last_update, + extra_loader_arguments={"artifacts": artifacts[project_name]}, + ) diff --git a/swh/lister/gnu/models.py b/swh/lister/gnu/models.py deleted file mode 100644 --- a/swh/lister/gnu/models.py +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright (C) 2019 the Software Heritage developers -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -from sqlalchemy import Column, DateTime, String - -from ..core.models import ModelBase - - -class GNUModel(ModelBase): - """a GNU repository representation - - """ - - __tablename__ = "gnu_repo" - - uid = Column(String, primary_key=True) - time_last_updated = Column(DateTime) diff --git a/swh/lister/gnu/tasks.py b/swh/lister/gnu/tasks.py --- a/swh/lister/gnu/tasks.py +++ b/swh/lister/gnu/tasks.py @@ -1,4 +1,4 @@ -# Copyright (C) 2019 the Software Heritage developers +# Copyright (C) 2019-2021 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -10,7 +10,7 @@ @shared_task(name=__name__ + ".GNUListerTask") def list_gnu_full(**lister_args): """List lister for the GNU source code archive""" - return GNULister(**lister_args).run() + return GNULister.from_configfile(**lister_args).run().dict() @shared_task(name=__name__ + ".ping") diff --git a/swh/lister/gnu/tests/api_response.json b/swh/lister/gnu/tests/api_response.json deleted file mode 100644 --- a/swh/lister/gnu/tests/api_response.json +++ /dev/null @@ -1,37 +0,0 @@ -[{"type":"directory","name": ".","contents":[ - {"type":"file","name":".footer.shtml","size":444,"time":"1359994299"}, - {"type":"file","name":"find.txt.gz","size":261428,"time":"1557684608"}, - {"type":"directory","name":"gnu","size":12288,"time":"1556742017","contents":[]}, - {"type":"directory","name":"gnu+linux-distros","size":4096,"time":"1299783002","contents":[ - {"type":"directory","name":"ututo-e","size":4096,"time":"1487780066","contents":[ - {"type":"file","name":"README","size":48,"time":"1487780066"}, - {"type":"file","name":"index.html","size":158,"time":"1487780054"} - ]} - ]}, - {"type":"file","name":"ls-lrRt.txt.gz","size":480054,"time":"1557684607"}, - {"type":"directory","name":"mirrors","size":4096,"time":"1114010630","contents":[ - {"type":"directory","name":"dynebolic","size":4096,"time":"1317827602","contents":[ - {"type":"file","name":"MOVED_TO_mirror.fsf.org_dynebolic","size":0,"time":"1317826935"}, - {"type":"file","name":"index.html","size":107,"time":"1317827601"} - ]} - ]}, - {"type":"link","name":"non-gnu","target":"gnu/non-gnu","size":11,"time":"1082055542","contents":[]}, - {"type":"directory","name":"old-gnu","size":4096,"time":"1548360019","contents":[]}, - {"type":"link","name":"pub","target":".","size":1,"time":"1060090003","contents":[]}, - {"type":"directory","name":"savannah","size":4096,"time":"1194544006","contents":[ - {"type":"file","name":"README","size":473,"time":"1143758028"} - ]}, - {"type":"directory","name":"third-party","size":4096,"time":"1059825710","contents":[ - {"type":"file","name":"README","size":374,"time":"983824071"} - ]}, - {"type":"directory","name":"tmp","size":4096,"time":"1239072509","contents":[ - ]}, - {"type":"file","name":"tree.json.gz","size":0,"time":"1557684608"}, - {"type":"directory","name":"video","size":4096,"time":"1367963189","contents":[ - {"type":"file","name":".bash_history","size":27,"time":"1307027604"}, - {"type":"file","name":"stallmanupv.ogg.sig","size":536,"time":"1299776853"} - ]}, - {"type":"file","name":"welcome.msg","size":2830,"time":"1545163301"} -]}, -{"type":"report","directories":2743,"files":63983} -] diff --git a/swh/lister/gnu/tests/conftest.py b/swh/lister/gnu/tests/conftest.py deleted file mode 100644 --- a/swh/lister/gnu/tests/conftest.py +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright (C) 2019-2020 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import pytest - - -@pytest.fixture -def lister_under_test(): - return "gnu" - - -@pytest.fixture -def lister_gnu(swh_lister): - for task_type in [ - { - "type": "load-archive-files", - "description": "Load archive repository", - "backend_name": "swh.loader.packages.tasks.LoadArchive", - "default_interval": "1 day", - }, - ]: - swh_lister.scheduler.create_task_type(task_type) - - return swh_lister diff --git a/swh/lister/gnu/tests/find_tarballs_output.json b/swh/lister/gnu/tests/find_tarballs_output.json deleted file mode 100644 --- a/swh/lister/gnu/tests/find_tarballs_output.json +++ /dev/null @@ -1,182 +0,0 @@ -[ - { - "archive": "https://ftp.gnu.org/gnu/artanis/artanis-0.2.1.tar.bz2", - "date": "1495205979" - }, - { - "archive": "https://ftp.gnu.org/gnu/artanis/artanis-0.2.1.tar.gz", - "date": "1495205967" - }, - { - "archive": "https://ftp.gnu.org/gnu/artanis/artanis-0.2.12-f39e-dirty.tar.gz", - "date": "1494994222" - }, - { - "archive": "https://ftp.gnu.org/gnu/artanis/artanis-0.2.3.tar.bz2", - "date": "1520284021" - }, - { - "archive": "https://ftp.gnu.org/gnu/artanis/artanis-0.2.3.tar.gz", - "date": "1520284007" - }, - { - "archive": "https://ftp.gnu.org/gnu/artanis/artanis-0.2.4.tar.bz2", - "date": "1521742071" - }, - { - "archive": "https://ftp.gnu.org/gnu/artanis/artanis-0.2.4.tar.gz", - "date": "1521742057" - }, - { - "archive": "https://ftp.gnu.org/gnu/artanis/artanis-0.2.5.tar.bz2", - "date": "1525717261" - }, - { - "archive": "https://ftp.gnu.org/gnu/artanis/artanis-0.2.5.tar.gz", - "date": "1525717246" - }, - { - "archive": "https://ftp.gnu.org/gnu/artanis/artanis-0.3.1.tar.bz2", - "date": "1546205569" - }, - { - "archive": "https://ftp.gnu.org/gnu/artanis/artanis-0.3.1.tar.gz", - "date": "1546205555" - }, - { - "archive": "https://ftp.gnu.org/gnu/artanis/artanis-0.3.tar.bz2", - "date": "1546205025" - }, - { - "archive": "https://ftp.gnu.org/gnu/artanis/artanis-0.3.tar.gz", - "date": "1546205012" - }, - { - "archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_0_0-src.zip", - "date": "898422900" - }, - { - "archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_0_2-src.zip", - "date": "920018269" - }, - { - "archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_0_3-src.zip", - "date": "936750503" - }, - { - "archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_0_4-src.tar.gz", - "date": "944290190" - }, - { - "archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_0_5-src.tar.gz", - "date": "944600462" - }, - { - "archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_0_6-src.tar.gz", - "date": "952156231" - }, - { - "archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_0_7-src.tar.gz", - "date": "952313061" - }, - { - "archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_1_0-src.tar.gz", - "date": "969299378" - }, - { - "archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_2_0beta-src.tar.gz", - "date": "977027031" - }, - { - "archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_2_1-src.tar.gz", - "date": "981323331" - }, - { - "archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_2_2-src.tar.gz", - "date": "981570576" - }, - { - "archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_2_3-src.tar.gz", - "date": "982656672" - }, - { - "archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_2_4-src.tar.gz", - "date": "1007952574" - }, - { - "archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_2_5-src.tar.gz", - "date": "1008502483" - }, - { - "archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_2_6-src.tar.gz", - "date": "1012641285" - }, - { - "archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-3.6.2.tar.gz", - "date": "869814000" - }, - { - "archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.0.0.tar.gz", - "date": "898422900" - }, - { - "archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.0.2.tar.gz", - "date": "920018202" - }, - { - "archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.0.3.tar.gz", - "date": "936750512" - }, - { - "archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.0.4.tar.gz", - "date": "944290148" - }, - { - "archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.0.5.tar.gz", - "date": "944599461" - }, - { - "archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.0.6.tar.gz", - "date": "952156235" - }, - { - "archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.0.7.tar.gz", - "date": "952313085" - }, - { - "archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.1.0.tar.gz", - "date": "969299287" - }, - { - "archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.2.0beta.tar.gz", - "date": "977027108" - }, - { - "archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.2.1.tar.gz", - "date": "981323501" - }, - { - "archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.2.2.tar.gz", - "date": "981562809" - }, - { - "archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.2.3.tar.gz", - "date": "982657006" - }, - { - "archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.2.4.tar.gz", - "date": "1007952745" - }, - { - "archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.2.5.tar.gz", - "date": "1008466945" - }, - { - "archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.2.6.tar.gz", - "date": "1012641715" - }, - { - "archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.2.7.tar.gz", - "date": "1070057764" - } - ] \ No newline at end of file diff --git a/swh/lister/gnu/tests/test_lister.py b/swh/lister/gnu/tests/test_lister.py --- a/swh/lister/gnu/tests/test_lister.py +++ b/swh/lister/gnu/tests/test_lister.py @@ -1,47 +1,36 @@ -# Copyright (C) 2019-2020 The Software Heritage developers +# Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import logging +from ..lister import GNULister -logger = logging.getLogger(__name__) +def test_gnu_lister(swh_scheduler, requests_mock_datadir): + lister = GNULister(scheduler=swh_scheduler) -def test_gnu_lister(lister_gnu, requests_mock_datadir): - lister_gnu.run() + stats = lister.run() - r = lister_gnu.scheduler.search_tasks(task_type="load-archive-files") - assert len(r) == 383 + assert stats.pages == 1 + assert stats.origins == 383 - for row in r: - assert row["type"] == "load-archive-files" - # arguments check - args = row["arguments"]["args"] - assert len(args) == 0 + scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results - # kwargs - kwargs = row["arguments"]["kwargs"] - assert set(kwargs.keys()) == {"url", "artifacts"} + assert len(scheduler_origins) == stats.origins - url = kwargs["url"] - assert url.startswith("https://ftp.gnu.org") + for origin in scheduler_origins: + assert origin.url.startswith(GNULister.GNU_FTP_URL) + assert origin.last_update is not None + assert "artifacts" in origin.extra_loader_arguments + assert len(origin.extra_loader_arguments["artifacts"]) > 0 - url_suffix = url.split("https://ftp.gnu.org")[1] - assert "gnu" in url_suffix or "old-gnu" in url_suffix - artifacts = kwargs["artifacts"] - # check the artifact's structure - artifact = artifacts[0] - assert set(artifact.keys()) == {"url", "length", "time", "filename", "version"} - - for artifact in artifacts: - logger.debug(artifact) - # 'time' is an isoformat string now - for key in ["url", "time", "filename", "version"]: - assert isinstance(artifact[key], str) - assert isinstance(artifact["length"], int) - - assert row["policy"] == "oneshot" - assert row["priority"] is None - assert row["retries_left"] == 3 +def test_gnu_lister_from_configfile(swh_scheduler_config, mocker): + load_from_envvar = mocker.patch("swh.lister.pattern.load_from_envvar") + load_from_envvar.return_value = { + "scheduler": {"cls": "local", **swh_scheduler_config}, + "credentials": {}, + } + lister = GNULister.from_configfile() + assert lister.scheduler is not None + assert lister.credentials is not None diff --git a/swh/lister/gnu/tests/test_tasks.py b/swh/lister/gnu/tests/test_tasks.py --- a/swh/lister/gnu/tests/test_tasks.py +++ b/swh/lister/gnu/tests/test_tasks.py @@ -1,9 +1,9 @@ -# Copyright (C) 2019-2020 The Software Heritage developers +# Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from unittest.mock import patch +from swh.lister.pattern import ListerStats def test_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker): @@ -14,17 +14,17 @@ assert res.result == "OK" -@patch("swh.lister.gnu.tasks.GNULister") -def test_lister(lister, swh_scheduler_celery_app, swh_scheduler_celery_worker): - # setup the mocked GNULister - lister.return_value = lister - lister.run.return_value = None +def test_lister(swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker): + lister = mocker.patch("swh.lister.gnu.tasks.GNULister") + lister.from_configfile.return_value = lister + stats = ListerStats(pages=1, origins=300) + lister.run.return_value = stats res = swh_scheduler_celery_app.send_task("swh.lister.gnu.tasks.GNUListerTask") assert res res.wait() assert res.successful() + assert res.result == stats.dict() - lister.assert_called_once_with() - lister.db_last_index.assert_not_called() + lister.from_configfile.assert_called_once_with() lister.run.assert_called_once_with()