Page MenuHomeSoftware Heritage

D4969.diff
No OneTemporary

D4969.diff

diff --git a/swh/lister/gnu/__init__.py b/swh/lister/gnu/__init__.py
--- a/swh/lister/gnu/__init__.py
+++ b/swh/lister/gnu/__init__.py
@@ -1,14 +1,12 @@
-# Copyright (C) 2019 the Software Heritage developers
+# Copyright (C) 2019-2021 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
def register():
from .lister import GNULister
- from .models import GNUModel
return {
- "models": [GNUModel],
"lister": GNULister,
"task_modules": ["%s.tasks" % __name__],
}
diff --git a/swh/lister/gnu/lister.py b/swh/lister/gnu/lister.py
--- a/swh/lister/gnu/lister.py
+++ b/swh/lister/gnu/lister.py
@@ -1,112 +1,68 @@
-# Copyright (C) 2019 the Software Heritage developers
+# Copyright (C) 2019-2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import logging
-from typing import Any, Dict, List
+from typing import Any, Iterator, Mapping
-from requests import Response
+import iso8601
-from swh.lister.core.simple_lister import SimpleLister
-from swh.lister.gnu.models import GNUModel
-from swh.lister.gnu.tree import GNUTree
-from swh.scheduler import utils
+from swh.scheduler.interface import SchedulerInterface
+from swh.scheduler.model import ListedOrigin
-logger = logging.getLogger(__name__)
-
-
-class GNULister(SimpleLister):
- MODEL = GNUModel
- LISTER_NAME = "gnu"
- instance = "gnu"
+from ..pattern import CredentialsType, StatelessLister
+from .tree import GNUTree
- def __init__(self, *args, **kwargs):
- super().__init__(*args, **kwargs)
- self.gnu_tree = GNUTree("https://ftp.gnu.org/tree.json.gz")
+logger = logging.getLogger(__name__)
- def task_dict(self, origin_type, origin_url, **kwargs):
- """Return task format dict
+GNUPageType = Mapping[str, Any]
- This is overridden from the lister_base as more information is
- needed for the ingestion task creation.
- This creates tasks with args and kwargs set, for example:
+class GNULister(StatelessLister[GNUPageType]):
+ """
+ List all GNU projects and associated artifacts.
+ """
- .. code-block:: python
+ LISTER_NAME = "GNU"
+ GNU_FTP_URL = "https://ftp.gnu.org"
- args:
- kwargs: {
- 'url': 'https://ftp.gnu.org/gnu/3dldf/',
- 'artifacts': [{
- 'url': 'https://...',
- 'time': '2003-12-09T21:43:20+00:00',
- 'length': 128,
- 'version': '1.0.1',
- 'filename': 'something-1.0.1.tar.gz',
- },
- ...
- ]
- }
+ def __init__(
+ self, scheduler: SchedulerInterface, credentials: CredentialsType = None,
+ ):
+ super().__init__(
+ scheduler=scheduler,
+ url=self.GNU_FTP_URL,
+ instance="GNU",
+ credentials=credentials,
+ )
+ self.gnu_tree = GNUTree(f"{self.url}/tree.json.gz")
+ def get_pages(self) -> Iterator[GNUPageType]:
"""
- artifacts = self.gnu_tree.artifacts[origin_url]
- assert origin_type == "tar"
- return utils.create_task_dict(
- "load-archive-files",
- kwargs.get("policy", "oneshot"),
- url=origin_url,
- artifacts=artifacts,
- retries_left=3,
- )
+ Yield a single page listing all GNU projects.
+ """
+ yield self.gnu_tree.projects
- def safely_issue_request(self, identifier: int) -> None:
- """Bypass the implementation. It's now the GNUTree which deals with
- querying the gnu mirror.
+ def get_origins_from_page(self, page: GNUPageType) -> Iterator[ListedOrigin]:
+ """
+ Iterate on all GNU projects and yield ListedOrigin instances.
+ """
+ assert self.lister_obj.id is not None
- As an implementation detail, we cannot change simply the base
- SimpleLister as other implementation still uses it. This shall be part
- of another refactoring pass.
+ artifacts = self.gnu_tree.artifacts
- """
- return None
-
- def list_packages(self, response: Response) -> List[Dict[str, Any]]:
- """List the actual gnu origins (package name) with their name, url and
- associated tarballs.
-
- Args:
- response: Unused
-
- Returns:
- List of packages name, url, last modification time::
-
- [
- {
- 'name': '3dldf',
- 'url': 'https://ftp.gnu.org/gnu/3dldf/',
- 'time_modified': '2003-12-09T20:43:20+00:00'
- },
- {
- 'name': '8sync',
- 'url': 'https://ftp.gnu.org/gnu/8sync/',
- 'time_modified': '2016-12-06T02:37:10+00:00'
- },
- ...
- ]
+ for project_name, project_info in page.items():
- """
- return list(self.gnu_tree.projects.values())
+ origin_url = project_info["url"]
+ last_update = iso8601.parse_date(project_info["time_modified"])
- def get_model_from_repo(self, repo: Dict[str, Any]) -> Dict[str, Any]:
- """Transform from repository representation to model
+ logger.debug("Found origin %s last updated on %s", origin_url, last_update)
- """
- return {
- "uid": repo["url"],
- "name": repo["name"],
- "full_name": repo["name"],
- "html_url": repo["url"],
- "origin_url": repo["url"],
- "time_last_updated": repo["time_modified"],
- "origin_type": "tar",
- }
+ yield ListedOrigin(
+ lister_id=self.lister_obj.id,
+ url=origin_url,
+ visit_type="tar",
+ last_update=last_update,
+ extra_loader_arguments={"artifacts": artifacts[project_name]},
+ )
diff --git a/swh/lister/gnu/models.py b/swh/lister/gnu/models.py
deleted file mode 100644
--- a/swh/lister/gnu/models.py
+++ /dev/null
@@ -1,18 +0,0 @@
-# Copyright (C) 2019 the Software Heritage developers
-# License: GNU General Public License version 3, or any later version
-# See top-level LICENSE file for more information
-
-from sqlalchemy import Column, DateTime, String
-
-from ..core.models import ModelBase
-
-
-class GNUModel(ModelBase):
- """a GNU repository representation
-
- """
-
- __tablename__ = "gnu_repo"
-
- uid = Column(String, primary_key=True)
- time_last_updated = Column(DateTime)
diff --git a/swh/lister/gnu/tasks.py b/swh/lister/gnu/tasks.py
--- a/swh/lister/gnu/tasks.py
+++ b/swh/lister/gnu/tasks.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2019 the Software Heritage developers
+# Copyright (C) 2019-2021 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -10,7 +10,7 @@
@shared_task(name=__name__ + ".GNUListerTask")
def list_gnu_full(**lister_args):
"""List lister for the GNU source code archive"""
- return GNULister(**lister_args).run()
+ return GNULister.from_configfile(**lister_args).run().dict()
@shared_task(name=__name__ + ".ping")
diff --git a/swh/lister/gnu/tests/api_response.json b/swh/lister/gnu/tests/api_response.json
deleted file mode 100644
--- a/swh/lister/gnu/tests/api_response.json
+++ /dev/null
@@ -1,37 +0,0 @@
-[{"type":"directory","name": ".","contents":[
- {"type":"file","name":".footer.shtml","size":444,"time":"1359994299"},
- {"type":"file","name":"find.txt.gz","size":261428,"time":"1557684608"},
- {"type":"directory","name":"gnu","size":12288,"time":"1556742017","contents":[]},
- {"type":"directory","name":"gnu+linux-distros","size":4096,"time":"1299783002","contents":[
- {"type":"directory","name":"ututo-e","size":4096,"time":"1487780066","contents":[
- {"type":"file","name":"README","size":48,"time":"1487780066"},
- {"type":"file","name":"index.html","size":158,"time":"1487780054"}
- ]}
- ]},
- {"type":"file","name":"ls-lrRt.txt.gz","size":480054,"time":"1557684607"},
- {"type":"directory","name":"mirrors","size":4096,"time":"1114010630","contents":[
- {"type":"directory","name":"dynebolic","size":4096,"time":"1317827602","contents":[
- {"type":"file","name":"MOVED_TO_mirror.fsf.org_dynebolic","size":0,"time":"1317826935"},
- {"type":"file","name":"index.html","size":107,"time":"1317827601"}
- ]}
- ]},
- {"type":"link","name":"non-gnu","target":"gnu/non-gnu","size":11,"time":"1082055542","contents":[]},
- {"type":"directory","name":"old-gnu","size":4096,"time":"1548360019","contents":[]},
- {"type":"link","name":"pub","target":".","size":1,"time":"1060090003","contents":[]},
- {"type":"directory","name":"savannah","size":4096,"time":"1194544006","contents":[
- {"type":"file","name":"README","size":473,"time":"1143758028"}
- ]},
- {"type":"directory","name":"third-party","size":4096,"time":"1059825710","contents":[
- {"type":"file","name":"README","size":374,"time":"983824071"}
- ]},
- {"type":"directory","name":"tmp","size":4096,"time":"1239072509","contents":[
- ]},
- {"type":"file","name":"tree.json.gz","size":0,"time":"1557684608"},
- {"type":"directory","name":"video","size":4096,"time":"1367963189","contents":[
- {"type":"file","name":".bash_history","size":27,"time":"1307027604"},
- {"type":"file","name":"stallmanupv.ogg.sig","size":536,"time":"1299776853"}
- ]},
- {"type":"file","name":"welcome.msg","size":2830,"time":"1545163301"}
-]},
-{"type":"report","directories":2743,"files":63983}
-]
diff --git a/swh/lister/gnu/tests/conftest.py b/swh/lister/gnu/tests/conftest.py
deleted file mode 100644
--- a/swh/lister/gnu/tests/conftest.py
+++ /dev/null
@@ -1,26 +0,0 @@
-# Copyright (C) 2019-2020 The Software Heritage developers
-# See the AUTHORS file at the top-level directory of this distribution
-# License: GNU General Public License version 3, or any later version
-# See top-level LICENSE file for more information
-
-import pytest
-
-
-@pytest.fixture
-def lister_under_test():
- return "gnu"
-
-
-@pytest.fixture
-def lister_gnu(swh_lister):
- for task_type in [
- {
- "type": "load-archive-files",
- "description": "Load archive repository",
- "backend_name": "swh.loader.packages.tasks.LoadArchive",
- "default_interval": "1 day",
- },
- ]:
- swh_lister.scheduler.create_task_type(task_type)
-
- return swh_lister
diff --git a/swh/lister/gnu/tests/find_tarballs_output.json b/swh/lister/gnu/tests/find_tarballs_output.json
deleted file mode 100644
--- a/swh/lister/gnu/tests/find_tarballs_output.json
+++ /dev/null
@@ -1,182 +0,0 @@
-[
- {
- "archive": "https://ftp.gnu.org/gnu/artanis/artanis-0.2.1.tar.bz2",
- "date": "1495205979"
- },
- {
- "archive": "https://ftp.gnu.org/gnu/artanis/artanis-0.2.1.tar.gz",
- "date": "1495205967"
- },
- {
- "archive": "https://ftp.gnu.org/gnu/artanis/artanis-0.2.12-f39e-dirty.tar.gz",
- "date": "1494994222"
- },
- {
- "archive": "https://ftp.gnu.org/gnu/artanis/artanis-0.2.3.tar.bz2",
- "date": "1520284021"
- },
- {
- "archive": "https://ftp.gnu.org/gnu/artanis/artanis-0.2.3.tar.gz",
- "date": "1520284007"
- },
- {
- "archive": "https://ftp.gnu.org/gnu/artanis/artanis-0.2.4.tar.bz2",
- "date": "1521742071"
- },
- {
- "archive": "https://ftp.gnu.org/gnu/artanis/artanis-0.2.4.tar.gz",
- "date": "1521742057"
- },
- {
- "archive": "https://ftp.gnu.org/gnu/artanis/artanis-0.2.5.tar.bz2",
- "date": "1525717261"
- },
- {
- "archive": "https://ftp.gnu.org/gnu/artanis/artanis-0.2.5.tar.gz",
- "date": "1525717246"
- },
- {
- "archive": "https://ftp.gnu.org/gnu/artanis/artanis-0.3.1.tar.bz2",
- "date": "1546205569"
- },
- {
- "archive": "https://ftp.gnu.org/gnu/artanis/artanis-0.3.1.tar.gz",
- "date": "1546205555"
- },
- {
- "archive": "https://ftp.gnu.org/gnu/artanis/artanis-0.3.tar.bz2",
- "date": "1546205025"
- },
- {
- "archive": "https://ftp.gnu.org/gnu/artanis/artanis-0.3.tar.gz",
- "date": "1546205012"
- },
- {
- "archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_0_0-src.zip",
- "date": "898422900"
- },
- {
- "archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_0_2-src.zip",
- "date": "920018269"
- },
- {
- "archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_0_3-src.zip",
- "date": "936750503"
- },
- {
- "archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_0_4-src.tar.gz",
- "date": "944290190"
- },
- {
- "archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_0_5-src.tar.gz",
- "date": "944600462"
- },
- {
- "archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_0_6-src.tar.gz",
- "date": "952156231"
- },
- {
- "archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_0_7-src.tar.gz",
- "date": "952313061"
- },
- {
- "archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_1_0-src.tar.gz",
- "date": "969299378"
- },
- {
- "archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_2_0beta-src.tar.gz",
- "date": "977027031"
- },
- {
- "archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_2_1-src.tar.gz",
- "date": "981323331"
- },
- {
- "archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_2_2-src.tar.gz",
- "date": "981570576"
- },
- {
- "archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_2_3-src.tar.gz",
- "date": "982656672"
- },
- {
- "archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_2_4-src.tar.gz",
- "date": "1007952574"
- },
- {
- "archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_2_5-src.tar.gz",
- "date": "1008502483"
- },
- {
- "archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_2_6-src.tar.gz",
- "date": "1012641285"
- },
- {
- "archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-3.6.2.tar.gz",
- "date": "869814000"
- },
- {
- "archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.0.0.tar.gz",
- "date": "898422900"
- },
- {
- "archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.0.2.tar.gz",
- "date": "920018202"
- },
- {
- "archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.0.3.tar.gz",
- "date": "936750512"
- },
- {
- "archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.0.4.tar.gz",
- "date": "944290148"
- },
- {
- "archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.0.5.tar.gz",
- "date": "944599461"
- },
- {
- "archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.0.6.tar.gz",
- "date": "952156235"
- },
- {
- "archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.0.7.tar.gz",
- "date": "952313085"
- },
- {
- "archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.1.0.tar.gz",
- "date": "969299287"
- },
- {
- "archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.2.0beta.tar.gz",
- "date": "977027108"
- },
- {
- "archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.2.1.tar.gz",
- "date": "981323501"
- },
- {
- "archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.2.2.tar.gz",
- "date": "981562809"
- },
- {
- "archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.2.3.tar.gz",
- "date": "982657006"
- },
- {
- "archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.2.4.tar.gz",
- "date": "1007952745"
- },
- {
- "archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.2.5.tar.gz",
- "date": "1008466945"
- },
- {
- "archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.2.6.tar.gz",
- "date": "1012641715"
- },
- {
- "archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.2.7.tar.gz",
- "date": "1070057764"
- }
- ]
\ No newline at end of file
diff --git a/swh/lister/gnu/tests/test_lister.py b/swh/lister/gnu/tests/test_lister.py
--- a/swh/lister/gnu/tests/test_lister.py
+++ b/swh/lister/gnu/tests/test_lister.py
@@ -1,47 +1,36 @@
-# Copyright (C) 2019-2020 The Software Heritage developers
+# Copyright (C) 2019-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-import logging
+from ..lister import GNULister
-logger = logging.getLogger(__name__)
+def test_gnu_lister(swh_scheduler, requests_mock_datadir):
+ lister = GNULister(scheduler=swh_scheduler)
-def test_gnu_lister(lister_gnu, requests_mock_datadir):
- lister_gnu.run()
+ stats = lister.run()
- r = lister_gnu.scheduler.search_tasks(task_type="load-archive-files")
- assert len(r) == 383
+ assert stats.pages == 1
+ assert stats.origins == 383
- for row in r:
- assert row["type"] == "load-archive-files"
- # arguments check
- args = row["arguments"]["args"]
- assert len(args) == 0
+ scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
- # kwargs
- kwargs = row["arguments"]["kwargs"]
- assert set(kwargs.keys()) == {"url", "artifacts"}
+ assert len(scheduler_origins) == stats.origins
- url = kwargs["url"]
- assert url.startswith("https://ftp.gnu.org")
+ for origin in scheduler_origins:
+ assert origin.url.startswith(GNULister.GNU_FTP_URL)
+ assert origin.last_update is not None
+ assert "artifacts" in origin.extra_loader_arguments
+ assert len(origin.extra_loader_arguments["artifacts"]) > 0
- url_suffix = url.split("https://ftp.gnu.org")[1]
- assert "gnu" in url_suffix or "old-gnu" in url_suffix
- artifacts = kwargs["artifacts"]
- # check the artifact's structure
- artifact = artifacts[0]
- assert set(artifact.keys()) == {"url", "length", "time", "filename", "version"}
-
- for artifact in artifacts:
- logger.debug(artifact)
- # 'time' is an isoformat string now
- for key in ["url", "time", "filename", "version"]:
- assert isinstance(artifact[key], str)
- assert isinstance(artifact["length"], int)
-
- assert row["policy"] == "oneshot"
- assert row["priority"] is None
- assert row["retries_left"] == 3
+def test_gnu_lister_from_configfile(swh_scheduler_config, mocker):
+ load_from_envvar = mocker.patch("swh.lister.pattern.load_from_envvar")
+ load_from_envvar.return_value = {
+ "scheduler": {"cls": "local", **swh_scheduler_config},
+ "credentials": {},
+ }
+ lister = GNULister.from_configfile()
+ assert lister.scheduler is not None
+ assert lister.credentials is not None
diff --git a/swh/lister/gnu/tests/test_tasks.py b/swh/lister/gnu/tests/test_tasks.py
--- a/swh/lister/gnu/tests/test_tasks.py
+++ b/swh/lister/gnu/tests/test_tasks.py
@@ -1,9 +1,9 @@
-# Copyright (C) 2019-2020 The Software Heritage developers
+# Copyright (C) 2019-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-from unittest.mock import patch
+from swh.lister.pattern import ListerStats
def test_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker):
@@ -14,17 +14,17 @@
assert res.result == "OK"
-@patch("swh.lister.gnu.tasks.GNULister")
-def test_lister(lister, swh_scheduler_celery_app, swh_scheduler_celery_worker):
- # setup the mocked GNULister
- lister.return_value = lister
- lister.run.return_value = None
+def test_lister(swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker):
+ lister = mocker.patch("swh.lister.gnu.tasks.GNULister")
+ lister.from_configfile.return_value = lister
+ stats = ListerStats(pages=1, origins=300)
+ lister.run.return_value = stats
res = swh_scheduler_celery_app.send_task("swh.lister.gnu.tasks.GNUListerTask")
assert res
res.wait()
assert res.successful()
+ assert res.result == stats.dict()
- lister.assert_called_once_with()
- lister.db_last_index.assert_not_called()
+ lister.from_configfile.assert_called_once_with()
lister.run.assert_called_once_with()

File Metadata

Mime Type
text/plain
Expires
Tue, Dec 17, 4:27 AM (2 w, 1 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3221100

Event Timeline