Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7122823
D4969.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
20 KB
Subscribers
None
D4969.diff
View Options
diff --git a/swh/lister/gnu/__init__.py b/swh/lister/gnu/__init__.py
--- a/swh/lister/gnu/__init__.py
+++ b/swh/lister/gnu/__init__.py
@@ -1,14 +1,12 @@
-# Copyright (C) 2019 the Software Heritage developers
+# Copyright (C) 2019-2021 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
def register():
from .lister import GNULister
- from .models import GNUModel
return {
- "models": [GNUModel],
"lister": GNULister,
"task_modules": ["%s.tasks" % __name__],
}
diff --git a/swh/lister/gnu/lister.py b/swh/lister/gnu/lister.py
--- a/swh/lister/gnu/lister.py
+++ b/swh/lister/gnu/lister.py
@@ -1,112 +1,68 @@
-# Copyright (C) 2019 the Software Heritage developers
+# Copyright (C) 2019-2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import logging
-from typing import Any, Dict, List
+from typing import Any, Iterator, Mapping
-from requests import Response
+import iso8601
-from swh.lister.core.simple_lister import SimpleLister
-from swh.lister.gnu.models import GNUModel
-from swh.lister.gnu.tree import GNUTree
-from swh.scheduler import utils
+from swh.scheduler.interface import SchedulerInterface
+from swh.scheduler.model import ListedOrigin
-logger = logging.getLogger(__name__)
-
-
-class GNULister(SimpleLister):
- MODEL = GNUModel
- LISTER_NAME = "gnu"
- instance = "gnu"
+from ..pattern import CredentialsType, StatelessLister
+from .tree import GNUTree
- def __init__(self, *args, **kwargs):
- super().__init__(*args, **kwargs)
- self.gnu_tree = GNUTree("https://ftp.gnu.org/tree.json.gz")
+logger = logging.getLogger(__name__)
- def task_dict(self, origin_type, origin_url, **kwargs):
- """Return task format dict
+GNUPageType = Mapping[str, Any]
- This is overridden from the lister_base as more information is
- needed for the ingestion task creation.
- This creates tasks with args and kwargs set, for example:
+class GNULister(StatelessLister[GNUPageType]):
+ """
+ List all GNU projects and associated artifacts.
+ """
- .. code-block:: python
+ LISTER_NAME = "GNU"
+ GNU_FTP_URL = "https://ftp.gnu.org"
- args:
- kwargs: {
- 'url': 'https://ftp.gnu.org/gnu/3dldf/',
- 'artifacts': [{
- 'url': 'https://...',
- 'time': '2003-12-09T21:43:20+00:00',
- 'length': 128,
- 'version': '1.0.1',
- 'filename': 'something-1.0.1.tar.gz',
- },
- ...
- ]
- }
+ def __init__(
+ self, scheduler: SchedulerInterface, credentials: CredentialsType = None,
+ ):
+ super().__init__(
+ scheduler=scheduler,
+ url=self.GNU_FTP_URL,
+ instance="GNU",
+ credentials=credentials,
+ )
+ self.gnu_tree = GNUTree(f"{self.url}/tree.json.gz")
+ def get_pages(self) -> Iterator[GNUPageType]:
"""
- artifacts = self.gnu_tree.artifacts[origin_url]
- assert origin_type == "tar"
- return utils.create_task_dict(
- "load-archive-files",
- kwargs.get("policy", "oneshot"),
- url=origin_url,
- artifacts=artifacts,
- retries_left=3,
- )
+ Yield a single page listing all GNU projects.
+ """
+ yield self.gnu_tree.projects
- def safely_issue_request(self, identifier: int) -> None:
- """Bypass the implementation. It's now the GNUTree which deals with
- querying the gnu mirror.
+ def get_origins_from_page(self, page: GNUPageType) -> Iterator[ListedOrigin]:
+ """
+ Iterate on all GNU projects and yield ListedOrigin instances.
+ """
+ assert self.lister_obj.id is not None
- As an implementation detail, we cannot change simply the base
- SimpleLister as other implementation still uses it. This shall be part
- of another refactoring pass.
+ artifacts = self.gnu_tree.artifacts
- """
- return None
-
- def list_packages(self, response: Response) -> List[Dict[str, Any]]:
- """List the actual gnu origins (package name) with their name, url and
- associated tarballs.
-
- Args:
- response: Unused
-
- Returns:
- List of packages name, url, last modification time::
-
- [
- {
- 'name': '3dldf',
- 'url': 'https://ftp.gnu.org/gnu/3dldf/',
- 'time_modified': '2003-12-09T20:43:20+00:00'
- },
- {
- 'name': '8sync',
- 'url': 'https://ftp.gnu.org/gnu/8sync/',
- 'time_modified': '2016-12-06T02:37:10+00:00'
- },
- ...
- ]
+ for project_name, project_info in page.items():
- """
- return list(self.gnu_tree.projects.values())
+ origin_url = project_info["url"]
+ last_update = iso8601.parse_date(project_info["time_modified"])
- def get_model_from_repo(self, repo: Dict[str, Any]) -> Dict[str, Any]:
- """Transform from repository representation to model
+ logger.debug("Found origin %s last updated on %s", origin_url, last_update)
- """
- return {
- "uid": repo["url"],
- "name": repo["name"],
- "full_name": repo["name"],
- "html_url": repo["url"],
- "origin_url": repo["url"],
- "time_last_updated": repo["time_modified"],
- "origin_type": "tar",
- }
+ yield ListedOrigin(
+ lister_id=self.lister_obj.id,
+ url=origin_url,
+ visit_type="tar",
+ last_update=last_update,
+ extra_loader_arguments={"artifacts": artifacts[project_name]},
+ )
diff --git a/swh/lister/gnu/models.py b/swh/lister/gnu/models.py
deleted file mode 100644
--- a/swh/lister/gnu/models.py
+++ /dev/null
@@ -1,18 +0,0 @@
-# Copyright (C) 2019 the Software Heritage developers
-# License: GNU General Public License version 3, or any later version
-# See top-level LICENSE file for more information
-
-from sqlalchemy import Column, DateTime, String
-
-from ..core.models import ModelBase
-
-
-class GNUModel(ModelBase):
- """a GNU repository representation
-
- """
-
- __tablename__ = "gnu_repo"
-
- uid = Column(String, primary_key=True)
- time_last_updated = Column(DateTime)
diff --git a/swh/lister/gnu/tasks.py b/swh/lister/gnu/tasks.py
--- a/swh/lister/gnu/tasks.py
+++ b/swh/lister/gnu/tasks.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2019 the Software Heritage developers
+# Copyright (C) 2019-2021 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -10,7 +10,7 @@
@shared_task(name=__name__ + ".GNUListerTask")
def list_gnu_full(**lister_args):
"""List lister for the GNU source code archive"""
- return GNULister(**lister_args).run()
+ return GNULister.from_configfile(**lister_args).run().dict()
@shared_task(name=__name__ + ".ping")
diff --git a/swh/lister/gnu/tests/api_response.json b/swh/lister/gnu/tests/api_response.json
deleted file mode 100644
--- a/swh/lister/gnu/tests/api_response.json
+++ /dev/null
@@ -1,37 +0,0 @@
-[{"type":"directory","name": ".","contents":[
- {"type":"file","name":".footer.shtml","size":444,"time":"1359994299"},
- {"type":"file","name":"find.txt.gz","size":261428,"time":"1557684608"},
- {"type":"directory","name":"gnu","size":12288,"time":"1556742017","contents":[]},
- {"type":"directory","name":"gnu+linux-distros","size":4096,"time":"1299783002","contents":[
- {"type":"directory","name":"ututo-e","size":4096,"time":"1487780066","contents":[
- {"type":"file","name":"README","size":48,"time":"1487780066"},
- {"type":"file","name":"index.html","size":158,"time":"1487780054"}
- ]}
- ]},
- {"type":"file","name":"ls-lrRt.txt.gz","size":480054,"time":"1557684607"},
- {"type":"directory","name":"mirrors","size":4096,"time":"1114010630","contents":[
- {"type":"directory","name":"dynebolic","size":4096,"time":"1317827602","contents":[
- {"type":"file","name":"MOVED_TO_mirror.fsf.org_dynebolic","size":0,"time":"1317826935"},
- {"type":"file","name":"index.html","size":107,"time":"1317827601"}
- ]}
- ]},
- {"type":"link","name":"non-gnu","target":"gnu/non-gnu","size":11,"time":"1082055542","contents":[]},
- {"type":"directory","name":"old-gnu","size":4096,"time":"1548360019","contents":[]},
- {"type":"link","name":"pub","target":".","size":1,"time":"1060090003","contents":[]},
- {"type":"directory","name":"savannah","size":4096,"time":"1194544006","contents":[
- {"type":"file","name":"README","size":473,"time":"1143758028"}
- ]},
- {"type":"directory","name":"third-party","size":4096,"time":"1059825710","contents":[
- {"type":"file","name":"README","size":374,"time":"983824071"}
- ]},
- {"type":"directory","name":"tmp","size":4096,"time":"1239072509","contents":[
- ]},
- {"type":"file","name":"tree.json.gz","size":0,"time":"1557684608"},
- {"type":"directory","name":"video","size":4096,"time":"1367963189","contents":[
- {"type":"file","name":".bash_history","size":27,"time":"1307027604"},
- {"type":"file","name":"stallmanupv.ogg.sig","size":536,"time":"1299776853"}
- ]},
- {"type":"file","name":"welcome.msg","size":2830,"time":"1545163301"}
-]},
-{"type":"report","directories":2743,"files":63983}
-]
diff --git a/swh/lister/gnu/tests/conftest.py b/swh/lister/gnu/tests/conftest.py
deleted file mode 100644
--- a/swh/lister/gnu/tests/conftest.py
+++ /dev/null
@@ -1,26 +0,0 @@
-# Copyright (C) 2019-2020 The Software Heritage developers
-# See the AUTHORS file at the top-level directory of this distribution
-# License: GNU General Public License version 3, or any later version
-# See top-level LICENSE file for more information
-
-import pytest
-
-
-@pytest.fixture
-def lister_under_test():
- return "gnu"
-
-
-@pytest.fixture
-def lister_gnu(swh_lister):
- for task_type in [
- {
- "type": "load-archive-files",
- "description": "Load archive repository",
- "backend_name": "swh.loader.packages.tasks.LoadArchive",
- "default_interval": "1 day",
- },
- ]:
- swh_lister.scheduler.create_task_type(task_type)
-
- return swh_lister
diff --git a/swh/lister/gnu/tests/find_tarballs_output.json b/swh/lister/gnu/tests/find_tarballs_output.json
deleted file mode 100644
--- a/swh/lister/gnu/tests/find_tarballs_output.json
+++ /dev/null
@@ -1,182 +0,0 @@
-[
- {
- "archive": "https://ftp.gnu.org/gnu/artanis/artanis-0.2.1.tar.bz2",
- "date": "1495205979"
- },
- {
- "archive": "https://ftp.gnu.org/gnu/artanis/artanis-0.2.1.tar.gz",
- "date": "1495205967"
- },
- {
- "archive": "https://ftp.gnu.org/gnu/artanis/artanis-0.2.12-f39e-dirty.tar.gz",
- "date": "1494994222"
- },
- {
- "archive": "https://ftp.gnu.org/gnu/artanis/artanis-0.2.3.tar.bz2",
- "date": "1520284021"
- },
- {
- "archive": "https://ftp.gnu.org/gnu/artanis/artanis-0.2.3.tar.gz",
- "date": "1520284007"
- },
- {
- "archive": "https://ftp.gnu.org/gnu/artanis/artanis-0.2.4.tar.bz2",
- "date": "1521742071"
- },
- {
- "archive": "https://ftp.gnu.org/gnu/artanis/artanis-0.2.4.tar.gz",
- "date": "1521742057"
- },
- {
- "archive": "https://ftp.gnu.org/gnu/artanis/artanis-0.2.5.tar.bz2",
- "date": "1525717261"
- },
- {
- "archive": "https://ftp.gnu.org/gnu/artanis/artanis-0.2.5.tar.gz",
- "date": "1525717246"
- },
- {
- "archive": "https://ftp.gnu.org/gnu/artanis/artanis-0.3.1.tar.bz2",
- "date": "1546205569"
- },
- {
- "archive": "https://ftp.gnu.org/gnu/artanis/artanis-0.3.1.tar.gz",
- "date": "1546205555"
- },
- {
- "archive": "https://ftp.gnu.org/gnu/artanis/artanis-0.3.tar.bz2",
- "date": "1546205025"
- },
- {
- "archive": "https://ftp.gnu.org/gnu/artanis/artanis-0.3.tar.gz",
- "date": "1546205012"
- },
- {
- "archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_0_0-src.zip",
- "date": "898422900"
- },
- {
- "archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_0_2-src.zip",
- "date": "920018269"
- },
- {
- "archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_0_3-src.zip",
- "date": "936750503"
- },
- {
- "archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_0_4-src.tar.gz",
- "date": "944290190"
- },
- {
- "archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_0_5-src.tar.gz",
- "date": "944600462"
- },
- {
- "archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_0_6-src.tar.gz",
- "date": "952156231"
- },
- {
- "archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_0_7-src.tar.gz",
- "date": "952313061"
- },
- {
- "archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_1_0-src.tar.gz",
- "date": "969299378"
- },
- {
- "archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_2_0beta-src.tar.gz",
- "date": "977027031"
- },
- {
- "archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_2_1-src.tar.gz",
- "date": "981323331"
- },
- {
- "archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_2_2-src.tar.gz",
- "date": "981570576"
- },
- {
- "archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_2_3-src.tar.gz",
- "date": "982656672"
- },
- {
- "archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_2_4-src.tar.gz",
- "date": "1007952574"
- },
- {
- "archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_2_5-src.tar.gz",
- "date": "1008502483"
- },
- {
- "archive": "https://ftp.gnu.org/old-gnu/xboard/winboard/winboard-4_2_6-src.tar.gz",
- "date": "1012641285"
- },
- {
- "archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-3.6.2.tar.gz",
- "date": "869814000"
- },
- {
- "archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.0.0.tar.gz",
- "date": "898422900"
- },
- {
- "archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.0.2.tar.gz",
- "date": "920018202"
- },
- {
- "archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.0.3.tar.gz",
- "date": "936750512"
- },
- {
- "archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.0.4.tar.gz",
- "date": "944290148"
- },
- {
- "archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.0.5.tar.gz",
- "date": "944599461"
- },
- {
- "archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.0.6.tar.gz",
- "date": "952156235"
- },
- {
- "archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.0.7.tar.gz",
- "date": "952313085"
- },
- {
- "archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.1.0.tar.gz",
- "date": "969299287"
- },
- {
- "archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.2.0beta.tar.gz",
- "date": "977027108"
- },
- {
- "archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.2.1.tar.gz",
- "date": "981323501"
- },
- {
- "archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.2.2.tar.gz",
- "date": "981562809"
- },
- {
- "archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.2.3.tar.gz",
- "date": "982657006"
- },
- {
- "archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.2.4.tar.gz",
- "date": "1007952745"
- },
- {
- "archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.2.5.tar.gz",
- "date": "1008466945"
- },
- {
- "archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.2.6.tar.gz",
- "date": "1012641715"
- },
- {
- "archive": "https://ftp.gnu.org/old-gnu/xboard/xboard-4.2.7.tar.gz",
- "date": "1070057764"
- }
- ]
\ No newline at end of file
diff --git a/swh/lister/gnu/tests/test_lister.py b/swh/lister/gnu/tests/test_lister.py
--- a/swh/lister/gnu/tests/test_lister.py
+++ b/swh/lister/gnu/tests/test_lister.py
@@ -1,47 +1,36 @@
-# Copyright (C) 2019-2020 The Software Heritage developers
+# Copyright (C) 2019-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-import logging
+from ..lister import GNULister
-logger = logging.getLogger(__name__)
+def test_gnu_lister(swh_scheduler, requests_mock_datadir):
+ lister = GNULister(scheduler=swh_scheduler)
-def test_gnu_lister(lister_gnu, requests_mock_datadir):
- lister_gnu.run()
+ stats = lister.run()
- r = lister_gnu.scheduler.search_tasks(task_type="load-archive-files")
- assert len(r) == 383
+ assert stats.pages == 1
+ assert stats.origins == 383
- for row in r:
- assert row["type"] == "load-archive-files"
- # arguments check
- args = row["arguments"]["args"]
- assert len(args) == 0
+ scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
- # kwargs
- kwargs = row["arguments"]["kwargs"]
- assert set(kwargs.keys()) == {"url", "artifacts"}
+ assert len(scheduler_origins) == stats.origins
- url = kwargs["url"]
- assert url.startswith("https://ftp.gnu.org")
+ for origin in scheduler_origins:
+ assert origin.url.startswith(GNULister.GNU_FTP_URL)
+ assert origin.last_update is not None
+ assert "artifacts" in origin.extra_loader_arguments
+ assert len(origin.extra_loader_arguments["artifacts"]) > 0
- url_suffix = url.split("https://ftp.gnu.org")[1]
- assert "gnu" in url_suffix or "old-gnu" in url_suffix
- artifacts = kwargs["artifacts"]
- # check the artifact's structure
- artifact = artifacts[0]
- assert set(artifact.keys()) == {"url", "length", "time", "filename", "version"}
-
- for artifact in artifacts:
- logger.debug(artifact)
- # 'time' is an isoformat string now
- for key in ["url", "time", "filename", "version"]:
- assert isinstance(artifact[key], str)
- assert isinstance(artifact["length"], int)
-
- assert row["policy"] == "oneshot"
- assert row["priority"] is None
- assert row["retries_left"] == 3
+def test_gnu_lister_from_configfile(swh_scheduler_config, mocker):
+ load_from_envvar = mocker.patch("swh.lister.pattern.load_from_envvar")
+ load_from_envvar.return_value = {
+ "scheduler": {"cls": "local", **swh_scheduler_config},
+ "credentials": {},
+ }
+ lister = GNULister.from_configfile()
+ assert lister.scheduler is not None
+ assert lister.credentials is not None
diff --git a/swh/lister/gnu/tests/test_tasks.py b/swh/lister/gnu/tests/test_tasks.py
--- a/swh/lister/gnu/tests/test_tasks.py
+++ b/swh/lister/gnu/tests/test_tasks.py
@@ -1,9 +1,9 @@
-# Copyright (C) 2019-2020 The Software Heritage developers
+# Copyright (C) 2019-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-from unittest.mock import patch
+from swh.lister.pattern import ListerStats
def test_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker):
@@ -14,17 +14,17 @@
assert res.result == "OK"
-@patch("swh.lister.gnu.tasks.GNULister")
-def test_lister(lister, swh_scheduler_celery_app, swh_scheduler_celery_worker):
- # setup the mocked GNULister
- lister.return_value = lister
- lister.run.return_value = None
+def test_lister(swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker):
+ lister = mocker.patch("swh.lister.gnu.tasks.GNULister")
+ lister.from_configfile.return_value = lister
+ stats = ListerStats(pages=1, origins=300)
+ lister.run.return_value = stats
res = swh_scheduler_celery_app.send_task("swh.lister.gnu.tasks.GNUListerTask")
assert res
res.wait()
assert res.successful()
+ assert res.result == stats.dict()
- lister.assert_called_once_with()
- lister.db_last_index.assert_not_called()
+ lister.from_configfile.assert_called_once_with()
lister.run.assert_called_once_with()
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Tue, Dec 17, 4:27 AM (2 w, 1 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3221100
Attached To
D4969: gnu: Reimplement lister using new Lister API
Event Timeline
Log In to Comment