Changeset View
Changeset View
Standalone View
Standalone View
swh/lister/gnu/lister.py
# Copyright (C) 2019-2021 The Software Heritage developers | # Copyright (C) 2019-2021 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import logging | import logging | ||||
from typing import Any, Iterator, Mapping | from typing import Any, Iterator, Mapping, Optional | ||||
import iso8601 | import iso8601 | ||||
from swh.scheduler.interface import SchedulerInterface | from swh.scheduler.interface import SchedulerInterface | ||||
from swh.scheduler.model import ListedOrigin | from swh.scheduler.model import ListedOrigin | ||||
from ..pattern import CredentialsType, StatelessLister | from ..pattern import CredentialsType, StatelessLister | ||||
from .tree import GNUTree | from .tree import GNUTree | ||||
Show All 15 Lines | def __init__( | ||||
self, scheduler: SchedulerInterface, credentials: CredentialsType = None, | self, scheduler: SchedulerInterface, credentials: CredentialsType = None, | ||||
): | ): | ||||
super().__init__( | super().__init__( | ||||
scheduler=scheduler, | scheduler=scheduler, | ||||
url=self.GNU_FTP_URL, | url=self.GNU_FTP_URL, | ||||
instance="GNU", | instance="GNU", | ||||
credentials=credentials, | credentials=credentials, | ||||
) | ) | ||||
self.gnu_tree = GNUTree(f"{self.url}/tree.json.gz") | # No side-effect calls in the constructor; if extra state is needed, as recommended | ||||
# by the pattern docstring, it must be set up in the get_pages method. | |||||
self.gnu_tree: Optional[GNUTree] = None | |||||
def get_pages(self) -> Iterator[GNUPageType]: | def get_pages(self) -> Iterator[GNUPageType]: | ||||
""" | """ | ||||
Yield a single page listing all GNU projects. | Yield a single page listing all GNU projects. | ||||
""" | """ | ||||
# first fetch the manifest to parse | |||||
self.gnu_tree = GNUTree(f"{self.url}/tree.json.gz") | |||||
yield self.gnu_tree.projects | yield self.gnu_tree.projects | ||||
def get_origins_from_page(self, page: GNUPageType) -> Iterator[ListedOrigin]: | def get_origins_from_page(self, page: GNUPageType) -> Iterator[ListedOrigin]: | ||||
""" | """ | ||||
Iterate on all GNU projects and yield ListedOrigin instances. | Iterate on all GNU projects and yield ListedOrigin instances. | ||||
""" | """ | ||||
assert self.lister_obj.id is not None | assert self.lister_obj.id is not None | ||||
assert self.gnu_tree is not None | |||||
artifacts = self.gnu_tree.artifacts | artifacts = self.gnu_tree.artifacts | ||||
for project_name, project_info in page.items(): | for project_name, project_info in page.items(): | ||||
origin_url = project_info["url"] | origin_url = project_info["url"] | ||||
last_update = iso8601.parse_date(project_info["time_modified"]) | last_update = iso8601.parse_date(project_info["time_modified"]) | ||||
Show All 9 Lines |