Changeset View
Changeset View
Standalone View
Standalone View
swh/lister/github/lister.py
# Copyright (C) 2020 The Software Heritage developers | # Copyright (C) 2020 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
from dataclasses import asdict, dataclass | from dataclasses import asdict, dataclass | ||||
import datetime | import datetime | ||||
import logging | import logging | ||||
import random | import random | ||||
import time | import time | ||||
from typing import Any, Dict, Iterator, List, Optional | from typing import Any, Dict, Iterator, List, Optional, Set | ||||
from urllib.parse import parse_qs, urlparse | from urllib.parse import parse_qs, urlparse | ||||
import iso8601 | import iso8601 | ||||
import requests | import requests | ||||
from tenacity import ( | from tenacity import ( | ||||
retry, | retry, | ||||
retry_any, | retry_any, | ||||
retry_if_exception_type, | retry_if_exception_type, | ||||
▲ Show 20 Lines • Show All 280 Lines • ▼ Show 20 Lines | def get_origins_from_page( | ||||
self, page: List[Dict[str, Any]] | self, page: List[Dict[str, Any]] | ||||
) -> Iterator[ListedOrigin]: | ) -> Iterator[ListedOrigin]: | ||||
"""Convert a page of GitHub repositories into a list of ListedOrigins. | """Convert a page of GitHub repositories into a list of ListedOrigins. | ||||
This records the html_url, as well as the pushed_at value if it exists. | This records the html_url, as well as the pushed_at value if it exists. | ||||
""" | """ | ||||
assert self.lister_obj.id is not None | assert self.lister_obj.id is not None | ||||
seen_in_page: Set[str] = set() | |||||
for repo in page: | for repo in page: | ||||
if not repo: | if not repo: | ||||
# null repositories in listings happen sometimes... | # null repositories in listings happen sometimes... | ||||
continue | continue | ||||
if repo["html_url"] in seen_in_page: | |||||
continue | |||||
seen_in_page.add(repo["html_url"]) | |||||
pushed_at_str = repo.get("pushed_at") | pushed_at_str = repo.get("pushed_at") | ||||
pushed_at: Optional[datetime.datetime] = None | pushed_at: Optional[datetime.datetime] = None | ||||
if pushed_at_str: | if pushed_at_str: | ||||
pushed_at = iso8601.parse_date(pushed_at_str) | pushed_at = iso8601.parse_date(pushed_at_str) | ||||
yield ListedOrigin( | yield ListedOrigin( | ||||
lister_id=self.lister_obj.id, | lister_id=self.lister_obj.id, | ||||
url=repo["html_url"], | url=repo["html_url"], | ||||
Show All 31 Lines |