def register():
    """Return the plugin description of the Golang lister.

    The result is a mapping with two keys: ``lister``, the lister class, and
    ``task_modules``, the dotted names of the modules holding its tasks
    (here the ``tasks`` submodule of this package).
    """
    # The lister class is imported inside the function rather than at module
    # import time.
    from .lister import GolangLister

    task_modules = [f"{__name__}.tasks"]
    return {"lister": GolangLister, "task_modules": task_modules}
class GolangLister(StatelessLister[GolangPageType]):
    """
    List all Golang modules and send associated origins to scheduler.

    The lister queries the Golang module index, whose documentation can be found
    at https://index.golang.org

    The index is walked page by page: every request asks for at most
    ``GOLANG_MODULES_INDEX_LIMIT`` entries, and the timestamp of the last entry
    of a page is sent as the ``since`` argument of the next request.
    """

    GOLANG_MODULES_INDEX_URL = "https://index.golang.org/index"
    # `limit` seems to be... limited to 2000.
    GOLANG_MODULES_INDEX_LIMIT = 2000
    LISTER_NAME = "Golang"

    def __init__(
        self, scheduler: SchedulerInterface, credentials: CredentialsType = None,
    ):
        super().__init__(
            scheduler=scheduler,
            url=self.GOLANG_MODULES_INDEX_URL,
            instance="Golang",
            credentials=credentials,
        )

        # One HTTP session is reused for every request made to the index
        self.session = requests.Session()
        self.session.headers.update(
            {"Accept": "application/json", "User-Agent": USER_AGENT}
        )

    @throttling_retry(
        retry=retry_policy_generic,
        before_sleep=before_sleep_log(logger, logging.WARNING),
    )
    def api_request(self, url: str) -> List[str]:
        """Fetch ``url`` and return the non-blank lines of the response body.

        The index answers with one JSON document per line, so each returned
        string is meant to be decoded separately by the caller.

        Raises:
            requests.HTTPError: when the response has an error status; the
                ``throttling_retry`` decorator decides whether the call is
                attempted again, following ``retry_policy_generic``.
        """
        logger.debug("Fetching URL %s", url)

        response = self.session.get(url)

        if response.status_code not in (200, 304):
            # Log the unexpected status and the final URL to ease debugging
            logger.warning(
                "Unexpected HTTP status code %s for URL %s",
                response.status_code,
                response.url,
            )

        response.raise_for_status()

        # Split on line boundaries only: splitting on any whitespace would cut
        # a JSON document in two if one of its values ever contained a space.
        return [line for line in response.text.splitlines() if line.strip()]

    def get_single_page(
        self, since: Optional[datetime] = None
    ) -> Tuple[GolangPageType, Optional[datetime]]:
        """Return one page of the index and the timestamp of its last entry.

        Args:
            since: oldest timestamp to ask the index for; ``None`` starts from
                the beginning of the index.

        Returns:
            A tuple ``(page, since)``. ``page`` is the list of decoded entries,
            whose ``"Timestamp"`` value has been replaced by the parsed
            datetime. ``since`` is the timestamp of the last entry, or the
            value received as argument when the page is empty.
        """
        url = f"{self.url}?limit={self.GOLANG_MODULES_INDEX_LIMIT}"
        if since is not None:
            # The Golang index does not understand `+00:00` for some reason
            # and expects the "timezone zero" notation instead. This works
            # because all times are UTC.
            as_date = since.isoformat().replace("+00:00", "Z")
            url = f"{url}&since={as_date}"

        entries = self.api_request(url)
        page: GolangPageType = []
        if not entries:
            return page, since

        for as_json in entries:
            entry = json.loads(as_json)
            timestamp = parser.isoparse(entry["Timestamp"])
            # We've already parsed it and we'll need the datetime later, save it
            entry["Timestamp"] = timestamp
            page.append(entry)
            # The index is guaranteed to be sorted in chronological order
            since = timestamp

        return page, since

    def get_pages(self) -> Iterator[GolangPageType]:
        """Yield the pages of the index until it is exhausted.

        Iteration stops on the first empty page, or as soon as a request does
        not move the ``since`` cursor forward.
        """
        page, since = self.get_single_page()

        while page:
            yield page
            previous = since
            page, since = self.get_single_page(since=since)
            if page and since == previous:
                # The index returns the entries whose timestamp is greater
                # than *or equal to* `since`, so once it is exhausted it keeps
                # answering with the last entry and the page is never empty.
                # Emit this page one last time (it may hold entries sharing
                # that timestamp which were cut off by `limit`), then stop
                # instead of looping forever.
                yield page
                return

    def get_origins_from_page(self, page: GolangPageType) -> Iterator[ListedOrigin]:
        """
        Iterate on all Golang projects and yield ListedOrigin instances.

        One origin is produced per entry of the page, i.e. per module version;
        ``last_update`` is the datetime stored in the entry by
        ``get_single_page``.
        """
        assert self.lister_obj.id is not None

        for module in page:
            path = module["Path"]
            version = module["Version"]
            # See https://proxy.golang.org for documentation on the proxy protocol
            origin_url = f"{path}/@v/{version}.zip"
            yield ListedOrigin(
                lister_id=self.lister_obj.id,
                url=origin_url,
                visit_type="golang",
                last_update=module["Timestamp"],
            )
100644 diff --git a/swh/lister/golang/tests/data/page-1.txt b/swh/lister/golang/tests/data/page-1.txt new file mode 100644 --- /dev/null +++ b/swh/lister/golang/tests/data/page-1.txt @@ -0,0 +1,5 @@ +{"Path":"golang.org/x/text","Version":"v0.3.0","Timestamp":"2019-04-10T19:08:52.997264Z"} +{"Path":"github.com/oklog/ulid","Version":"v1.3.1","Timestamp":"2019-04-11T18:47:23.234198Z"} +{"Path":"collectd.org","Version":"v0.3.0","Timestamp":"2019-04-11T18:47:25.450546Z"} +{"Path":"github.com/nats-io/nuid","Version":"v1.0.1","Timestamp":"2019-04-11T18:47:28.102348Z"} +{"Path":"github.com/bmizerany/pat","Version":"v0.0.0-20170815010413-6226ea591a40","Timestamp":"2019-04-11T18:47:29.390564Z"} \ No newline at end of file diff --git a/swh/lister/golang/tests/data/page-2.txt b/swh/lister/golang/tests/data/page-2.txt new file mode 100644 --- /dev/null +++ b/swh/lister/golang/tests/data/page-2.txt @@ -0,0 +1,4 @@ +{"Path":"github.com/djherbis/buffer","Version":"v1.0.0","Timestamp":"2019-04-11T18:47:29.974874Z"} +{"Path":"github.com/djherbis/nio","Version":"v2.0.3+incompatible","Timestamp":"2019-04-11T18:47:32.283312Z"} +{"Path":"github.com/gobuffalo/buffalo-plugins","Version":"v1.13.0","Timestamp":"2019-04-15T13:54:34.222985Z"} +{"Path":"github.com/markbates/refresh","Version":"v1.7.1","Timestamp":"2019-04-15T13:54:35.250835Z"} \ No newline at end of file diff --git a/swh/lister/golang/tests/data/page-3.txt b/swh/lister/golang/tests/data/page-3.txt new file mode 100644 --- /dev/null +++ b/swh/lister/golang/tests/data/page-3.txt @@ -0,0 +1,10 @@ +{"Path":"github.com/mitchellh/go-homedir","Version":"v1.1.0","Timestamp":"2019-04-15T13:54:35.678214Z"} +{"Path":"github.com/gobuffalo/packr","Version":"v1.22.0","Timestamp":"2019-04-15T13:54:35.6889Z"} +{"Path":"golang.org/x/sys","Version":"v0.0.0-20190220154126-629670e5acc5","Timestamp":"2019-04-15T13:54:37.555525Z"} 
# (origin URL, ISO 8601 timestamp) pairs expected from the three fixture pages,
# sorted by URL like the origins read back from the scheduler in the test.
expected_listed = [
    ("collectd.org/@v/v0.3.0.zip", "2019-04-11T18:47:25.450546+00:00"),
    (
        "github.com/blang/semver/@v/v3.5.1+incompatible.zip",
        "2019-04-15T13:54:39.107258+00:00",
    ),
    (
        "github.com/bmizerany/pat/@v/v0.0.0-20170815010413-6226ea591a40.zip",
        "2019-04-11T18:47:29.390564+00:00",
    ),
    ("github.com/djherbis/buffer/@v/v1.0.0.zip", "2019-04-11T18:47:29.974874+00:00"),
    (
        "github.com/djherbis/nio/@v/v2.0.3+incompatible.zip",
        "2019-04-11T18:47:32.283312+00:00",
    ),
    (
        "github.com/gobuffalo/buffalo-plugins/@v/v1.13.0.zip",
        "2019-04-15T13:54:34.222985+00:00",
    ),
    (
        "github.com/gobuffalo/buffalo-pop/@v/v1.3.0.zip",
        "2019-04-15T13:54:39.135792+00:00",
    ),
    ("github.com/gobuffalo/clara/@v/v0.4.1.zip", "2019-04-15T13:54:40.651916+00:00"),
    (
        "github.com/gobuffalo/genny/@v/v0.0.0-20190104222617-a71664fc38e7.zip",
        "2019-04-15T13:54:37.841547+00:00",
    ),
    ("github.com/gobuffalo/packr/@v/v1.22.0.zip", "2019-04-15T13:54:35.688900+00:00"),
    ("github.com/markbates/refresh/@v/v1.7.1.zip", "2019-04-15T13:54:35.250835+00:00"),
    (
        "github.com/mitchellh/go-homedir/@v/v1.1.0.zip",
        "2019-04-15T13:54:35.678214+00:00",
    ),
    ("github.com/nats-io/nuid/@v/v1.0.1.zip", "2019-04-11T18:47:28.102348+00:00"),
    ("github.com/oklog/ulid/@v/v1.3.1.zip", "2019-04-11T18:47:23.234198+00:00"),
    (
        "github.com/pkg/errors/@v/v0.0.0-20161002052512-839d9e913e06.zip",
        "2019-04-18T02:07:41.336899+00:00",
    ),
    (
        "golang.org/x/sys/@v/v0.0.0-20190220154126-629670e5acc5.zip",
        "2019-04-15T13:54:37.555525+00:00",
    ),
    ("golang.org/x/text/@v/v0.3.0.zip", "2019-04-10T19:08:52.997264+00:00"),
    (
        "golang.org/x/tools/@v/v0.0.0-20181213190329-bbccd8cae4a9.zip",
        "2019-04-15T13:54:41.905064+00:00",
    ),
    (
        "golang.org/x/tools/@v/v0.0.0-20190131142011-8dbcc66f33bb.zip",
        "2019-04-15T13:54:39.250757+00:00",
    ),
]


def test_golang_lister(swh_scheduler, mocker, requests_mock, datadir):
    """Run a full listing against a mocked index serving the fixture pages.

    Every page is preceded by a 429 and a 500 response, so the test also checks
    that such errors are retried with the expected sleep durations.
    """
    # first listing, should return one origin per package
    lister = GolangLister(scheduler=swh_scheduler)

    # Exponential retries take a long time, so stub time.sleep
    mocked_sleep = mocker.patch.object(lister.api_request.retry, "sleep")

    responses = []
    # `glob` yields files in a filesystem-dependent order; sort them so the
    # mocked index always serves the pages chronologically, like the real one.
    for page_file in sorted(Path(datadir).glob("page-*.txt")):
        # Test that throttling and server errors are retried
        responses.append({"text": "", "status_code": 429})
        responses.append({"text": "", "status_code": 500})
        responses.append({"text": page_file.read_text(), "status_code": 200})

    # Returns empty text when the list is exhausted
    responses.append({"text": "", "status_code": 200})
    requests_mock.get(GolangLister.GOLANG_MODULES_INDEX_URL, responses)

    stats = lister.run()

    assert stats.pages == 3
    # The two `golang.org/x/tools` versions are listed as separate origins
    assert stats.origins == 19

    scheduler_origins = sorted(
        swh_scheduler.get_listed_origins(lister.lister_obj.id).results,
        key=lambda origin: origin.url,
    )

    # Compare the sizes first: `zip` below silently stops at the shorter list
    assert len(scheduler_origins) == len(expected_listed)

    for scheduled, (url, timestamp) in zip(scheduler_origins, expected_listed):
        assert scheduled.url == url
        assert scheduled.last_update == parser.isoparse(timestamp)
        assert scheduled.visit_type == "golang"

    # Test `time.sleep` is called with exponential retries
    assert_sleep_calls(
        mocker, mocked_sleep, [1, WAIT_EXP_BASE, 1, WAIT_EXP_BASE, 1, WAIT_EXP_BASE]
    )