def register():
    """Describe the Golang lister plugin.

    This is the callable referenced by the ``lister.golang`` entry in
    ``setup.py``.

    Returns:
        A dict with the lister class under ``"lister"`` and the list of task
        module names under ``"task_modules"``.
    """
    # The lister class is imported at call time, not at package import time.
    from .lister import GolangLister

    tasks_module = f"{__name__}.tasks"
    return {"lister": GolangLister, "task_modules": [tasks_module]}
class GolangLister(StatelessLister[GolangPageType]):
    """
    List all Golang modules and send associated origins to scheduler.

    The lister queries the Golang module index, whose documentation can be found
    at https://index.golang.org
    """

    GOLANG_MODULES_INDEX_URL = "https://index.golang.org/index"
    # `limit` seems to be... limited to 2000.
    GOLANG_MODULES_INDEX_LIMIT = 2000
    LISTER_NAME = "Golang"

    def __init__(
        self,
        scheduler: SchedulerInterface,
        credentials: CredentialsType = None,
    ):
        """Set up the lister and the HTTP session used to query the index.

        Args:
            scheduler: scheduler interface, forwarded to the base lister
            credentials: credentials, forwarded to the base lister
        """
        super().__init__(
            scheduler=scheduler,
            url=self.GOLANG_MODULES_INDEX_URL,
            instance="Golang",
            credentials=credentials,
        )

        self.session = requests.Session()
        self.session.headers.update(
            {"Accept": "application/json", "User-Agent": USER_AGENT}
        )

    @throttling_retry(
        retry=retry_policy_generic,
        before_sleep=before_sleep_log(logger, logging.WARNING),
    )
    def api_request(self, url: str) -> List[str]:
        """Fetch ``url`` and return the non-empty lines of the response body.

        Each returned string is expected to be one JSON-encoded index entry.

        Raises:
            requests.HTTPError: raised by ``raise_for_status()`` when the
                server answers with an HTTP error status.
        """
        logger.debug("Fetching URL %s", url)

        response = self.session.get(url)

        if response.status_code not in (200, 304):
            # Log the unexpected status and URL to ease debugging
            logger.warning(
                "Unexpected HTTP status code %s for URL %s",
                response.status_code,
                response.url,
            )

        response.raise_for_status()

        # The body holds one JSON document per line. Split on line boundaries
        # only (splitting on any whitespace would cut an entry containing a
        # space in two) and skip blank lines, which are not valid JSON.
        return [line for line in response.text.splitlines() if line.strip()]

    def get_single_page(
        self, since: Optional[datetime] = None
    ) -> Tuple[GolangPageType, Optional[datetime]]:
        """Return a page from the API and the timestamp of its last entry.
        Since all entries are sorted by chronological order, the timestamp is useful
        both for pagination and later for incremental runs.

        Args:
            since: if given, only request entries from this (UTC) timestamp on

        Returns:
            A ``(page, since)`` tuple. ``page`` is the list of decoded entries,
            whose ``"Timestamp"`` values are replaced by parsed datetimes.
            ``since`` is the timestamp of the last entry, or the value passed in
            when the index returned no entry.
        """
        url = f"{self.url}?limit={self.GOLANG_MODULES_INDEX_LIMIT}"
        if since is not None:
            # The Golang index does not understand `+00:00` for some reason
            # and expects the "timezone zero" notation instead. This works
            # because all times are UTC.
            utc_offset = since.utcoffset()
            assert (
                utc_offset is not None and utc_offset.total_seconds() == 0
            ), "Non-UTC datetime"
            as_date = since.isoformat().replace("+00:00", "Z")
            url = f"{url}&since={as_date}"

        entries = self.api_request(url)
        page: GolangPageType = []
        if not entries:
            return page, since

        for as_json in entries:
            entry = json.loads(as_json)
            timestamp = iso8601.parse_date(entry["Timestamp"])
            # We've already parsed it and we'll need the datetime later, save it
            entry["Timestamp"] = timestamp
            page.append(entry)
            # The index is guaranteed to be sorted in chronological order
            since = timestamp

        return page, since

    def get_pages(self) -> Iterator[GolangPageType]:
        """Yield successive pages of the index, starting from its beginning.

        Iteration stops when the index returns an empty page, or when a request
        does not move the last-seen timestamp forward.
        """
        page, since = self.get_single_page()
        last_since = since
        while page:
            yield page
            page, since = self.get_single_page(since=since)
            if last_since == since:
                # The index returns packages whose timestamp are greater or
                # equal to the date provided as parameter, which will create
                # an infinite loop if not stopped here.
                # A bare `return` is used: a value returned from a generator
                # is never seen by code that simply iterates over it.
                return
            last_since = since

    def get_origins_from_page(self, page: GolangPageType) -> Iterator[ListedOrigin]:
        """
        Iterate on all Golang projects and yield ListedOrigin instances.
        """
        assert self.lister_obj.id is not None

        for module in page:
            path = module["Path"]
            # The loader will be expected to use the golang proxy to do the
            # actual downloading. We're using `pkg.go.dev` so that the URL points
            # to somewhere useful for a human instead of an (incomplete) API path.
            origin_url = f"https://pkg.go.dev/{path}"

            # Since the Go index lists versions and not just packages, there will
            # be duplicates. Fortunately, `ListedOrigins` are "upserted" server-side,
            # so only the last timestamp will be used, with no duplicates.
            # Performance should not be an issue as they are sent to the db in bulk.
            yield ListedOrigin(
                lister_id=self.lister_obj.id,
                url=origin_url,
                visit_type="golang",
                last_update=module["Timestamp"],
            )
a/swh/lister/golang/tests/data/page-2.txt b/swh/lister/golang/tests/data/page-2.txt new file mode 100644 --- /dev/null +++ b/swh/lister/golang/tests/data/page-2.txt @@ -0,0 +1,4 @@ +{"Path":"github.com/djherbis/buffer","Version":"v1.0.0","Timestamp":"2019-04-11T18:47:29.974874Z"} +{"Path":"github.com/djherbis/nio","Version":"v2.0.3+incompatible","Timestamp":"2019-04-11T18:47:32.283312Z"} +{"Path":"github.com/gobuffalo/buffalo-plugins","Version":"v1.13.0","Timestamp":"2019-04-15T13:54:34.222985Z"} +{"Path":"github.com/markbates/refresh","Version":"v1.7.1","Timestamp":"2019-04-15T13:54:35.250835Z"} \ No newline at end of file diff --git a/swh/lister/golang/tests/data/page-3.txt b/swh/lister/golang/tests/data/page-3.txt new file mode 100644 --- /dev/null +++ b/swh/lister/golang/tests/data/page-3.txt @@ -0,0 +1,10 @@ +{"Path":"github.com/mitchellh/go-homedir","Version":"v1.1.0","Timestamp":"2019-04-15T13:54:35.678214Z"} +{"Path":"github.com/gobuffalo/packr","Version":"v1.22.0","Timestamp":"2019-04-15T13:54:35.6889Z"} +{"Path":"golang.org/x/sys","Version":"v0.0.0-20190220154126-629670e5acc5","Timestamp":"2019-04-15T13:54:37.555525Z"} +{"Path":"github.com/gobuffalo/genny","Version":"v0.0.0-20190104222617-a71664fc38e7","Timestamp":"2019-04-15T13:54:37.841547Z"} +{"Path":"github.com/blang/semver","Version":"v3.5.1+incompatible","Timestamp":"2019-04-15T13:54:39.107258Z"} +{"Path":"github.com/gobuffalo/buffalo-pop","Version":"v1.3.0","Timestamp":"2019-04-15T13:54:39.135792Z"} +{"Path":"golang.org/x/tools","Version":"v0.0.0-20190131142011-8dbcc66f33bb","Timestamp":"2019-04-15T13:54:39.250757Z"} +{"Path":"github.com/gobuffalo/clara","Version":"v0.4.1","Timestamp":"2019-04-15T13:54:40.651916Z"} +{"Path":"golang.org/x/tools","Version":"v0.0.0-20181213190329-bbccd8cae4a9","Timestamp":"2019-04-15T13:54:41.905064Z"} +{"Path":"github.com/pkg/errors","Version":"v0.0.0-20161002052512-839d9e913e06","Timestamp":"2019-04-18T02:07:41.336899Z"} \ No newline at end of file diff --git 
# https://pkg.go.dev prefix omitted
expected_listed = [
    ("collectd.org", "2019-04-11T18:47:25.450546+00:00"),
    ("github.com/blang/semver", "2019-04-15T13:54:39.107258+00:00"),
    ("github.com/bmizerany/pat", "2019-04-11T18:47:29.390564+00:00"),
    ("github.com/djherbis/buffer", "2019-04-11T18:47:29.974874+00:00"),
    ("github.com/djherbis/nio", "2019-04-11T18:47:32.283312+00:00"),
    ("github.com/gobuffalo/buffalo-plugins", "2019-04-15T13:54:34.222985+00:00"),
    ("github.com/gobuffalo/buffalo-pop", "2019-04-15T13:54:39.135792+00:00"),
    ("github.com/gobuffalo/clara", "2019-04-15T13:54:40.651916+00:00"),
    ("github.com/gobuffalo/genny", "2019-04-15T13:54:37.841547+00:00"),
    ("github.com/gobuffalo/packr", "2019-04-15T13:54:35.688900+00:00"),
    ("github.com/markbates/refresh", "2019-04-15T13:54:35.250835+00:00"),
    ("github.com/mitchellh/go-homedir", "2019-04-15T13:54:35.678214+00:00"),
    ("github.com/nats-io/nuid", "2019-04-11T18:47:28.102348+00:00"),
    ("github.com/oklog/ulid", "2019-04-11T18:47:23.234198+00:00"),
    ("github.com/pkg/errors", "2019-04-18T02:07:41.336899+00:00"),
    ("golang.org/x/sys", "2019-04-15T13:54:37.555525+00:00"),
    ("golang.org/x/text", "2019-04-10T19:08:52.997264+00:00"),
    # only one x/tools listed even though there are two versions, and only the
    # latest one's timestamp is used.
    ("golang.org/x/tools", "2019-04-15T13:54:41.905064+00:00"),
]


def _generate_responses(datadir, requests_mock):
    """Register the mocked index responses built from the ``page-*.txt`` fixtures.

    Each fixture page is preceded by a 429 and a 500 response so that the
    lister's retry handling is exercised before every successful answer.
    """
    responses = []
    # glob() gives no ordering guarantee; sort so that pages are always replayed
    # in the same (chronological) order, whatever the filesystem.
    for file in sorted(Path(datadir).glob("page-*.txt")):
        # Test that throttling and server errors are retried
        responses.append({"text": "", "status_code": 429})
        responses.append({"text": "", "status_code": 500})
        responses.append({"text": file.read_text(), "status_code": 200})

    # Also test that the lister appropriately gets out of the infinite loop.
    # NOTE(review): this relies on requests_mock repeating the last response
    # once the list is exhausted -- confirm against the requests-mock docs.
    requests_mock.get(GolangLister.GOLANG_MODULES_INDEX_URL, responses)


def test_golang_lister(swh_scheduler, mocker, requests_mock, datadir):
    """Run the lister twice against the mocked index and check what it records."""
    # first listing, should return one origin per package
    lister = GolangLister(scheduler=swh_scheduler)

    # Exponential retries take a long time, so stub time.sleep
    mocked_sleep = mocker.patch.object(lister.api_request.retry, "sleep")

    _generate_responses(datadir, requests_mock)

    stats = lister.run()

    assert stats.pages == 3
    # The two `golang.org/x/tools` versions are *not* listed as separate origins
    assert stats.origins == 18

    scheduler_origins = sorted(
        swh_scheduler.get_listed_origins(lister.lister_obj.id).results,
        key=lambda x: x.url,
    )

    # Compare lengths first: zip() below silently stops at the shorter sequence
    assert len(scheduler_origins) == len(expected_listed)

    for scheduled, (url, timestamp) in zip(scheduler_origins, expected_listed):
        assert scheduled.url == f"https://pkg.go.dev/{url}"
        assert scheduled.last_update == iso8601.parse_date(timestamp)
        assert scheduled.visit_type == "golang"

    # Test `time.sleep` is called with exponential retries
    assert_sleep_calls(
        mocker, mocked_sleep, [1, WAIT_EXP_BASE, 1, WAIT_EXP_BASE, 1, WAIT_EXP_BASE]
    )

    # doing it all again (without incremental) should give us the same result
    lister = GolangLister(scheduler=swh_scheduler)
    mocker.patch.object(lister.api_request.retry, "sleep")
    _generate_responses(datadir, requests_mock)
    stats = lister.run()

    assert stats.pages == 3
    assert stats.origins == 18
def test_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker):
    """The ``ping`` task of the golang lister runs and answers "OK"."""
    async_result = swh_scheduler_celery_app.send_task("swh.lister.golang.tasks.ping")
    assert async_result

    async_result.wait()

    assert async_result.successful()
    assert async_result.result == "OK"


def test_golang_full_listing_task(
    swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker
):
    """The full listing task builds the lister from its config file, runs it
    and returns the resulting stats as a dict."""
    expected_stats = ListerStats(pages=1, origins=28000)

    # Replace the lister class seen by the tasks module with a mock whose
    # `from_configfile()` hands back the mock itself.
    lister_mock = mocker.patch("swh.lister.golang.tasks.GolangLister")
    lister_mock.from_configfile.return_value = lister_mock
    lister_mock.run.return_value = expected_stats

    async_result = swh_scheduler_celery_app.send_task(
        "swh.lister.golang.tasks.FullGolangLister"
    )
    assert async_result

    async_result.wait()

    assert async_result.successful()
    assert async_result.result == expected_stats.dict()

    lister_mock.from_configfile.assert_called_once_with()
    lister_mock.run.assert_called_once_with()