Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F9345943
D7329.id29999.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
14 KB
Subscribers
None
D7329.id29999.diff
View Options
diff --git a/README.md b/README.md
--- a/README.md
+++ b/README.md
@@ -17,6 +17,7 @@
- `swh.lister.github`
- `swh.lister.gitlab`
- `swh.lister.gnu`
+- `swh.lister.golang`
- `swh.lister.launchpad`
- `swh.lister.maven`
- `swh.lister.npm`
@@ -37,7 +38,7 @@
## lister configuration
Each lister implemented so far by Software Heritage (`bitbucket`, `cgit`, `cran`, `debian`,
-`gitea`, `github`, `gitlab`, `gnu`, `launchpad`, `npm`, `packagist`, `phabricator`, `pypi`, `tuleap`, `maven`)
+`gitea`, `github`, `gitlab`, `gnu`, `golang`, `launchpad`, `npm`, `packagist`, `phabricator`, `pypi`, `tuleap`, `maven`)
must be configured by following the instructions below (please note that you have to replace
`<lister_name>` by one of the lister name introduced above).
diff --git a/setup.py b/setup.py
--- a/setup.py
+++ b/setup.py
@@ -63,6 +63,7 @@
lister.github=swh.lister.github:register
lister.gitlab=swh.lister.gitlab:register
lister.gnu=swh.lister.gnu:register
+ lister.golang=swh.lister.golang:register
lister.launchpad=swh.lister.launchpad:register
lister.npm=swh.lister.npm:register
lister.opam=swh.lister.opam:register
diff --git a/swh/lister/golang/__init__.py b/swh/lister/golang/__init__.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/golang/__init__.py
@@ -0,0 +1,12 @@
+# Copyright (C) 2022 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+def register():
+ from .lister import GolangLister
+
+ return {
+ "lister": GolangLister,
+ "task_modules": ["%s.tasks" % __name__],
+ }
diff --git a/swh/lister/golang/lister.py b/swh/lister/golang/lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/golang/lister.py
@@ -0,0 +1,132 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from datetime import datetime
+import json
+import logging
+from typing import Any, Dict, Iterator, List, Optional, Tuple
+
+import iso8601
+import requests
+from tenacity import before_sleep_log
+
+from swh.lister.utils import retry_policy_generic, throttling_retry
+from swh.scheduler.interface import SchedulerInterface
+from swh.scheduler.model import ListedOrigin
+
+from .. import USER_AGENT
+from ..pattern import CredentialsType, StatelessLister
+
+logger = logging.getLogger(__name__)
+
+GolangPageType = List[Dict[str, Any]]
+
+
+class GolangLister(StatelessLister[GolangPageType]):
+ """
+ List all Golang modules and send associated origins to scheduler.
+
+ The lister queries the Golang module index, whose documentation can be found
+ at https://index.golang.org
+ """
+
+ GOLANG_MODULES_INDEX_URL = "https://index.golang.org/index"
+ # `limit` seems to be... limited to 2000.
+ GOLANG_MODULES_INDEX_LIMIT = 2000
+ LISTER_NAME = "Golang"
+
+ def __init__(
+ self, scheduler: SchedulerInterface, credentials: CredentialsType = None,
+ ):
+ super().__init__(
+ scheduler=scheduler,
+ url=self.GOLANG_MODULES_INDEX_URL,
+ instance="Golang",
+ credentials=credentials,
+ )
+
+ self.session = requests.Session()
+ self.session.headers.update(
+ {"Accept": "application/json", "User-Agent": USER_AGENT}
+ )
+
+ @throttling_retry(
+ retry=retry_policy_generic,
+ before_sleep=before_sleep_log(logger, logging.WARNING),
+ )
+ def api_request(self, url: str) -> List[str]:
+ logger.debug("Fetching URL %s", url)
+
+ response = self.session.get(url)
+
+ if response.status_code not in (200, 304):
+ # Log the unexpected status code and URL to ease debugging
+ logger.warning(
+ "Unexpected HTTP status code %s for URL %s",
+ response.status_code,
+ response.url,
+ )
+
+ response.raise_for_status()
+
+ return response.text.split()
+
+ def get_single_page(
+ self, since: Optional[datetime] = None
+ ) -> Tuple[GolangPageType, Optional[datetime]]:
+ """Return a page from the API and the timestamp of its last entry.
+ Since all entries are sorted by chronological order, the timestamp is useful
+ both for pagination and later for incremental runs."""
+ url = f"{self.url}?limit={self.GOLANG_MODULES_INDEX_LIMIT}"
+ if since is not None:
+ # The Golang index does not understand `+00:00` for some reason
+ # and expects the "timezone zero" notation instead. This works
+ # because all times are UTC.
+ utc_offset = since.utcoffset()
+ assert (
+ utc_offset is not None and utc_offset.total_seconds() == 0
+ ), "Non-UTC datetime"
+ as_date = since.isoformat().replace("+00:00", "Z")
+ url = f"{url}&since={as_date}"
+
+ entries = self.api_request(url)
+ page: GolangPageType = []
+ if not entries:
+ return page, since
+
+ for as_json in entries:
+ entry = json.loads(as_json)
+ timestamp = iso8601.parse_date(entry["Timestamp"])
+ # We've already parsed it and we'll need the datetime later, save it
+ entry["Timestamp"] = timestamp
+ page.append(entry)
+ # The index is guaranteed to be sorted in chronological order
+ since = timestamp
+
+ return page, since
+
+ def get_pages(self) -> Iterator[GolangPageType]:
+ page, since = self.get_single_page()
+
+ while page:
+ yield page
+ page, since = self.get_single_page(since=since)
+
+ def get_origins_from_page(self, page: GolangPageType) -> Iterator[ListedOrigin]:
+ """
+ Iterate on all Golang projects and yield ListedOrigin instances.
+ """
+ assert self.lister_obj.id is not None
+
+ for module in page:
+ path = module["Path"]
+ # See https://proxy.golang.org for documentation on the proxy protocol
+ origin_url = f"https://proxy.golang.org/{path}"
+ yield ListedOrigin(
+ lister_id=self.lister_obj.id,
+ url=origin_url,
+ visit_type="golang",
+ last_update=module["Timestamp"],
+ )
diff --git a/swh/lister/golang/tasks.py b/swh/lister/golang/tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/golang/tasks.py
@@ -0,0 +1,18 @@
+# Copyright (C) 2022 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from celery import shared_task
+
+from .lister import GolangLister
+
+
+@shared_task(name=__name__ + ".FullGolangLister")
+def list_golang(**lister_args):
+ "List the Golang module registry"
+ return GolangLister.from_configfile(**lister_args).run().dict()
+
+
+@shared_task(name=__name__ + ".ping")
+def _ping():
+ return "OK"
diff --git a/swh/lister/golang/tests/__init__.py b/swh/lister/golang/tests/__init__.py
new file mode 100644
diff --git a/swh/lister/golang/tests/data/page-1.txt b/swh/lister/golang/tests/data/page-1.txt
new file mode 100644
--- /dev/null
+++ b/swh/lister/golang/tests/data/page-1.txt
@@ -0,0 +1,5 @@
+{"Path":"golang.org/x/text","Version":"v0.3.0","Timestamp":"2019-04-10T19:08:52.997264Z"}
+{"Path":"github.com/oklog/ulid","Version":"v1.3.1","Timestamp":"2019-04-11T18:47:23.234198Z"}
+{"Path":"collectd.org","Version":"v0.3.0","Timestamp":"2019-04-11T18:47:25.450546Z"}
+{"Path":"github.com/nats-io/nuid","Version":"v1.0.1","Timestamp":"2019-04-11T18:47:28.102348Z"}
+{"Path":"github.com/bmizerany/pat","Version":"v0.0.0-20170815010413-6226ea591a40","Timestamp":"2019-04-11T18:47:29.390564Z"}
\ No newline at end of file
diff --git a/swh/lister/golang/tests/data/page-2.txt b/swh/lister/golang/tests/data/page-2.txt
new file mode 100644
--- /dev/null
+++ b/swh/lister/golang/tests/data/page-2.txt
@@ -0,0 +1,4 @@
+{"Path":"github.com/djherbis/buffer","Version":"v1.0.0","Timestamp":"2019-04-11T18:47:29.974874Z"}
+{"Path":"github.com/djherbis/nio","Version":"v2.0.3+incompatible","Timestamp":"2019-04-11T18:47:32.283312Z"}
+{"Path":"github.com/gobuffalo/buffalo-plugins","Version":"v1.13.0","Timestamp":"2019-04-15T13:54:34.222985Z"}
+{"Path":"github.com/markbates/refresh","Version":"v1.7.1","Timestamp":"2019-04-15T13:54:35.250835Z"}
\ No newline at end of file
diff --git a/swh/lister/golang/tests/data/page-3.txt b/swh/lister/golang/tests/data/page-3.txt
new file mode 100644
--- /dev/null
+++ b/swh/lister/golang/tests/data/page-3.txt
@@ -0,0 +1,10 @@
+{"Path":"github.com/mitchellh/go-homedir","Version":"v1.1.0","Timestamp":"2019-04-15T13:54:35.678214Z"}
+{"Path":"github.com/gobuffalo/packr","Version":"v1.22.0","Timestamp":"2019-04-15T13:54:35.6889Z"}
+{"Path":"golang.org/x/sys","Version":"v0.0.0-20190220154126-629670e5acc5","Timestamp":"2019-04-15T13:54:37.555525Z"}
+{"Path":"github.com/gobuffalo/genny","Version":"v0.0.0-20190104222617-a71664fc38e7","Timestamp":"2019-04-15T13:54:37.841547Z"}
+{"Path":"github.com/blang/semver","Version":"v3.5.1+incompatible","Timestamp":"2019-04-15T13:54:39.107258Z"}
+{"Path":"github.com/gobuffalo/buffalo-pop","Version":"v1.3.0","Timestamp":"2019-04-15T13:54:39.135792Z"}
+{"Path":"golang.org/x/tools","Version":"v0.0.0-20190131142011-8dbcc66f33bb","Timestamp":"2019-04-15T13:54:39.250757Z"}
+{"Path":"github.com/gobuffalo/clara","Version":"v0.4.1","Timestamp":"2019-04-15T13:54:40.651916Z"}
+{"Path":"golang.org/x/tools","Version":"v0.0.0-20181213190329-bbccd8cae4a9","Timestamp":"2019-04-15T13:54:41.905064Z"}
+{"Path":"github.com/pkg/errors","Version":"v0.0.0-20161002052512-839d9e913e06","Timestamp":"2019-04-18T02:07:41.336899Z"}
\ No newline at end of file
diff --git a/swh/lister/golang/tests/test_lister.py b/swh/lister/golang/tests/test_lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/golang/tests/test_lister.py
@@ -0,0 +1,78 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from pathlib import Path
+
+import iso8601
+
+from swh.lister.golang.lister import GolangLister
+from swh.lister.tests.test_utils import assert_sleep_calls
+from swh.lister.utils import WAIT_EXP_BASE
+
+# https://proxy.golang.org prefix omitted
+expected_listed = [
+ ("collectd.org", "2019-04-11T18:47:25.450546+00:00"),
+ ("github.com/blang/semver", "2019-04-15T13:54:39.107258+00:00",),
+ ("github.com/bmizerany/pat", "2019-04-11T18:47:29.390564+00:00",),
+ ("github.com/djherbis/buffer", "2019-04-11T18:47:29.974874+00:00",),
+ ("github.com/djherbis/nio", "2019-04-11T18:47:32.283312+00:00",),
+ ("github.com/gobuffalo/buffalo-plugins", "2019-04-15T13:54:34.222985+00:00",),
+ ("github.com/gobuffalo/buffalo-pop", "2019-04-15T13:54:39.135792+00:00",),
+ ("github.com/gobuffalo/clara", "2019-04-15T13:54:40.651916+00:00",),
+ ("github.com/gobuffalo/genny", "2019-04-15T13:54:37.841547+00:00",),
+ ("github.com/gobuffalo/packr", "2019-04-15T13:54:35.688900+00:00",),
+ ("github.com/markbates/refresh", "2019-04-15T13:54:35.250835+00:00",),
+ ("github.com/mitchellh/go-homedir", "2019-04-15T13:54:35.678214+00:00",),
+ ("github.com/nats-io/nuid", "2019-04-11T18:47:28.102348+00:00",),
+ ("github.com/oklog/ulid", "2019-04-11T18:47:23.234198+00:00",),
+ ("github.com/pkg/errors", "2019-04-18T02:07:41.336899+00:00",),
+ ("golang.org/x/sys", "2019-04-15T13:54:37.555525+00:00",),
+ ("golang.org/x/text", "2019-04-10T19:08:52.997264+00:00"),
+ # only one x/tools is listed even though there are two versions, and only the
+ # latest one's timestamp is used.
+ ("golang.org/x/tools", "2019-04-15T13:54:41.905064+00:00",),
+]
+
+
+def test_golang_lister(swh_scheduler, mocker, requests_mock, datadir):
+ # first listing, should return one origin per package
+ lister = GolangLister(scheduler=swh_scheduler)
+
+ # Exponential retries take a long time, so stub time.sleep
+ mocked_sleep = mocker.patch.object(lister.api_request.retry, "sleep")
+
+ responses = []
+ for file in Path(datadir).glob("page-*.txt"):
+ # Test that throttling and server errors are retried
+ responses.append({"text": "", "status_code": 429})
+ responses.append({"text": "", "status_code": 500})
+ responses.append({"text": file.read_text(), "status_code": 200})
+
+ # Returns empty text when the list is exhausted
+ responses.append({"text": "", "status_code": 200})
+ requests_mock.get(GolangLister.GOLANG_MODULES_INDEX_URL, responses)
+
+ stats = lister.run()
+
+ assert stats.pages == 3
+ # The two `golang.org/x/tools` versions are *not* listed as separate origins
+ assert stats.origins == 18
+
+ scheduler_origins = sorted(
+ swh_scheduler.get_listed_origins(lister.lister_obj.id).results,
+ key=lambda x: x.url,
+ )
+
+ for scheduled, (url, timestamp) in zip(scheduler_origins, expected_listed):
+ assert scheduled.url == f"https://proxy.golang.org/{url}"
+ assert scheduled.last_update == iso8601.parse_date(timestamp)
+ assert scheduled.visit_type == "golang"
+
+ assert len(scheduler_origins) == len(expected_listed)
+
+ # Test `time.sleep` is called with exponential retries
+ assert_sleep_calls(
+ mocker, mocked_sleep, [1, WAIT_EXP_BASE, 1, WAIT_EXP_BASE, 1, WAIT_EXP_BASE]
+ )
diff --git a/swh/lister/golang/tests/test_tasks.py b/swh/lister/golang/tests/test_tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/golang/tests/test_tasks.py
@@ -0,0 +1,32 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.lister.pattern import ListerStats
+
+
+def test_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker):
+ res = swh_scheduler_celery_app.send_task("swh.lister.golang.tasks.ping")
+ assert res
+ res.wait()
+ assert res.successful()
+ assert res.result == "OK"
+
+
+def test_golang_full_listing_task(
+ swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker
+):
+ lister = mocker.patch("swh.lister.golang.tasks.GolangLister")
+ lister.from_configfile.return_value = lister
+ stats = ListerStats(pages=1, origins=28000)
+ lister.run.return_value = stats
+
+ res = swh_scheduler_celery_app.send_task("swh.lister.golang.tasks.FullGolangLister")
+ assert res
+ res.wait()
+ assert res.successful()
+ assert res.result == stats.dict()
+
+ lister.from_configfile.assert_called_once_with()
+ lister.run.assert_called_once_with()
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Thu, Jul 3, 3:37 PM (1 w, 3 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3227310
Attached To
D7329: Non-incremental Golang module lister
Event Timeline
Log In to Comment