Page MenuHomeSoftware Heritage

D7329.id29999.diff
No OneTemporary

D7329.id29999.diff

diff --git a/README.md b/README.md
--- a/README.md
+++ b/README.md
@@ -17,6 +17,7 @@
- `swh.lister.github`
- `swh.lister.gitlab`
- `swh.lister.gnu`
+- `swh.lister.golang`
- `swh.lister.launchpad`
- `swh.lister.maven`
- `swh.lister.npm`
@@ -37,7 +38,7 @@
## lister configuration
Each lister implemented so far by Software Heritage (`bitbucket`, `cgit`, `cran`, `debian`,
-`gitea`, `github`, `gitlab`, `gnu`, `launchpad`, `npm`, `packagist`, `phabricator`, `pypi`, `tuleap`, `maven`)
+`gitea`, `github`, `gitlab`, `gnu`, `golang`, `launchpad`, `npm`, `packagist`, `phabricator`, `pypi`, `tuleap`, `maven`)
must be configured by following the instructions below (please note that you have to replace
`<lister_name>` by one of the lister name introduced above).
diff --git a/setup.py b/setup.py
--- a/setup.py
+++ b/setup.py
@@ -63,6 +63,7 @@
lister.github=swh.lister.github:register
lister.gitlab=swh.lister.gitlab:register
lister.gnu=swh.lister.gnu:register
+ lister.golang=swh.lister.golang:register
lister.launchpad=swh.lister.launchpad:register
lister.npm=swh.lister.npm:register
lister.opam=swh.lister.opam:register
diff --git a/swh/lister/golang/__init__.py b/swh/lister/golang/__init__.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/golang/__init__.py
@@ -0,0 +1,12 @@
+# Copyright (C) 2022 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+def register():
+    """Describe the Golang lister plugin.
+
+    Returns a mapping with the lister class under "lister" and the list of
+    task module names of this package under "task_modules".
+    """
+    # Import inside the function, so it only happens when register() is called
+    from .lister import GolangLister
+
+    task_modules = ["%s.tasks" % __name__]
+    return {"lister": GolangLister, "task_modules": task_modules}
diff --git a/swh/lister/golang/lister.py b/swh/lister/golang/lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/golang/lister.py
@@ -0,0 +1,132 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from datetime import datetime
+import json
+import logging
+from typing import Any, Dict, Iterator, List, Optional, Tuple
+
+import iso8601
+import requests
+from tenacity import before_sleep_log
+
+from swh.lister.utils import retry_policy_generic, throttling_retry
+from swh.scheduler.interface import SchedulerInterface
+from swh.scheduler.model import ListedOrigin
+
+from .. import USER_AGENT
+from ..pattern import CredentialsType, StatelessLister
+
+logger = logging.getLogger(__name__)
+
+GolangPageType = List[Dict[str, Any]]
+
+
+class GolangLister(StatelessLister[GolangPageType]):
+    """
+    List all Golang modules and send associated origins to scheduler.
+
+    The lister queries the Golang module index, whose documentation can be found
+    at https://index.golang.org
+    """
+
+    GOLANG_MODULES_INDEX_URL = "https://index.golang.org/index"
+    # `limit` seems to be... limited to 2000.
+    GOLANG_MODULES_INDEX_LIMIT = 2000
+    LISTER_NAME = "Golang"
+
+    def __init__(
+        self, scheduler: SchedulerInterface, credentials: CredentialsType = None,
+    ):
+        """Set up the lister and the HTTP session used to query the index.
+
+        Args:
+            scheduler: scheduler instance the listed origins are sent to
+            credentials: forwarded unchanged to the base lister
+        """
+        super().__init__(
+            scheduler=scheduler,
+            url=self.GOLANG_MODULES_INDEX_URL,
+            # Same value as before ("Golang"), taken from the class constant
+            instance=self.LISTER_NAME,
+            credentials=credentials,
+        )
+
+        self.session = requests.Session()
+        self.session.headers.update(
+            {"Accept": "application/json", "User-Agent": USER_AGENT}
+        )
+
+    @throttling_retry(
+        retry=retry_policy_generic,
+        before_sleep=before_sleep_log(logger, logging.WARNING),
+    )
+    def api_request(self, url: str) -> List[str]:
+        """Fetch `url` and return the response body split on whitespace.
+
+        The index answers with one JSON document per line (see the test
+        fixtures), so each returned item is expected to be one JSON-encoded
+        entry. An empty body yields an empty list.
+
+        Raises:
+            requests.HTTPError: through `raise_for_status()` on error statuses.
+        """
+        logger.debug("Fetching URL %s", url)
+
+        response = self.session.get(url)
+
+        if response.status_code not in (200, 304):
+            # Log the unexpected status code and URL to ease debugging
+            logger.warning(
+                "Unexpected HTTP status code %s for URL %s",
+                response.status_code,
+                response.url,
+            )
+
+        response.raise_for_status()
+
+        return response.text.split()
+
+    def get_single_page(
+        self, since: Optional[datetime] = None
+    ) -> Tuple[GolangPageType, Optional[datetime]]:
+        """Return a page from the API and the timestamp of its last entry.
+        Since all entries are sorted by chronological order, the timestamp is useful
+        both for pagination and later for incremental runs.
+
+        Args:
+            since: timezone-aware UTC datetime sent as the `since` query
+                parameter; the parameter is omitted when this is None.
+
+        Returns:
+            A pair made of the decoded entries (their "Timestamp" value replaced
+            by the parsed datetime) and the timestamp of the last entry. For an
+            empty page, `since` is handed back unchanged.
+        """
+        url = f"{self.url}?limit={self.GOLANG_MODULES_INDEX_LIMIT}"
+        if since is not None:
+            # The Golang index does not understand `+00:00` for some reason
+            # and expects the "timezone zero" notation instead. This works
+            # because all times are UTC.
+            utc_offset = since.utcoffset()
+            assert (
+                utc_offset is not None and utc_offset.total_seconds() == 0
+            ), "Non-UTC datetime"
+            as_date = since.isoformat().replace("+00:00", "Z")
+            url = f"{url}&since={as_date}"
+
+        entries = self.api_request(url)
+        page: GolangPageType = []
+        if not entries:
+            return page, since
+
+        for as_json in entries:
+            entry = json.loads(as_json)
+            timestamp = iso8601.parse_date(entry["Timestamp"])
+            # We've already parsed it and we'll need the datetime later, save it
+            entry["Timestamp"] = timestamp
+            page.append(entry)
+            # The index is guaranteed to be sorted in chronological order
+            since = timestamp
+
+        return page, since
+
+    def get_pages(self) -> Iterator[GolangPageType]:
+        """Yield the index pages one after the other, oldest entries first.
+
+        Each request resumes from the timestamp of the last entry of the
+        previous page. Listing ends on an empty page, or once that timestamp
+        no longer advances.
+        """
+        previous_since: Optional[datetime] = None
+        page, since = self.get_single_page()
+
+        while page:
+            yield page
+            # The index documents `since` as the oldest allowable timestamp,
+            # i.e. an inclusive bound: the entry we resume from is sent back
+            # again, so a page is never empty once the end is reached. A
+            # timestamp that did not move means there is nothing newer; the
+            # page was still yielded above so that no entry sharing that
+            # timestamp is dropped.
+            if since == previous_since:
+                break
+            previous_since = since
+            page, since = self.get_single_page(since=since)
+
+    def get_origins_from_page(self, page: GolangPageType) -> Iterator[ListedOrigin]:
+        """
+        Iterate on all Golang projects and yield ListedOrigin instances.
+
+        One origin is yielded per entry of `page`; its URL is the module path
+        appended to the Go module proxy URL and its last update is the entry
+        timestamp.
+        """
+        assert self.lister_obj.id is not None
+
+        for module in page:
+            path = module["Path"]
+            # See https://proxy.golang.org for documentation on the proxy protocol
+            origin_url = f"https://proxy.golang.org/{path}"
+            yield ListedOrigin(
+                lister_id=self.lister_obj.id,
+                url=origin_url,
+                visit_type="golang",
+                last_update=module["Timestamp"],
+            )
diff --git a/swh/lister/golang/tasks.py b/swh/lister/golang/tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/golang/tasks.py
@@ -0,0 +1,18 @@
+# Copyright (C) 2022 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from celery import shared_task
+
+from .lister import GolangLister
+
+
+@shared_task(name=__name__ + ".FullGolangLister")
+def list_golang(**lister_args):
+    """List the Golang module registry
+
+    Builds a GolangLister from the configuration file, forwarding
+    `lister_args`, runs it and returns the resulting statistics as a dict.
+    """
+    lister = GolangLister.from_configfile(**lister_args)
+    stats = lister.run()
+    return stats.dict()
+
+
+@shared_task(name=__name__ + ".ping")
+def _ping():
+ # Trivial task answering a constant; test_tasks.py sends it to check that
+ # the tasks of this module can be reached through the celery app.
+ return "OK"
diff --git a/swh/lister/golang/tests/__init__.py b/swh/lister/golang/tests/__init__.py
new file mode 100644
diff --git a/swh/lister/golang/tests/data/page-1.txt b/swh/lister/golang/tests/data/page-1.txt
new file mode 100644
--- /dev/null
+++ b/swh/lister/golang/tests/data/page-1.txt
@@ -0,0 +1,5 @@
+{"Path":"golang.org/x/text","Version":"v0.3.0","Timestamp":"2019-04-10T19:08:52.997264Z"}
+{"Path":"github.com/oklog/ulid","Version":"v1.3.1","Timestamp":"2019-04-11T18:47:23.234198Z"}
+{"Path":"collectd.org","Version":"v0.3.0","Timestamp":"2019-04-11T18:47:25.450546Z"}
+{"Path":"github.com/nats-io/nuid","Version":"v1.0.1","Timestamp":"2019-04-11T18:47:28.102348Z"}
+{"Path":"github.com/bmizerany/pat","Version":"v0.0.0-20170815010413-6226ea591a40","Timestamp":"2019-04-11T18:47:29.390564Z"}
\ No newline at end of file
diff --git a/swh/lister/golang/tests/data/page-2.txt b/swh/lister/golang/tests/data/page-2.txt
new file mode 100644
--- /dev/null
+++ b/swh/lister/golang/tests/data/page-2.txt
@@ -0,0 +1,4 @@
+{"Path":"github.com/djherbis/buffer","Version":"v1.0.0","Timestamp":"2019-04-11T18:47:29.974874Z"}
+{"Path":"github.com/djherbis/nio","Version":"v2.0.3+incompatible","Timestamp":"2019-04-11T18:47:32.283312Z"}
+{"Path":"github.com/gobuffalo/buffalo-plugins","Version":"v1.13.0","Timestamp":"2019-04-15T13:54:34.222985Z"}
+{"Path":"github.com/markbates/refresh","Version":"v1.7.1","Timestamp":"2019-04-15T13:54:35.250835Z"}
\ No newline at end of file
diff --git a/swh/lister/golang/tests/data/page-3.txt b/swh/lister/golang/tests/data/page-3.txt
new file mode 100644
--- /dev/null
+++ b/swh/lister/golang/tests/data/page-3.txt
@@ -0,0 +1,10 @@
+{"Path":"github.com/mitchellh/go-homedir","Version":"v1.1.0","Timestamp":"2019-04-15T13:54:35.678214Z"}
+{"Path":"github.com/gobuffalo/packr","Version":"v1.22.0","Timestamp":"2019-04-15T13:54:35.6889Z"}
+{"Path":"golang.org/x/sys","Version":"v0.0.0-20190220154126-629670e5acc5","Timestamp":"2019-04-15T13:54:37.555525Z"}
+{"Path":"github.com/gobuffalo/genny","Version":"v0.0.0-20190104222617-a71664fc38e7","Timestamp":"2019-04-15T13:54:37.841547Z"}
+{"Path":"github.com/blang/semver","Version":"v3.5.1+incompatible","Timestamp":"2019-04-15T13:54:39.107258Z"}
+{"Path":"github.com/gobuffalo/buffalo-pop","Version":"v1.3.0","Timestamp":"2019-04-15T13:54:39.135792Z"}
+{"Path":"golang.org/x/tools","Version":"v0.0.0-20190131142011-8dbcc66f33bb","Timestamp":"2019-04-15T13:54:39.250757Z"}
+{"Path":"github.com/gobuffalo/clara","Version":"v0.4.1","Timestamp":"2019-04-15T13:54:40.651916Z"}
+{"Path":"golang.org/x/tools","Version":"v0.0.0-20181213190329-bbccd8cae4a9","Timestamp":"2019-04-15T13:54:41.905064Z"}
+{"Path":"github.com/pkg/errors","Version":"v0.0.0-20161002052512-839d9e913e06","Timestamp":"2019-04-18T02:07:41.336899Z"}
\ No newline at end of file
diff --git a/swh/lister/golang/tests/test_lister.py b/swh/lister/golang/tests/test_lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/golang/tests/test_lister.py
@@ -0,0 +1,78 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from pathlib import Path
+
+import iso8601
+
+from swh.lister.golang.lister import GolangLister
+from swh.lister.tests.test_utils import assert_sleep_calls
+from swh.lister.utils import WAIT_EXP_BASE
+
+# https://proxy.golang.org prefix omitted
+expected_listed = [
+ ("collectd.org", "2019-04-11T18:47:25.450546+00:00"),
+ ("github.com/blang/semver", "2019-04-15T13:54:39.107258+00:00",),
+ ("github.com/bmizerany/pat", "2019-04-11T18:47:29.390564+00:00",),
+ ("github.com/djherbis/buffer", "2019-04-11T18:47:29.974874+00:00",),
+ ("github.com/djherbis/nio", "2019-04-11T18:47:32.283312+00:00",),
+ ("github.com/gobuffalo/buffalo-plugins", "2019-04-15T13:54:34.222985+00:00",),
+ ("github.com/gobuffalo/buffalo-pop", "2019-04-15T13:54:39.135792+00:00",),
+ ("github.com/gobuffalo/clara", "2019-04-15T13:54:40.651916+00:00",),
+ ("github.com/gobuffalo/genny", "2019-04-15T13:54:37.841547+00:00",),
+ ("github.com/gobuffalo/packr", "2019-04-15T13:54:35.688900+00:00",),
+ ("github.com/markbates/refresh", "2019-04-15T13:54:35.250835+00:00",),
+ ("github.com/mitchellh/go-homedir", "2019-04-15T13:54:35.678214+00:00",),
+ ("github.com/nats-io/nuid", "2019-04-11T18:47:28.102348+00:00",),
+ ("github.com/oklog/ulid", "2019-04-11T18:47:23.234198+00:00",),
+ ("github.com/pkg/errors", "2019-04-18T02:07:41.336899+00:00",),
+ ("golang.org/x/sys", "2019-04-15T13:54:37.555525+00:00",),
+ ("golang.org/x/text", "2019-04-10T19:08:52.997264+00:00"),
+ # only one x/tools origin is listed even though there are two versions, and
+ # only the latest one's timestamp is used.
+ ("golang.org/x/tools", "2019-04-15T13:54:41.905064+00:00",),
+]
+
+
+def test_golang_lister(swh_scheduler, mocker, requests_mock, datadir):
+    """Run a full listing against the canned index pages.
+
+    Every page is served only after a 429 and a 500 answer, which checks that
+    the lister retries; the recorded origins are then compared with
+    `expected_listed`.
+    """
+    # first listing, should return one origin per package
+    lister = GolangLister(scheduler=swh_scheduler)
+
+    # Exponential retries take a long time, so stub time.sleep
+    mocked_sleep = mocker.patch.object(lister.api_request.retry, "sleep")
+
+    responses = []
+    # glob() gives no ordering guarantee: sort so pages are always served in
+    # the same order
+    for page_file in sorted(Path(datadir).glob("page-*.txt")):
+        # Test that throttling and server errors are retried
+        responses.append({"text": "", "status_code": 429})
+        responses.append({"text": "", "status_code": 500})
+        responses.append({"text": page_file.read_text(), "status_code": 200})
+
+    # Returns empty text when the list is exhausted
+    responses.append({"text": "", "status_code": 200})
+    requests_mock.get(GolangLister.GOLANG_MODULES_INDEX_URL, responses)
+
+    stats = lister.run()
+
+    assert stats.pages == 3
+    # The two `golang.org/x/tools` versions are *not* listed as separate origins
+    assert stats.origins == 18
+
+    scheduler_origins = sorted(
+        swh_scheduler.get_listed_origins(lister.lister_obj.id).results,
+        key=lambda x: x.url,
+    )
+
+    # Compare lengths first: zip() below silently stops at the shortest input
+    assert len(scheduler_origins) == len(expected_listed)
+
+    for scheduled, (url, timestamp) in zip(scheduler_origins, expected_listed):
+        assert scheduled.url == f"https://proxy.golang.org/{url}"
+        assert scheduled.last_update == iso8601.parse_date(timestamp)
+        assert scheduled.visit_type == "golang"
+
+    # Test `time.sleep` is called with exponential retries
+    assert_sleep_calls(
+        mocker, mocked_sleep, [1, WAIT_EXP_BASE, 1, WAIT_EXP_BASE, 1, WAIT_EXP_BASE]
+    )
diff --git a/swh/lister/golang/tests/test_tasks.py b/swh/lister/golang/tests/test_tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/golang/tests/test_tasks.py
@@ -0,0 +1,32 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.lister.pattern import ListerStats
+
+
+def test_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker):
+    """The ping task of the golang lister completes and answers "OK"."""
+    task_name = "swh.lister.golang.tasks.ping"
+    result = swh_scheduler_celery_app.send_task(task_name)
+    assert result
+
+    result.wait()
+    assert result.successful()
+    assert result.result == "OK"
+
+
+def test_golang_full_listing_task(
+    swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker
+):
+    """The full listing task builds the lister, runs it and returns its stats."""
+    expected_stats = ListerStats(pages=1, origins=28000)
+
+    mocked_lister = mocker.patch("swh.lister.golang.tasks.GolangLister")
+    # from_configfile() hands back the mock itself, so run() is recorded on it
+    mocked_lister.from_configfile.return_value = mocked_lister
+    mocked_lister.run.return_value = expected_stats
+
+    task_name = "swh.lister.golang.tasks.FullGolangLister"
+    result = swh_scheduler_celery_app.send_task(task_name)
+    assert result
+
+    result.wait()
+    assert result.successful()
+    assert result.result == expected_stats.dict()
+
+    mocked_lister.from_configfile.assert_called_once_with()
+    mocked_lister.run.assert_called_once_with()

File Metadata

Mime Type
text/plain
Expires
Thu, Jul 3, 3:37 PM (1 w, 2 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3227310

Event Timeline