diff --git a/README.md b/README.md index 36434dd..f54483f 100644 --- a/README.md +++ b/README.md @@ -1,103 +1,104 @@ swh-lister ========== This component from the Software Heritage stack aims to produce listings of software origins and their urls hosted on various public developer platforms or package managers. As these operations are quite similar, it provides a set of Python modules abstracting common software origins listing behaviors. It also provides several lister implementations, contained in the following Python modules: - `swh.lister.bitbucket` - `swh.lister.cgit` - `swh.lister.cran` - `swh.lister.debian` - `swh.lister.gitea` - `swh.lister.github` - `swh.lister.gitlab` - `swh.lister.gnu` +- `swh.lister.golang` - `swh.lister.launchpad` - `swh.lister.maven` - `swh.lister.npm` - `swh.lister.packagist` - `swh.lister.phabricator` - `swh.lister.pypi` - `swh.lister.tuleap` - `swh.lister.gogs` Dependencies ------------ All required dependencies can be found in the `requirements*.txt` files located at the root of the repository. Local deployment ---------------- ## lister configuration Each lister implemented so far by Software Heritage (`bitbucket`, `cgit`, `cran`, `debian`, -`gitea`, `github`, `gitlab`, `gnu`, `launchpad`, `npm`, `packagist`, `phabricator`, `pypi`, `tuleap`, `maven`) +`gitea`, `github`, `gitlab`, `gnu`, `golang`, `launchpad`, `npm`, `packagist`, `phabricator`, `pypi`, `tuleap`, `maven`) must be configured by following the instructions below (please note that you have to replace `<lister_name>` by one of the lister names introduced above). ### Preparation steps 1. `mkdir ~/.config/swh/` 2. 
create configuration file `~/.config/swh/listers.yml` ### Configuration file sample Minimalistic configuration shared by all listers to add in file `~/.config/swh/listers.yml`: ```lang=yml scheduler: cls: 'remote' args: url: 'http://localhost:5008/' credentials: {} ``` Note: This expects the scheduler service to be running locally on port 5008. ## Executing a lister Once configured, a lister can be executed by using the `swh` CLI tool with the following options and commands: ``` $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister <lister_name> [lister_parameters] ``` Examples: ``` $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister bitbucket $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister cran $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister gitea url=https://codeberg.org/api/v1/ $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister gitlab url=https://salsa.debian.org/api/v4/ $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister npm $ swh --log-level DEBUG lister -C ~/.config/swh/listers.yml run --lister pypi ``` Licensing --------- This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. See top-level LICENSE file for the full text of the GNU General Public License along with this program. 
def parse_requirements(name=None):
    """Read a pip requirements file and return its requirement specifiers.

    The file name is relative, so it is resolved against the current working
    directory.

    Args:
        name: optional suffix selecting ``requirements-<name>.txt``; when
            falsy, ``requirements.txt`` is read instead.

    Returns:
        The stripped, non-empty lines of the file that do not start with
        ``#``, in file order. An empty list when the file does not exist.
    """
    reqf = f"requirements-{name}.txt" if name else "requirements.txt"

    # A missing requirements file simply contributes no requirements.
    if not path.exists(reqf):
        return []

    # Explicit encoding, consistent with how README.md is read in this file,
    # so the result does not depend on the build machine's locale.
    with open(reqf, encoding="utf-8") as f:
        # Iterate the file lazily rather than materializing it via readlines().
        stripped = (line.strip() for line in f)
        return [line for line in stripped if line and not line.startswith("#")]
def register():
    """Return the plugin description of the Golang lister.

    The mapping exposes the lister class under ``"lister"`` and the dotted
    name of this package's ``tasks`` module under ``"task_modules"``.
    """
    from .lister import GolangLister

    tasks_module = f"{__name__}.tasks"
    return {"lister": GolangLister, "task_modules": [tasks_module]}
class GolangLister(StatelessLister[GolangPageType]):
    """
    List all Golang modules and send associated origins to scheduler.

    The lister queries the Golang module index, whose documentation can be found
    at https://index.golang.org
    """

    GOLANG_MODULES_INDEX_URL = "https://index.golang.org/index"
    # `limit` seems to be... limited to 2000.
    GOLANG_MODULES_INDEX_LIMIT = 2000
    LISTER_NAME = "Golang"

    def __init__(
        self, scheduler: SchedulerInterface, credentials: CredentialsType = None,
    ):
        """Set up the lister and the HTTP session used to query the index.

        Args:
            scheduler: scheduler interface forwarded to the base lister.
            credentials: optional credentials forwarded to the base lister.
        """
        super().__init__(
            scheduler=scheduler,
            url=self.GOLANG_MODULES_INDEX_URL,
            instance="Golang",
            credentials=credentials,
        )

        # A single session is reused for every request made by this lister.
        self.session = requests.Session()
        self.session.headers.update(
            {"Accept": "application/json", "User-Agent": USER_AGENT}
        )

    @throttling_retry(
        retry=retry_policy_generic,
        before_sleep=before_sleep_log(logger, logging.WARNING),
    )
    def api_request(self, url: str) -> List[str]:
        """Fetch ``url`` and return the non-blank lines of the response body.

        Each returned line is later decoded as one JSON document by
        :meth:`get_single_page`.

        Args:
            url: full URL to fetch, query string included.

        Returns:
            The lines of the response text, blank lines excluded.

        Raises:
            Whatever ``response.raise_for_status()`` raises for an error
            status, once the retry decorator gives up.
        """
        logger.debug("Fetching URL %s", url)

        response = self.session.get(url)

        if response.status_code not in (200, 304):
            # Log the unexpected status and URL before raising, to ease debugging
            logger.warning(
                "Unexpected HTTP status code %s for URL %s",
                response.status_code,
                response.url,
            )

        response.raise_for_status()

        # Split on line boundaries only: splitting on any whitespace would cut
        # an entry in half if its JSON ever contained a space.
        return [line for line in response.text.splitlines() if line.strip()]

    def get_single_page(
        self, since: Optional[datetime] = None
    ) -> Tuple[GolangPageType, Optional[datetime]]:
        """Return a page from the API and the timestamp of its last entry.
        Since all entries are sorted by chronological order, the timestamp is useful
        both for pagination and later for incremental runs.

        Args:
            since: when given, only entries from that instant onwards are
                requested. It must be a timezone-aware datetime with a zero
                UTC offset.

        Returns:
            A pair ``(page, since)``: the decoded entries, whose ``Timestamp``
            values have been replaced by parsed datetimes, and the timestamp
            of the last entry. When the index returns nothing, the page is
            empty and the ``since`` argument is returned unchanged.
        """
        url = f"{self.url}?limit={self.GOLANG_MODULES_INDEX_LIMIT}"
        if since is not None:
            # The Golang index does not understand `+00:00` for some reason
            # and expects the "timezone zero" notation instead. This works
            # because all times are UTC.
            utc_offset = since.utcoffset()
            assert (
                utc_offset is not None and utc_offset.total_seconds() == 0
            ), "Non-UTC datetime"
            as_date = since.isoformat().replace("+00:00", "Z")
            url = f"{url}&since={as_date}"

        entries = self.api_request(url)
        page: GolangPageType = []
        if not entries:
            return page, since

        for as_json in entries:
            entry = json.loads(as_json)
            timestamp = iso8601.parse_date(entry["Timestamp"])
            # We've already parsed it and we'll need the datetime later, save it
            entry["Timestamp"] = timestamp
            page.append(entry)
            # The index is guaranteed to be sorted in chronological order
            since = timestamp

        return page, since

    def get_pages(self) -> Iterator[GolangPageType]:
        """Yield successive pages of the index until no new entries show up.

        Each page is requested starting from the timestamp of the last entry
        of the previous one.
        """
        page, since = self.get_single_page()
        last_since = since
        while page:
            yield page
            page, since = self.get_single_page(since=since)
            if last_since == since:
                # The index returns packages whose timestamp are greater or
                # equal to the date provided as parameter, which will create
                # an infinite loop if not stopped here.
                # This is a generator: a bare `return` ends the iteration.
                return
            last_since = since

    def get_origins_from_page(self, page: GolangPageType) -> Iterator[ListedOrigin]:
        """
        Iterate on all Golang projects and yield ListedOrigin instances.
        """
        assert self.lister_obj.id is not None

        for module in page:
            path = module["Path"]
            # The loader will be expected to use the golang proxy to do the
            # actual downloading. We're using `pkg.go.dev` so that the URL points
            # to somewhere useful for a human instead of an (incomplete) API path.
            origin_url = f"https://pkg.go.dev/{path}"

            # Since the Go index lists versions and not just packages, there will
            # be duplicates. Fortunately, `ListedOrigins` are "upserted" server-side,
            # so only the last timestamp will be used, with no duplicates.
            # Performance should not be an issue as they are sent to the db in bulk.
            yield ListedOrigin(
                lister_id=self.lister_obj.id,
                url=origin_url,
                visit_type="golang",
                last_update=module["Timestamp"],
            )
+{"Path":"github.com/bmizerany/pat","Version":"v0.0.0-20170815010413-6226ea591a40","Timestamp":"2019-04-11T18:47:29.390564Z"} \ No newline at end of file diff --git a/swh/lister/golang/tests/data/page-2.txt b/swh/lister/golang/tests/data/page-2.txt new file mode 100644 index 0000000..badc2fe --- /dev/null +++ b/swh/lister/golang/tests/data/page-2.txt @@ -0,0 +1,4 @@ +{"Path":"github.com/djherbis/buffer","Version":"v1.0.0","Timestamp":"2019-04-11T18:47:29.974874Z"} +{"Path":"github.com/djherbis/nio","Version":"v2.0.3+incompatible","Timestamp":"2019-04-11T18:47:32.283312Z"} +{"Path":"github.com/gobuffalo/buffalo-plugins","Version":"v1.13.0","Timestamp":"2019-04-15T13:54:34.222985Z"} +{"Path":"github.com/markbates/refresh","Version":"v1.7.1","Timestamp":"2019-04-15T13:54:35.250835Z"} \ No newline at end of file diff --git a/swh/lister/golang/tests/data/page-3.txt b/swh/lister/golang/tests/data/page-3.txt new file mode 100644 index 0000000..37e9d96 --- /dev/null +++ b/swh/lister/golang/tests/data/page-3.txt @@ -0,0 +1,10 @@ +{"Path":"github.com/mitchellh/go-homedir","Version":"v1.1.0","Timestamp":"2019-04-15T13:54:35.678214Z"} +{"Path":"github.com/gobuffalo/packr","Version":"v1.22.0","Timestamp":"2019-04-15T13:54:35.6889Z"} +{"Path":"golang.org/x/sys","Version":"v0.0.0-20190220154126-629670e5acc5","Timestamp":"2019-04-15T13:54:37.555525Z"} +{"Path":"github.com/gobuffalo/genny","Version":"v0.0.0-20190104222617-a71664fc38e7","Timestamp":"2019-04-15T13:54:37.841547Z"} +{"Path":"github.com/blang/semver","Version":"v3.5.1+incompatible","Timestamp":"2019-04-15T13:54:39.107258Z"} +{"Path":"github.com/gobuffalo/buffalo-pop","Version":"v1.3.0","Timestamp":"2019-04-15T13:54:39.135792Z"} +{"Path":"golang.org/x/tools","Version":"v0.0.0-20190131142011-8dbcc66f33bb","Timestamp":"2019-04-15T13:54:39.250757Z"} +{"Path":"github.com/gobuffalo/clara","Version":"v0.4.1","Timestamp":"2019-04-15T13:54:40.651916Z"} 
+{"Path":"golang.org/x/tools","Version":"v0.0.0-20181213190329-bbccd8cae4a9","Timestamp":"2019-04-15T13:54:41.905064Z"} +{"Path":"github.com/pkg/errors","Version":"v0.0.0-20161002052512-839d9e913e06","Timestamp":"2019-04-18T02:07:41.336899Z"} \ No newline at end of file diff --git a/swh/lister/golang/tests/test_lister.py b/swh/lister/golang/tests/test_lister.py new file mode 100644 index 0000000..9e9096a --- /dev/null +++ b/swh/lister/golang/tests/test_lister.py @@ -0,0 +1,90 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from pathlib import Path + +import iso8601 + +from swh.lister.golang.lister import GolangLister +from swh.lister.tests.test_utils import assert_sleep_calls +from swh.lister.utils import WAIT_EXP_BASE + +# https://pkg.go.dev prefix omitted +expected_listed = [ + ("collectd.org", "2019-04-11T18:47:25.450546+00:00"), + ("github.com/blang/semver", "2019-04-15T13:54:39.107258+00:00",), + ("github.com/bmizerany/pat", "2019-04-11T18:47:29.390564+00:00",), + ("github.com/djherbis/buffer", "2019-04-11T18:47:29.974874+00:00",), + ("github.com/djherbis/nio", "2019-04-11T18:47:32.283312+00:00",), + ("github.com/gobuffalo/buffalo-plugins", "2019-04-15T13:54:34.222985+00:00",), + ("github.com/gobuffalo/buffalo-pop", "2019-04-15T13:54:39.135792+00:00",), + ("github.com/gobuffalo/clara", "2019-04-15T13:54:40.651916+00:00",), + ("github.com/gobuffalo/genny", "2019-04-15T13:54:37.841547+00:00",), + ("github.com/gobuffalo/packr", "2019-04-15T13:54:35.688900+00:00",), + ("github.com/markbates/refresh", "2019-04-15T13:54:35.250835+00:00",), + ("github.com/mitchellh/go-homedir", "2019-04-15T13:54:35.678214+00:00",), + ("github.com/nats-io/nuid", "2019-04-11T18:47:28.102348+00:00",), + ("github.com/oklog/ulid", "2019-04-11T18:47:23.234198+00:00",), + 
def _generate_responses(datadir, requests_mock):
    """Register mocked index responses built from the ``page-*.txt`` fixtures.

    For every fixture file, a 429 and a 500 response are queued before the
    200 response carrying the file's content. The whole list is registered
    for GET requests on the Golang index URL.

    Args:
        datadir: directory containing the ``page-*.txt`` fixture files.
        requests_mock: the ``requests_mock`` fixture to register responses on.
    """
    responses = []
    # glob() yields entries in a filesystem-dependent order; sort by file name
    # so the pages are always served in the same sequence.
    for file in sorted(Path(datadir).glob("page-*.txt")):
        # Test that throttling and server errors are retried
        responses.append({"text": "", "status_code": 429})
        responses.append({"text": "", "status_code": 500})
        # Also test that the lister appropriately gets out of the infinite loop
        responses.append({"text": file.read_text(), "status_code": 200})

    requests_mock.get(GolangLister.GOLANG_MODULES_INDEX_URL, responses)
def test_golang_full_listing_task(
    swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker
):
    """The full-listing celery task runs the lister and returns its stats."""
    expected_stats = ListerStats(pages=1, origins=28000)

    # Replace the lister class seen by the tasks module; `from_configfile`
    # hands back the same mock so that `run` can be observed on it.
    mocked_lister = mocker.patch("swh.lister.golang.tasks.GolangLister")
    mocked_lister.from_configfile.return_value = mocked_lister
    mocked_lister.run.return_value = expected_stats

    task_result = swh_scheduler_celery_app.send_task(
        "swh.lister.golang.tasks.FullGolangLister"
    )
    assert task_result
    task_result.wait()
    assert task_result.successful()
    assert task_result.result == expected_stats.dict()

    mocked_lister.from_configfile.assert_called_once_with()
    mocked_lister.run.assert_called_once_with()