diff --git a/setup.py b/setup.py --- a/setup.py +++ b/setup.py @@ -54,6 +54,7 @@ include_package_data=True, entry_points=""" [swh.loader.metadata] + gitea=swh.loader.metadata.gitea:GiteaMetadataFetcher github=swh.loader.metadata.github:GitHubMetadataFetcher """, classifiers=[ diff --git a/swh/loader/metadata/cli.py b/swh/loader/metadata/cli.py new file mode 100644 --- /dev/null +++ b/swh/loader/metadata/cli.py @@ -0,0 +1,9 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +# WARNING: do not import unnecessary things here to keep cli startup time under +# control + +from swh.loader.cli import loader as loader_cli_group diff --git a/swh/loader/metadata/gitea.py b/swh/loader/metadata/gitea.py new file mode 100644 --- /dev/null +++ b/swh/loader/metadata/gitea.py @@ -0,0 +1,135 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +"""Metadata loader for Gitea and Gogs. + +While both Gitea and Gogs currently return similar formats, they are implemented +as separate classes, to avoid losing provenance information in case they diverge +without notice in the future.""" + +import json +import logging +import random +import re +from typing import List, Optional, Tuple +import urllib.parse + +import requests + +from swh.model.model import Origin + +from . 
from . import USER_AGENT
from .base import BaseMetadataFetcher, InvalidOrigin

HTTP_ACCEPT = "application/json"
"""Value of the ``Accept`` HTTP header sent on all API requests to Gitea/Gogs."""

logger = logging.getLogger(__name__)


class _BaseGiteaMetadataFetcher(BaseMetadataFetcher):
    """Shared implementation of the Gitea and Gogs metadata fetchers.

    Concrete subclasses must set ``METADATA_FORMAT`` and implement
    :meth:`on_anonymous_mode`.
    """

    _session: Optional[requests.Session] = None

    METADATA_FORMAT: str
    """Value of the ``format`` field of produced
    :class:`swh.model.model.RawExtrinsicMetadata` objects."""

    # Token sent in the ``Authorization`` header; set by :meth:`session`.
    api_token: Optional[str] = None

    def session(self) -> requests.Session:
        """Return the HTTP session used for API requests, creating it on first use.

        One of the configured credentials is picked at random and its
        ``password`` is used as API token. When there are no credentials,
        :meth:`on_anonymous_mode` is called, and may raise.

        Returns:
            a session with ``Accept``, ``User-Agent`` and, when a token is
            available, ``Authorization`` headers set.
        """
        if self._session is None:
            if self.credentials:
                cred = random.choice(self.credentials)
                username = cred.get("username")
                api_token = cred["password"]
                logger.info("Using authentication credentials from user %s", username)
            else:
                # Raises an error on Gogs, or a warning on Gitea
                self.on_anonymous_mode()
                api_token = None

            session = requests.Session()
            session.headers.update(
                {
                    "Accept": HTTP_ACCEPT,
                    "User-Agent": USER_AGENT,
                }
            )

            if api_token:
                session.headers["Authorization"] = f"token {api_token}"

            # Cache the session only once it is fully configured: if
            # on_anonymous_mode() raised above, the next call must go through
            # the same checks again instead of returning a bare session.
            self.api_token = api_token
            self._session = session

        return self._session

    def on_anonymous_mode(self) -> None:
        """Hook called by :meth:`session` when no credentials are configured.

        Raises:
            NotImplementedError: always; subclasses must override this.
        """
        raise NotImplementedError(f"{self.__class__.__name__}.on_anonymous_mode()")

    def _check_origin(self):
        """Check that the origin URL has the shape ``https://host/owner/repo``.

        A ``.git`` suffix and a single trailing slash are accepted, as
        :meth:`_api_url` strips both.

        Raises:
            InvalidOrigin: if the scheme is not ``https``, the path is not
                exactly two non-empty segments, or the URL has a query string
                or a fragment.
        """
        (scheme, netloc, path, query, fragment) = urllib.parse.urlsplit(self.origin.url)

        # fullmatch, not match: a URL with extra path segments (eg. a link to
        # a file or branch inside the repository) must be rejected here rather
        # than fail later in _api_url().
        if scheme != "https" or not re.fullmatch(r"/[^\s/]+/[^\s/]+/?", path):
            raise InvalidOrigin(f"Unsupported Gitea/Gogs URL: {self.origin.url}")

        if query != "" or fragment != "":
            raise InvalidOrigin(
                f"Unexpected end query or fragment in Gitea/Gogs URL: {self.origin.url}"
            )

    def _api_url(self) -> str:
        """Return the ``/api/v1/repos/:namespace/:project`` URL of the origin.

        Raises:
            InvalidOrigin: if the normalized path is not made of exactly two
                segments (eg. because of a percent-encoded ``/``).
        """
        (scheme, netloc, path, query, fragment) = urllib.parse.urlsplit(self.origin.url)
        path = urllib.parse.unquote(path)

        # Normalize it, so only ``:namespace/:project`` is left
        path = path.strip("/")
        if path.endswith(".git"):
            path = path[: -len(".git")]
        # Not an assert: asserts are stripped under ``python -O``, which would
        # let a malformed path through to the API.
        if path.count("/") != 1:
            raise InvalidOrigin(f"Unexpected number of / in {path}")

        api_path = f"/api/v1/repos/{path}"

        return urllib.parse.urlunsplit((scheme, netloc, api_path, "", ""))

    def _get_origin_metadata_bytes(self) -> List[Tuple[str, bytes]]:
        """Fetch the repository description from the forge's API.

        Returns:
            a single ``(METADATA_FORMAT, response body)`` pair, or an empty
            list if the API did not answer with HTTP 200.
        """
        url = self._api_url()
        response = self.session().get(url)
        if response.status_code != 200:
            # TODO: retry
            logger.warning(
                "Got HTTP status %s from %s, no metadata fetched",
                response.status_code,
                url,
            )
            return []

        metadata_bytes = response.content

        return [(self.METADATA_FORMAT, metadata_bytes)]

    def get_parent_origins(self) -> List[Origin]:
        """Return the origin this repository was forked from, if any.

        The parent is read from the ``html_url`` of the ``parent`` field of
        each metadata object in this fetcher's own format.
        """
        parents = []
        for metadata in self.get_origin_metadata():
            if metadata.format != self.METADATA_FORMAT:
                continue
            data = json.loads(metadata.metadata)
            parent = data.get("parent")
            if parent is not None:
                parents.append(Origin(url=parent["html_url"]))

        return parents


class GiteaMetadataFetcher(_BaseGiteaMetadataFetcher):
    """Metadata fetcher for Gitea; anonymous access only logs a warning."""

    FETCHER_NAME = "gitea"
    SUPPORTED_LISTERS = {"gitea"}
    METADATA_FORMAT = "gitea-repository-json"

    def on_anonymous_mode(self) -> None:
        """Warn that requests will be sent without authentication."""
        logger.warning(
            "No authentication token set in configuration, using anonymous mode"
        )


class GogsMetadataFetcher(_BaseGiteaMetadataFetcher):
    """Metadata fetcher for Gogs; refuses to run without credentials."""

    FETCHER_NAME = "gogs"
    SUPPORTED_LISTERS = {"gogs"}
    METADATA_FORMAT = "gogs-repository-json"

    def on_anonymous_mode(self) -> None:
        """Reject anonymous mode.

        Raises:
            ValueError: always.
        """
        raise ValueError("No credentials or API token provided")
+{"id":48043,"owner":{"id":48018,"login":"ForgeFed","full_name":"","email":"","avatar_url":"https://codeberg.org/avatars/c20f7a6733a6156304137566ee35ef33","language":"","is_admin":false,"last_login":"0001-01-01T00:00:00Z","created":"2022-04-30T20:13:17+02:00","restricted":false,"active":false,"prohibit_login":false,"location":"","website":"https://forgefed.org/","description":"","visibility":"public","followers_count":0,"following_count":0,"starred_repos_count":0,"username":"ForgeFed"},"name":"ForgeFed","full_name":"ForgeFed/ForgeFed","description":"ActivityPub-based forge federation protocol specification","empty":false,"private":false,"fork":false,"template":false,"parent":null,"mirror":false,"size":3680,"language":"CSS","languages_url":"https://codeberg.org/api/v1/repos/ForgeFed/ForgeFed/languages","html_url":"https://codeberg.org/ForgeFed/ForgeFed","ssh_url":"git@codeberg.org:ForgeFed/ForgeFed.git","clone_url":"https://codeberg.org/ForgeFed/ForgeFed.git","original_url":"https://notabug.org/peers/forgefed","website":"https://forgefed.org","stars_count":30,"forks_count":6,"watchers_count":11,"open_issues_count":62,"open_pr_counter":6,"release_counter":0,"default_branch":"main","archived":false,"created_at":"2022-06-13T18:54:26+02:00","updated_at":"2022-07-12T19:40:29+02:00","permissions":{"admin":false,"push":false,"pull":true},"has_issues":true,"internal_tracker":{"enable_time_tracker":true,"allow_only_contributors_to_track_time":true,"enable_issue_dependencies":true},"has_wiki":false,"has_pull_requests":true,"has_projects":true,"ignore_whitespace_conflicts":false,"allow_merge_commits":false,"allow_rebase":false,"allow_rebase_explicit":false,"allow_squash_merge":true,"default_merge_style":"squash","avatar_url":"","internal":false,"mirror_interval":"","mirror_updated":"0001-01-01T00:00:00Z","repo_transfer":null} diff --git a/swh/loader/metadata/tests/data/https_codeberg.org/api_v1_repos__ZN3val_ForgeFed 
b/swh/loader/metadata/tests/data/https_codeberg.org/api_v1_repos__ZN3val_ForgeFed new file mode 100644 --- /dev/null +++ b/swh/loader/metadata/tests/data/https_codeberg.org/api_v1_repos__ZN3val_ForgeFed @@ -0,0 +1 @@ +{"id":48904,"owner":{"id":54546,"login":"_ZN3val","full_name":"Val Lorentz","email":"_zn3val@noreply.codeberg.org","avatar_url":"https://codeberg.org/avatars/93b0a7fa6a88bf6bbd18badcbda75585","language":"","is_admin":false,"last_login":"0001-01-01T00:00:00Z","created":"2022-06-21T13:37:12+02:00","restricted":false,"active":false,"prohibit_login":false,"location":"","website":"","description":"","visibility":"public","followers_count":2,"following_count":0,"starred_repos_count":0,"username":"_ZN3val"},"name":"ForgeFed","full_name":"_ZN3val/ForgeFed","description":"ActivityPub-based forge federation protocol specification","empty":false,"private":false,"fork":true,"template":false,"parent":{"id":48043,"owner":{"id":48018,"login":"ForgeFed","full_name":"","email":"","avatar_url":"https://codeberg.org/avatars/c20f7a6733a6156304137566ee35ef33","language":"","is_admin":false,"last_login":"0001-01-01T00:00:00Z","created":"2022-04-30T20:13:17+02:00","restricted":false,"active":false,"prohibit_login":false,"location":"","website":"https://forgefed.org/","description":"","visibility":"public","followers_count":0,"following_count":0,"starred_repos_count":0,"username":"ForgeFed"},"name":"ForgeFed","full_name":"ForgeFed/ForgeFed","description":"ActivityPub-based forge federation protocol 
specification","empty":false,"private":false,"fork":false,"template":false,"parent":null,"mirror":false,"size":3680,"language":"","languages_url":"https://codeberg.org/api/v1/repos/ForgeFed/ForgeFed/languages","html_url":"https://codeberg.org/ForgeFed/ForgeFed","ssh_url":"git@codeberg.org:ForgeFed/ForgeFed.git","clone_url":"https://codeberg.org/ForgeFed/ForgeFed.git","original_url":"https://notabug.org/peers/forgefed","website":"https://forgefed.org","stars_count":30,"forks_count":6,"watchers_count":11,"open_issues_count":62,"open_pr_counter":6,"release_counter":0,"default_branch":"main","archived":false,"created_at":"2022-06-13T18:54:26+02:00","updated_at":"2022-07-12T19:40:29+02:00","permissions":{"admin":false,"push":false,"pull":true},"has_issues":true,"internal_tracker":{"enable_time_tracker":true,"allow_only_contributors_to_track_time":true,"enable_issue_dependencies":true},"has_wiki":false,"has_pull_requests":true,"has_projects":true,"ignore_whitespace_conflicts":false,"allow_merge_commits":false,"allow_rebase":false,"allow_rebase_explicit":false,"allow_squash_merge":true,"default_merge_style":"squash","avatar_url":"","internal":false,"mirror_interval":"","mirror_updated":"0001-01-01T00:00:00Z","repo_transfer":null},"mirror":false,"size":3372,"language":"CSS","languages_url":"https://codeberg.org/api/v1/repos/_ZN3val/ForgeFed/languages","html_url":"https://codeberg.org/_ZN3val/ForgeFed","ssh_url":"git@codeberg.org:_ZN3val/ForgeFed.git","clone_url":"https://codeberg.org/_ZN3val/ForgeFed.git","original_url":"","website":"","stars_count":0,"forks_count":0,"watchers_count":1,"open_issues_count":0,"open_pr_counter":0,"release_counter":0,"default_branch":"master","archived":false,"created_at":"2022-06-23T11:35:18+02:00","updated_at":"2022-07-07T15:36:36+02:00","permissions":{"admin":false,"push":false,"pull":true},"has_issues":true,"internal_tracker":{"enable_time_tracker":true,"allow_only_contributors_to_track_time":true,"enable_issue_dependencies":true},"has_wiki
":true,"has_pull_requests":true,"has_projects":true,"ignore_whitespace_conflicts":false,"allow_merge_commits":true,"allow_rebase":true,"allow_rebase_explicit":true,"allow_squash_merge":true,"default_merge_style":"merge","avatar_url":"","internal":false,"mirror_interval":"","mirror_updated":"0001-01-01T00:00:00Z","repo_transfer":null} diff --git a/swh/loader/metadata/tests/test_gitea.py b/swh/loader/metadata/tests/test_gitea.py new file mode 100644 --- /dev/null +++ b/swh/loader/metadata/tests/test_gitea.py @@ -0,0 +1,121 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import datetime +from pathlib import Path +from typing import Type + +import pkg_resources + +from swh.loader.core.metadata_fetchers import MetadataFetcherProtocol +from swh.loader.metadata import __version__ +from swh.loader.metadata.gitea import GiteaMetadataFetcher +from swh.model.model import ( + MetadataAuthority, + MetadataAuthorityType, + MetadataFetcher, + Origin, + RawExtrinsicMetadata, +) + +from .test_base import DummyLoader + +ORIGIN = Origin("https://codeberg.org/ForgeFed/ForgeFed") +FORKED_ORIGIN = Origin("https://codeberg.org/_ZN3val/ForgeFed") + +METADATA_AUTHORITY = MetadataAuthority( + type=MetadataAuthorityType.FORGE, url="https://codeberg.org" +) + + +def expected_metadata(dt, datadir): + data_file_path = Path(datadir) / "https_codeberg.org/api_v1_repos_ForgeFed_ForgeFed" + with data_file_path.open("rb") as fd: + expected_metadata_bytes = fd.read() + return RawExtrinsicMetadata( + target=ORIGIN.swhid(), + discovery_date=dt, + authority=METADATA_AUTHORITY, + fetcher=MetadataFetcher(name="swh.loader.metadata.gitea", version=__version__), + format="gitea-repository-json", + metadata=expected_metadata_bytes, + ) + + +def test_type() -> None: + # check with mypy + fetcher_cls: 
def test_type() -> None:
    """GiteaMetadataFetcher conforms to MetadataFetcherProtocol."""
    # Static conformance: mypy validates this annotated assignment
    fetcher_cls: Type[MetadataFetcherProtocol]
    fetcher_cls = GiteaMetadataFetcher
    print(fetcher_cls)

    # Runtime conformance
    gitea_fetcher = GiteaMetadataFetcher(
        ORIGIN, credentials=None, lister_name="gitea", lister_instance_name="gitea"
    )
    assert isinstance(gitea_fetcher, MetadataFetcherProtocol)


def test_gitea_metadata(datadir, requests_mock_datadir, mocker):
    """Metadata of a non-fork repository is fetched once, from its API URL."""
    frozen_now = datetime.datetime.now(tz=datetime.timezone.utc)
    mocker.patch("swh.loader.metadata.base.now", return_value=frozen_now)

    gitea_fetcher = GiteaMetadataFetcher(
        ORIGIN,
        credentials=None,
        lister_name="gitea",
        lister_instance_name="gitea",
    )
    expected = expected_metadata(frozen_now, datadir)

    assert gitea_fetcher.get_origin_metadata()[0] == expected
    assert gitea_fetcher.get_origin_metadata() == [expected]
    assert gitea_fetcher.get_parent_origins() == []

    # All the calls above must have been served by a single HTTP request,
    # sent to the repository's endpoint of the API.
    history = requests_mock_datadir.request_history
    assert len(history) == 1
    assert history[0].url == "https://codeberg.org/api/v1/repos/ForgeFed/ForgeFed"


def test_gitea_metadata_fork(datadir, requests_mock_datadir, mocker):
    """The parent of a forked repository is reported as its parent origin."""
    frozen_now = datetime.datetime.now(tz=datetime.timezone.utc)
    mocker.patch("swh.loader.metadata.base.now", return_value=frozen_now)

    fork_fetcher = GiteaMetadataFetcher(
        FORKED_ORIGIN, credentials=None, lister_name="gitea", lister_instance_name="gitea"
    )

    assert fork_fetcher.get_parent_origins() == [ORIGIN]


def test_gitea_metadata_from_loader(
    swh_storage, mocker, datadir, requests_mock_datadir
):
    """A loader run for a Gitea origin stores the fetched metadata."""
    # Fail early if this package is not fully installed
    registered_names = {
        ep.name for ep in pkg_resources.iter_entry_points("swh.loader.metadata")
    }
    assert "gitea" in registered_names

    frozen_now = datetime.datetime.now(tz=datetime.timezone.utc)
    mocker.patch("swh.loader.metadata.base.now", return_value=frozen_now)

    DummyLoader(
        storage=swh_storage,
        origin_url=ORIGIN.url,
        lister_name="gitea",
        lister_instance_name="gitea",
    ).load()

    stored = swh_storage.raw_extrinsic_metadata_get(ORIGIN.swhid(), METADATA_AUTHORITY)
    assert stored.results == [expected_metadata(frozen_now, datadir)]