diff --git a/swh/foo/bar.py b/conftest.py rename from swh/foo/bar.py rename to conftest.py --- a/swh/foo/bar.py +++ b/conftest.py @@ -1,4 +1,9 @@ -# Copyright (C) 2019 The Software Heritage developers +# Copyright (C) 2019-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information + +pytest_plugins = [ + "swh.storage.pytest_plugin", + "swh.loader.pytest_plugin", +] diff --git a/docs/README.rst b/docs/README.rst --- a/docs/README.rst +++ b/docs/README.rst @@ -1,4 +1,5 @@ -Software Heritage - Python module template -========================================== +Software Heritage - Metadata Fetchers +===================================== -Python module template, used as skeleton to create new modules. +This package hooks into the loaders, and loads extrinsic metadata at the same time +code artifacts are retrieved. diff --git a/docs/index.rst b/docs/index.rst --- a/docs/index.rst +++ b/docs/index.rst @@ -1,4 +1,4 @@ -.. _swh-py-template: +.. _swh-loader-metadata: .. include:: README.rst diff --git a/mypy.ini b/mypy.ini --- a/mypy.ini +++ b/mypy.ini @@ -5,6 +5,9 @@ # 3rd party libraries without stubs (yet) +[mypy-iso8601.*] +ignore_missing_imports = True + [mypy-pkg_resources.*] ignore_missing_imports = True diff --git a/requirements-swh.txt b/requirements-swh.txt --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,2 +1,4 @@ -# Add here internal Software Heritage dependencies, one per line. 
swh.core[http] >= 0.3 # [http] is required by swh.core.pytest_plugin +swh.lister >= 2.9.0 +swh.loader.core >= 3.1.0 +swh.storage >= 0.29.0 diff --git a/requirements-test.txt b/requirements-test.txt --- a/requirements-test.txt +++ b/requirements-test.txt @@ -1 +1,7 @@ pytest +pytest-mock +requests_mock +types-requests +types-PyYAML + +swh-storage[testing] diff --git a/setup.py b/setup.py --- a/setup.py +++ b/setup.py @@ -36,18 +36,15 @@ return requirements -# Edit this part to match your module, replace foo by its name -# Full sample: -# https://forge.softwareheritage.org/diffusion/DCORE/browse/master/setup.py setup( - name="swh.foo", # example: swh.loader.pypi - description="Software Heritage ", + name="swh.loader.metadata", + description="Software Heritage Extrinsic Metadata Fetchers", long_description=long_description, long_description_content_type="text/x-rst", python_requires=">=3.7", author="Software Heritage developers", author_email="swh-devel@inria.fr", - url="https://forge.softwareheritage.org/diffusion/", + url="https://forge.softwareheritage.org/diffusion/swh-loader-metadata", packages=find_packages(), # packages's modules install_requires=parse_requirements(None, "swh"), tests_require=parse_requirements("test"), @@ -56,8 +53,8 @@ extras_require={"testing": parse_requirements("test")}, include_package_data=True, entry_points=""" - [swh.cli.subcommands] - foo=swh.foo.cli + [swh.loader.metadata] + github=swh.loader.metadata.github:GitHubMetadataFetcher """, classifiers=[ "Programming Language :: Python :: 3", @@ -69,7 +66,7 @@ project_urls={ "Bug Reports": "https://forge.softwareheritage.org/maniphest", "Funding": "https://www.softwareheritage.org/donate", - "Source": "https://forge.softwareheritage.org/source/swh-", - "Documentation": "https://docs.softwareheritage.org/devel/swh-/", + "Source": "https://forge.softwareheritage.org/source/swh-loader-metadata", + "Documentation": "https://docs.softwareheritage.org/devel/swh-loader-metadata/", }, ) diff --git 
a/swh/foo/cli.py b/swh/foo/cli.py deleted file mode 100644 --- a/swh/foo/cli.py +++ /dev/null @@ -1,18 +0,0 @@ -import click - -from swh.core.cli import CONTEXT_SETTINGS -from swh.core.cli import swh as swh_cli_group - - -@swh_cli_group.group(name="foo", context_settings=CONTEXT_SETTINGS) -@click.pass_context -def foo_cli_group(ctx): - """Foo main command.""" - - -@foo_cli_group.command() -@click.option("--bar", help="Something") -@click.pass_context -def bar(ctx, bar): - """Do something.""" - click.echo("bar") diff --git a/swh/foo/tests/__init__.py b/swh/foo/tests/__init__.py deleted file mode 100644 diff --git a/swh/foo/tests/test_nothing.py b/swh/foo/tests/test_nothing.py deleted file mode 100644 --- a/swh/foo/tests/test_nothing.py +++ /dev/null @@ -1,3 +0,0 @@ -def test_nothing(): - # Placeholder; remove this when we add actual tests - pass diff --git a/swh/loader/__init__.py b/swh/loader/__init__.py new file mode 100644 --- /dev/null +++ b/swh/loader/__init__.py @@ -0,0 +1,3 @@ +from pkgutil import extend_path + +__path__ = extend_path(__path__, __name__) diff --git a/swh/loader/metadata/__init__.py b/swh/loader/metadata/__init__.py new file mode 100644 --- /dev/null +++ b/swh/loader/metadata/__init__.py @@ -0,0 +1,15 @@ +# Copyright (C) 2019-2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import pkg_resources + +try: + __version__ = pkg_resources.get_distribution("swh.loader.metadata").version +except pkg_resources.DistributionNotFound: + __version__ = "devel" + + +USER_AGENT_TEMPLATE = "Software Heritage Metadata Loader (%s)" +USER_AGENT = USER_AGENT_TEMPLATE % __version__ diff --git a/swh/foo/bar.py b/swh/loader/metadata/bar.py rename from swh/foo/bar.py rename to swh/loader/metadata/bar.py diff --git a/swh/loader/metadata/base.py b/swh/loader/metadata/base.py new file mode 
100644
--- /dev/null
+++ b/swh/loader/metadata/base.py
@@ -0,0 +1,192 @@
+# Copyright (C) 2020-2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+"""Base module for all metadata fetchers, which are called by the Git loader
+to get metadata from forges on origins being loaded."""
+
+import datetime
+import sys
+from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple, Type
+import urllib.parse
+
+import requests
+
+from swh.model.model import (
+    MetadataAuthority,
+    MetadataAuthorityType,
+    MetadataFetcher,
+    Origin,
+    RawExtrinsicMetadata,
+)
+
+CredentialsType = Optional[Dict[str, Dict[str, List[Dict[str, str]]]]]
+
+
+class InvalidOrigin(Exception):
+    """Raised when an origin does not belong to a fetcher's supported forges."""
+
+
+def now() -> datetime.datetime:
+    """Return the current UTC time; a separate function so tests can mock it."""
+    return datetime.datetime.now(tz=datetime.timezone.utc)
+
+
+class BaseMetadataFetcher:
+    """The base class for Software Heritage metadata fetchers
+
+    Fetchers are hooks used by loaders to retrieve extrinsic metadata from
+    forges before archiving repositories.
+
+    Each fetcher handles a specific type of forge (not VCS); each fetcher
+    class generally matches a lister class, as they use the same APIs.
+
+    Args:
+      origin: the origin to retrieve metadata from
+      credentials: This is the same format as for :class:`swh.lister.pattern.Lister`:
+        dictionary of credentials for all fetchers. The first level
+        identifies the fetcher's name, the second level the lister
+        instance. The final level is a list of dicts containing the
+        expected credentials for the given instance of that fetcher.
+      lister_name: name of the lister, used as the first-level key into
+        ``credentials``; for now it must be equal to ``FETCHER_NAME``
+      lister_instance_name: name of the lister instance, used as the
+        second-level key into ``credentials``
+    """
+
+    FETCHER_NAME: str
+    """The config-friendly name of this fetcher, used to retrieve the first
+    level of credentials."""
+
+    SUPPORTED_LISTERS: Set[str]
+    """Set of forge types this metadata fetcher supports. The type names are the same
+    as the names used by listers themselves.
+
+    Generally, fetchers have a one-to-one matching with listers, in which case
+    this is the set ``{FETCHER_NAME}``.
+    """
+
+    def __init__(
+        self,
+        origin: Origin,
+        credentials: CredentialsType,
+        lister_name: str,
+        lister_instance_name: str,
+    ):
+        # FETCHER_NAME is only annotated on this base class, never assigned, so a
+        # subclass that forgets to define it has no such attribute at all; getattr
+        # makes that case raise NotImplementedError instead of AttributeError.
+        if getattr(self, "FETCHER_NAME", None) is None:
+            raise NotImplementedError(f"{self.__class__.__name__}.FETCHER_NAME")
+        self.origin = origin
+        self._check_origin()
+        self._origin_metadata_objects: Optional[List[RawExtrinsicMetadata]] = None
+        self._session: Optional[requests.Session] = None
+
+        # Both names do not *have* to match, but they all do for now.
+        assert lister_name == self.FETCHER_NAME
+
+        self.credentials = list(
+            (credentials or {}).get(lister_name, {}).get(lister_instance_name, [])
+        )
+
+    def _make_session(self) -> requests.Session:
+        """Return a new HTTP session whose User-Agent identifies this fetcher."""
+        session = requests.Session()
+        fetcher = self._metadata_fetcher()
+        user_agent = (
+            f"Software Heritage Metadata Fetcher ({fetcher.name} {fetcher.version})"
+        )
+        session.headers["User-Agent"] = user_agent
+        return session
+
+    def session(self) -> requests.Session:
+        """Return this fetcher's HTTP session, creating it on first use."""
+        if self._session is None:
+            self._session = self._make_session()
+        return self._session
+
+    def _check_origin(self) -> None:
+        """Raise :exc:`InvalidOrigin` if the origin does not belong to the supported
+        forge types of this fetcher."""
+        raise NotImplementedError(f"{self.__class__.__name__}._check_origin")
+
+    def _get_origin_metadata_bytes(self) -> List[Tuple[str, bytes]]:
+        """Return pairs of ``(format, metadata)``, used to build
+        :class:`swh.model.model.RawExtrinsicMetadata` objects."""
+        raise NotImplementedError(
+            f"{self.__class__.__name__}._get_origin_metadata_bytes"
+        )
+
+    def _metadata_authority(self) -> MetadataAuthority:
+        """Return information about the metadata authority that issued metadata
+        we extract from the given origin"""
+        (scheme, netloc, *_) = urllib.parse.urlsplit(self.origin.url)
+
+        assert scheme and netloc, self.origin.url
+
+        # A good default that should work for most, if not all, forges
+        forge_url = urllib.parse.urlunsplit(("https", netloc, "", "", ""))
+        return MetadataAuthority(
+            url=forge_url,
+            type=MetadataAuthorityType.FORGE,
+        )
+
+    @classmethod
+    def _get_package_version(cls) -> str:
+        """Return the version of the package this fetcher is defined in."""
+        module_name = cls.__module__ or ""
+        module_name_parts = module_name.split(".")
+
+        # Iterate rootward through the package hierarchy until we find a parent of this
+        # fetcher's module with a __version__ attribute.
+        for prefix_size in range(len(module_name_parts), 0, -1):
+            package_name = ".".join(module_name_parts[0:prefix_size])
+            module = sys.modules[package_name]
+            if hasattr(module, "__version__"):
+                return module.__version__
+
+        # If this fetcher's class has no parent package with a __version__,
+        # it should implement it itself.
+        raise NotImplementedError(
+            f"Could not dynamically find the version of {module_name}."
+        )
+
+    @classmethod
+    def _metadata_fetcher(cls) -> MetadataFetcher:
+        """Return information about this metadata fetcher"""
+        return MetadataFetcher(
+            name=cls.__module__,
+            version=cls._get_package_version(),
+        )
+
+    def get_origin_metadata(self) -> List[RawExtrinsicMetadata]:
+        """Return a list of metadata objects for the given origin."""
+        if self._origin_metadata_objects is None:
+            self._origin_metadata_objects = []
+            for (format_, metadata_bytes) in self._get_origin_metadata_bytes():
+                self._origin_metadata_objects.append(
+                    RawExtrinsicMetadata(
+                        target=self.origin.swhid(),
+                        discovery_date=now(),
+                        authority=self._metadata_authority(),
+                        fetcher=self._metadata_fetcher(),
+                        format=format_,
+                        metadata=metadata_bytes,
+                    )
+                )
+
+        return self._origin_metadata_objects
+
+
+if TYPE_CHECKING:
+    # Makes mypy check that BaseMetadataFetcher follows the MetadataFetcherProtocol
+    def _f() -> None:
+        from swh.loader.core.metadata_fetchers import MetadataFetcherProtocol
+
+        base_metadata_fetcher: Type[MetadataFetcherProtocol]
+        base_metadata_fetcher = BaseMetadataFetcher
+        print(base_metadata_fetcher)
+
+    del _f
diff --git a/swh/loader/metadata/github.py b/swh/loader/metadata/github.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/metadata/github.py
@@ -0,0 +1,69 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+"""Metadata fetcher for GitHub."""
+
+import re
+from typing import List, Optional, Tuple
+import urllib.parse
+
+from swh.lister.github.utils import GitHubSession
+
+from . import USER_AGENT
+from .base import BaseMetadataFetcher, InvalidOrigin
+
+HTTP_ACCEPT = "application/vnd.github.v3+json"
+"""HTTP header sent on all API requests to GitHub."""
+
+# The format is defined by a well-understood MIME type; we might as well use that.
+METADATA_FORMAT = HTTP_ACCEPT
+"""Value of the ``format`` field of produced
+:class:`swh.model.model.RawExtrinsicMetadata` objects."""
+
+_API_URL = "https://api.github.com/repos{path}"
+
+
+class GitHubMetadataFetcher(BaseMetadataFetcher):
+    FETCHER_NAME = "github"
+    SUPPORTED_LISTERS = {"github"}
+
+    _github_session: Optional[GitHubSession] = None
+
+    def github_session(self) -> GitHubSession:  # created on first use, then reused
+        if self._github_session is None:
+            self._github_session = GitHubSession(
+                user_agent=USER_AGENT, credentials=self.credentials
+            )
+        return self._github_session
+
+    def _check_origin(self) -> None:  # accepts only https://github.com/<owner>/<repo>
+        (scheme, netloc, path, query, fragment) = urllib.parse.urlsplit(self.origin.url)
+        if netloc != "github.com":
+            # TODO: relax this check when we support self-hosted GitHub instances
+            raise InvalidOrigin(f"netloc should be 'github.com', not '{netloc}'")
+
+        if scheme != "https" or not re.fullmatch(r"/[^\s/]+/[^\s/]+", path):
+            raise InvalidOrigin(f"Unsupported github.com URL: {self.origin.url}")
+
+        if query != "" or fragment != "":
+            raise InvalidOrigin(
+                f"Unexpected end query or fragment in github.com URL: {self.origin.url}"
+            )
+
+    def _get_origin_metadata_bytes(self) -> List[Tuple[str, bytes]]:
+        path = urllib.parse.urlsplit(self.origin.url).path
+        response = self.github_session().request(_API_URL.format(path=path))
+        if response.status_code != 200:
+            # TODO: retry
+            return []
+
+        metadata_bytes = response.content
+
+        # TODO?: strip API hyperlinks from metadata_bytes to save space?
+        # They take 10KB for every repo, or 1KB when compressed by the database server.
+        # This means processing metadata_bytes and changing the format, instead of
+        # archiving verbatim, though.
+
+        return [(METADATA_FORMAT, metadata_bytes)]
diff --git a/swh/foo/py.typed b/swh/loader/metadata/py.typed
rename from swh/foo/py.typed
rename to swh/loader/metadata/py.typed
diff --git a/swh/foo/__init__.py b/swh/loader/metadata/tests/__init__.py
rename from swh/foo/__init__.py
rename to swh/loader/metadata/tests/__init__.py
diff --git a/swh/loader/metadata/tests/data/https_api.github.com/repos_octocat_Hello-World b/swh/loader/metadata/tests/data/https_api.github.com/repos_octocat_Hello-World
new file mode 100644
--- /dev/null
+++ b/swh/loader/metadata/tests/data/https_api.github.com/repos_octocat_Hello-World
@@ -0,0 +1,103 @@
+{
+  "id": 1296269,
+  "node_id": "MDEwOlJlcG9zaXRvcnkxMjk2MjY5",
+  "name": "Hello-World",
+  "full_name": "octocat/Hello-World",
+  "private": false,
+  "owner": {
+    "login": "octocat",
+    "id": 583231,
+    "node_id": "MDQ6VXNlcjU4MzIzMQ==",
+    "avatar_url": "https://avatars.githubusercontent.com/u/583231?v=4",
+    "gravatar_id": "",
+    "url": "https://api.github.com/users/octocat",
+    "html_url": "https://github.com/octocat",
+    "followers_url": "https://api.github.com/users/octocat/followers",
+    "following_url": "https://api.github.com/users/octocat/following{/other_user}",
+    "gists_url": "https://api.github.com/users/octocat/gists{/gist_id}",
+    "starred_url": "https://api.github.com/users/octocat/starred{/owner}{/repo}",
+    "subscriptions_url": "https://api.github.com/users/octocat/subscriptions",
+    "organizations_url": "https://api.github.com/users/octocat/orgs",
+    "repos_url": "https://api.github.com/users/octocat/repos",
+    "events_url": "https://api.github.com/users/octocat/events{/privacy}",
+    "received_events_url": "https://api.github.com/users/octocat/received_events",
+    "type": "User",
+    "site_admin": false
+  },
+  "html_url": "https://github.com/octocat/Hello-World",
+  "description": "My first repository on GitHub!",
+  "fork": false,
+  "url": "https://api.github.com/repos/octocat/Hello-World",
+  "forks_url":
"https://api.github.com/repos/octocat/Hello-World/forks", + "keys_url": "https://api.github.com/repos/octocat/Hello-World/keys{/key_id}", + "collaborators_url": "https://api.github.com/repos/octocat/Hello-World/collaborators{/collaborator}", + "teams_url": "https://api.github.com/repos/octocat/Hello-World/teams", + "hooks_url": "https://api.github.com/repos/octocat/Hello-World/hooks", + "issue_events_url": "https://api.github.com/repos/octocat/Hello-World/issues/events{/number}", + "events_url": "https://api.github.com/repos/octocat/Hello-World/events", + "assignees_url": "https://api.github.com/repos/octocat/Hello-World/assignees{/user}", + "branches_url": "https://api.github.com/repos/octocat/Hello-World/branches{/branch}", + "tags_url": "https://api.github.com/repos/octocat/Hello-World/tags", + "blobs_url": "https://api.github.com/repos/octocat/Hello-World/git/blobs{/sha}", + "git_tags_url": "https://api.github.com/repos/octocat/Hello-World/git/tags{/sha}", + "git_refs_url": "https://api.github.com/repos/octocat/Hello-World/git/refs{/sha}", + "trees_url": "https://api.github.com/repos/octocat/Hello-World/git/trees{/sha}", + "statuses_url": "https://api.github.com/repos/octocat/Hello-World/statuses/{sha}", + "languages_url": "https://api.github.com/repos/octocat/Hello-World/languages", + "stargazers_url": "https://api.github.com/repos/octocat/Hello-World/stargazers", + "contributors_url": "https://api.github.com/repos/octocat/Hello-World/contributors", + "subscribers_url": "https://api.github.com/repos/octocat/Hello-World/subscribers", + "subscription_url": "https://api.github.com/repos/octocat/Hello-World/subscription", + "commits_url": "https://api.github.com/repos/octocat/Hello-World/commits{/sha}", + "git_commits_url": "https://api.github.com/repos/octocat/Hello-World/git/commits{/sha}", + "comments_url": "https://api.github.com/repos/octocat/Hello-World/comments{/number}", + "issue_comment_url": 
"https://api.github.com/repos/octocat/Hello-World/issues/comments{/number}", + "contents_url": "https://api.github.com/repos/octocat/Hello-World/contents/{+path}", + "compare_url": "https://api.github.com/repos/octocat/Hello-World/compare/{base}...{head}", + "merges_url": "https://api.github.com/repos/octocat/Hello-World/merges", + "archive_url": "https://api.github.com/repos/octocat/Hello-World/{archive_format}{/ref}", + "downloads_url": "https://api.github.com/repos/octocat/Hello-World/downloads", + "issues_url": "https://api.github.com/repos/octocat/Hello-World/issues{/number}", + "pulls_url": "https://api.github.com/repos/octocat/Hello-World/pulls{/number}", + "milestones_url": "https://api.github.com/repos/octocat/Hello-World/milestones{/number}", + "notifications_url": "https://api.github.com/repos/octocat/Hello-World/notifications{?since,all,participating}", + "labels_url": "https://api.github.com/repos/octocat/Hello-World/labels{/name}", + "releases_url": "https://api.github.com/repos/octocat/Hello-World/releases{/id}", + "deployments_url": "https://api.github.com/repos/octocat/Hello-World/deployments", + "created_at": "2011-01-26T19:01:12Z", + "updated_at": "2022-04-21T05:34:00Z", + "pushed_at": "2022-04-15T15:39:28Z", + "git_url": "git://github.com/octocat/Hello-World.git", + "ssh_url": "git@github.com:octocat/Hello-World.git", + "clone_url": "https://github.com/octocat/Hello-World.git", + "svn_url": "https://github.com/octocat/Hello-World", + "homepage": "", + "size": 1, + "stargazers_count": 1844, + "watchers_count": 1844, + "language": null, + "has_issues": true, + "has_projects": true, + "has_downloads": true, + "has_wiki": true, + "has_pages": false, + "forks_count": 1729, + "mirror_url": null, + "archived": false, + "disabled": false, + "open_issues_count": 867, + "license": null, + "allow_forking": true, + "is_template": false, + "topics": [ + + ], + "visibility": "public", + "forks": 1729, + "open_issues": 867, + "watchers": 1844, + 
"default_branch": "master",
+  "temp_clone_token": null,
+  "network_count": 1729,
+  "subscribers_count": 1724
+}
diff --git a/swh/loader/metadata/tests/test_base.py b/swh/loader/metadata/tests/test_base.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/metadata/tests/test_base.py
@@ -0,0 +1,108 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import datetime
+
+import pytest
+
+from swh.loader.core.loader import BaseLoader
+from swh.loader.metadata.base import BaseMetadataFetcher
+from swh.model.model import (
+    MetadataAuthority,
+    MetadataAuthorityType,
+    MetadataFetcher,
+    Origin,
+    RawExtrinsicMetadata,
+)
+import swh.storage.exc
+
+ORIGIN = Origin(url="some-url")
+
+METADATA_AUTHORITY = MetadataAuthority(
+    type=MetadataAuthorityType.FORGE, url="http://example.org/"
+)
+REMD = RawExtrinsicMetadata(
+    target=ORIGIN.swhid(),
+    discovery_date=datetime.datetime.now(tz=datetime.timezone.utc),
+    authority=METADATA_AUTHORITY,
+    fetcher=MetadataFetcher(
+        name="test fetcher",
+        version="0.0.1",
+    ),
+    format="test-format",
+    metadata=b'{"foo": "bar"}',
+)
+
+
+class DummyLoader(BaseLoader):
+    """Minimal concrete loader for these tests: the overridden loading steps do
+    nothing and ``get_snapshot_id`` returns ``None``."""
+
+    visit_type = "git"
+
+    def __init__(self, storage, *args, **kwargs):
+        super().__init__(storage, *args, **kwargs)
+
+    def cleanup(self):
+        pass
+
+    def prepare(self, *args, **kwargs):
+        pass
+
+    def fetch_data(self):
+        pass
+
+    def get_snapshot_id(self):
+        return None
+
+    def store_data(self):
+        pass
+
+
+class DummyMetadataFetcher(BaseMetadataFetcher):  # stub: always yields [REMD]
+    SUPPORTED_LISTERS = {"fake-lister"}
+
+    def __init__(self, origin, credentials, lister_name, lister_instance_name):
+        pass  # does not call BaseMetadataFetcher.__init__
+
+    def get_origin_metadata(self):
+        return [REMD]
+
+
+def test_load(swh_storage, mocker):  # lister "fake-lister": REMD ends up in storage
+    mocker.patch(
+        "swh.loader.core.metadata_fetchers._fetchers",
+        return_value=[DummyMetadataFetcher],
+    )
+
+    loader = DummyLoader(
+        storage=swh_storage,
+        origin_url=ORIGIN.url,
+        lister_name="fake-lister",
+        lister_instance_name="",
+    )
+    loader.load()
+
+    assert swh_storage.raw_extrinsic_metadata_get(
+        ORIGIN.swhid(), METADATA_AUTHORITY
+    ).results == [REMD]
+
+
+def test_load_unknown_lister(swh_storage, mocker):  # lister "other-lister"
+    mocker.patch(
+        "swh.loader.core.metadata_fetchers._fetchers",
+        return_value=[DummyMetadataFetcher],
+    )
+
+    loader = DummyLoader(
+        storage=swh_storage,
+        origin_url=ORIGIN.url,
+        lister_name="other-lister",
+        lister_instance_name="",
+    )
+    loader.load()
+
+    with pytest.raises(swh.storage.exc.StorageArgumentException):
+        swh_storage.raw_extrinsic_metadata_get(ORIGIN.swhid(), METADATA_AUTHORITY)
diff --git a/swh/loader/metadata/tests/test_github.py b/swh/loader/metadata/tests/test_github.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/metadata/tests/test_github.py
@@ -0,0 +1,95 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import datetime
+from pathlib import Path
+from typing import Type
+
+import pkg_resources
+
+from swh.loader.core.metadata_fetchers import MetadataFetcherProtocol
+from swh.loader.metadata import __version__
+from swh.loader.metadata.github import GitHubMetadataFetcher
+from swh.model.model import (
+    MetadataAuthority,
+    MetadataAuthorityType,
+    MetadataFetcher,
+    Origin,
+    RawExtrinsicMetadata,
+)
+
+from .test_base import DummyLoader
+
+ORIGIN = Origin("https://github.com/octocat/Hello-World")
+
+METADATA_AUTHORITY = MetadataAuthority(
+    type=MetadataAuthorityType.FORGE, url="https://github.com"
+)
+
+
+def expected_metadata(dt, datadir):  # expected REMD for ORIGIN, discovered at dt
+    data_file_path = Path(datadir) / "https_api.github.com/repos_octocat_Hello-World"
+    with data_file_path.open("rb") as fd:
+        expected_metadata_bytes = fd.read()
+    return RawExtrinsicMetadata(
+        target=ORIGIN.swhid(),
+        discovery_date=dt,
+        authority=METADATA_AUTHORITY,
+        fetcher=MetadataFetcher(name="swh.loader.metadata.github", version=__version__),
+        format="application/vnd.github.v3+json",
+        metadata=expected_metadata_bytes,
+    )
+
+
+def test_type() -> None:  # static (mypy) and runtime protocol conformance
+    # check with mypy
+    fetcher_cls: Type[MetadataFetcherProtocol]
+    fetcher_cls = GitHubMetadataFetcher
+    print(fetcher_cls)
+
+    # check at runtime
+    fetcher = GitHubMetadataFetcher(
+        ORIGIN,
+        credentials=None,
+        lister_name="github",
+        lister_instance_name="",
+    )
+    assert isinstance(fetcher, MetadataFetcherProtocol)
+
+
+def test_github_metadata(datadir, requests_mock_datadir, mocker):  # fetcher alone
+    now = datetime.datetime.now(tz=datetime.timezone.utc)
+    mocker.patch("swh.loader.metadata.base.now", return_value=now)
+
+    fetcher = GitHubMetadataFetcher(
+        ORIGIN, credentials=None, lister_name="github", lister_instance_name=""
+    )
+
+    assert fetcher.get_origin_metadata() == [expected_metadata(now, datadir)]
+
+
+def test_github_metadata_from_loader(
+    swh_storage, mocker, datadir, requests_mock_datadir
+):  # same expectation, reached through DummyLoader.load()
+    # Fail early if this package is not fully installed
+    assert "github" in {
+        entry_point.name
+        for entry_point in pkg_resources.iter_entry_points("swh.loader.metadata")
+    }
+
+    now = datetime.datetime.now(tz=datetime.timezone.utc)
+    mocker.patch("swh.loader.metadata.base.now", return_value=now)
+
+    loader = DummyLoader(
+        storage=swh_storage,
+        origin_url=ORIGIN.url,
+        lister_name="github",
+        lister_instance_name="",
+    )
+    loader.load()
+
+    assert swh_storage.raw_extrinsic_metadata_get(
+        ORIGIN.swhid(), METADATA_AUTHORITY
+    ).results == [expected_metadata(now, datadir)]
diff --git a/tox.ini b/tox.ini
--- a/tox.ini
+++ b/tox.ini
@@ -6,10 +6,11 @@
   testing
 deps =
   pytest-cov
+  swh.storage[testing]
 commands =
   pytest --doctest-modules \
-
{envsitepackagesdir}/swh/foo \ - --cov={envsitepackagesdir}/swh/foo \ + {envsitepackagesdir}/swh/loader/metadata \ + --cov={envsitepackagesdir}/swh/loader/metadata \ --cov-branch {posargs} [testenv:black] @@ -31,7 +32,7 @@ extras = testing deps = - mypy==0.920 + mypy==0.942 commands = mypy swh