diff --git a/swh/indexer/metadata_dictionary/__init__.py b/swh/indexer/metadata_dictionary/__init__.py
--- a/swh/indexer/metadata_dictionary/__init__.py
+++ b/swh/indexer/metadata_dictionary/__init__.py
@@ -1,16 +1,22 @@
+# Copyright (C) 2017-2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
 import collections
 
 import click
 
-from . import cff, codemeta, maven, npm, python, ruby
+from . import cff, codemeta, github, maven, npm, python, ruby
 
 MAPPINGS = {
+    "CffMapping": cff.CffMapping,
     "CodemetaMapping": codemeta.CodemetaMapping,
+    "GemspecMapping": ruby.GemspecMapping,
+    "GitHubMapping": github.GitHubMapping,
     "MavenMapping": maven.MavenMapping,
     "NpmMapping": npm.NpmMapping,
     "PythonPkginfoMapping": python.PythonPkginfoMapping,
-    "GemspecMapping": ruby.GemspecMapping,
-    "CffMapping": cff.CffMapping,
 }
 
 
diff --git a/swh/indexer/metadata_dictionary/base.py b/swh/indexer/metadata_dictionary/base.py
--- a/swh/indexer/metadata_dictionary/base.py
+++ b/swh/indexer/metadata_dictionary/base.py
@@ -1,11 +1,11 @@
-# Copyright (C) 2017-2019 The Software Heritage developers
+# Copyright (C) 2017-2022 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import json
 import logging
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Tuple
 
 from typing_extensions import TypedDict
 
@@ -48,6 +48,14 @@
         """
         raise NotImplementedError(f"{cls.__name__}.detect_metadata_files")
 
+    @classmethod
+    def extrinsic_metadata_formats(cls) -> Tuple[str, ...]:
+        """
+        Returns the list of extrinsic metadata formats which can be translated
+        by this mapping
+        """
+        raise NotImplementedError(f"{cls.__name__}.extrinsic_metadata_formats")
+
     def translate(self, file_content: bytes) -> Optional[Dict]:
         raise NotImplementedError(f"{self.__class__.__name__}.translate")
 
@@ -56,7 +64,7 @@
 
 
 class SingleFileMapping(BaseMapping):
-    """Base class for all mappings that use a single file as input."""
+    """Base class for all intrinsic metadata mappings that use a single file as input."""
 
     @property
     def filename(self):
@@ -70,6 +78,11 @@
                 return [entry["sha1"]]
         return []
 
+    @classmethod
+    def extrinsic_metadata_formats(cls) -> Tuple[str, ...]:
+        # this class is only used by intrinsic metadata mappings
+        return ()
+
 
 class DictMapping(BaseMapping):
     """Base class for mappings that take as input a file that is mostly
@@ -94,8 +107,8 @@
             term
             for (key, term) in cls.mapping.items()
             if key in cls.string_fields
-            or hasattr(cls, "translate_" + cls._normalize_method_name(key))
             or hasattr(cls, "normalize_" + cls._normalize_method_name(key))
+            or hasattr(cls, "translate_" + cls._normalize_method_name(key))
         }
 
     def _translate_dict(
@@ -147,6 +160,7 @@
                     )
                 else:
                     translated_metadata[codemeta_key] = v
+
         if normalize:
             return self.normalize_translation(translated_metadata)
         else:
diff --git a/swh/indexer/metadata_dictionary/github.py b/swh/indexer/metadata_dictionary/github.py
new file mode 100644
--- /dev/null
+++ b/swh/indexer/metadata_dictionary/github.py
@@ -0,0 +1,56 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+import json
+from typing import List, Tuple
+
+from swh.indexer.codemeta import SCHEMA_URI
+from swh.indexer.storage.interface import Sha1
+
+from .base import DirectoryLsEntry, JsonMapping
+
+
+def _prettyprint(d):
+    """Debugging helper: dumps ``d`` to stdout as indented JSON."""
+    print(json.dumps(d, indent=4))
+
+
+class GitHubMapping(JsonMapping):
+    """Translates the JSON document describing a repository in the GitHub
+    REST API (v3) to CodeMeta."""
+
+    name = "github"
+    mapping = {
+        "name": SCHEMA_URI + "name",
+        "license": SCHEMA_URI + "license",
+    }
+    string_fields = ["name"]
+
+    @classmethod
+    def detect_metadata_files(cls, file_entries: List[DirectoryLsEntry]) -> List[Sha1]:
+        # This mapping only declares an extrinsic metadata format (see
+        # extrinsic_metadata_formats), so no file of a directory is selected.
+        return []
+
+    @classmethod
+    def extrinsic_metadata_formats(cls) -> Tuple[str, ...]:
+        return ("application/vnd.github.v3+json",)
+
+    def normalize_license(self, d):
+        """Returns the SPDX URI of the license described by ``d``, GitHub's
+        ``license`` object; or None if it does not identify an SPDX license.
+
+        >>> GitHubMapping().normalize_license({'spdx_id': 'MIT'})
+        {'@id': 'https://spdx.org/licenses/MIT'}
+        >>> GitHubMapping().normalize_license({'spdx_id': 'NOASSERTION'}) is None
+        True
+        """
+        if not isinstance(d, dict):
+            return None
+        spdx_id = d.get("spdx_id")
+        # GitHub reports licenses it cannot identify as "NOASSERTION", which is
+        # an SPDX placeholder and not a license identifier.
+        if not isinstance(spdx_id, str) or spdx_id == "NOASSERTION":
+            return None
+        return {"@id": "https://spdx.org/licenses/" + spdx_id}
diff --git a/swh/indexer/tests/metadata_dictionary/test_github.py b/swh/indexer/tests/metadata_dictionary/test_github.py
new file mode 100644
--- /dev/null
+++ b/swh/indexer/tests/metadata_dictionary/test_github.py
@@ -0,0 +1,113 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.indexer.metadata_dictionary import MAPPINGS
+
+
+def test_compute_metadata_none():
+    """
+    translating an empty content
+    should return None
+    """
+    content = b""
+
+    # None if no metadata was found or an error occurred
+    declared_metadata = None
+    result = MAPPINGS["GitHubMapping"]().translate(content)
+    assert declared_metadata == result
+
+
+def test_supported_terms():
+    terms = MAPPINGS["GitHubMapping"].supported_terms()
+    assert {"http://schema.org/name", "http://schema.org/license"} <= terms
+
+
+def test_compute_metadata_github():
+    """
+    testing translation of a repository document from the GitHub API
+    """
+    content = b"""
+{
+  "id": 80521091,
+  "node_id": "MDEwOlJlcG9zaXRvcnk4MDUyMTA5MQ==",
+  "name": "swh-indexer",
+  "full_name": "SoftwareHeritage/swh-indexer",
+  "private": false,
+  "owner": {
+    "login": "SoftwareHeritage",
+    "id": 18555939,
+    "node_id": "MDEyOk9yZ2FuaXphdGlvbjE4NTU1OTM5",
+    "avatar_url": "https://avatars.githubusercontent.com/u/18555939?v=4",
+    "gravatar_id": "",
+    "url": "https://api.github.com/users/SoftwareHeritage",
+    "type": "Organization",
+    "site_admin": false
+  },
+  "html_url": "https://github.com/SoftwareHeritage/swh-indexer",
+  "description": "GitHub mirror of Metadata indexer",
+  "fork": false,
+  "url": "https://api.github.com/repos/SoftwareHeritage/swh-indexer",
+  "created_at": "2017-01-31T13:05:39Z",
+  "updated_at": "2022-06-22T08:02:20Z",
+  "pushed_at": "2022-06-29T09:01:08Z",
+  "git_url": "git://github.com/SoftwareHeritage/swh-indexer.git",
+  "ssh_url": "git@github.com:SoftwareHeritage/swh-indexer.git",
+  "clone_url": "https://github.com/SoftwareHeritage/swh-indexer.git",
+  "svn_url": "https://github.com/SoftwareHeritage/swh-indexer",
+  "homepage": "https://forge.softwareheritage.org/source/swh-indexer/",
+  "size": 2713,
+  "stargazers_count": 13,
+  "watchers_count": 13,
+  "language": "Python",
+  "has_issues": false,
+  "has_projects": false,
+  "has_downloads": true,
+  "has_wiki": false,
+  "has_pages": false,
+  "forks_count": 1,
+  "mirror_url": null,
+  "archived": false,
+  "disabled": false,
+  "open_issues_count": 0,
+  "license": {
+    "key": "gpl-3.0",
+    "name": "GNU General Public License v3.0",
+    "spdx_id": "GPL-3.0",
+    "url": "https://api.github.com/licenses/gpl-3.0",
+    "node_id": "MDc6TGljZW5zZTk="
+  },
+  "allow_forking": true,
+  "is_template": false,
+  "web_commit_signoff_required": false,
+  "topics": [
+
+  ],
+  "visibility": "public",
+  "forks": 1,
+  "open_issues": 0,
+  "watchers": 13,
+  "default_branch": "master",
+  "temp_clone_token": null,
+  "organization": {
+    "login": "SoftwareHeritage",
+    "id": 18555939,
+    "node_id": "MDEyOk9yZ2FuaXphdGlvbjE4NTU1OTM5",
+    "avatar_url": "https://avatars.githubusercontent.com/u/18555939?v=4",
+    "gravatar_id": "",
+    "type": "Organization",
+    "site_admin": false
+  },
+  "network_count": 1,
+  "subscribers_count": 6
+}
+
+    """
+    result = MAPPINGS["GitHubMapping"]().translate(content)
+    assert result == {
+        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+        "type": "SoftwareSourceCode",
+        "license": "https://spdx.org/licenses/GPL-3.0",
+        "name": "swh-indexer",
+    }