diff --git a/swh/indexer/data/nuget.csv b/swh/indexer/data/nuget.csv new file mode 100644 --- /dev/null +++ b/swh/indexer/data/nuget.csv @@ -0,0 +1,68 @@ +Property,NuGet +codeRepository,repository.url +programmingLanguage, +runtimePlatform, +targetProduct, +applicationCategory, +applicationSubCategory, +downloadUrl, +fileSize, +installUrl, +memoryRequirements, +operatingSystem, +permissions, +processorRequirements, +releaseNotes,releaseNotes +softwareHelp, +softwareRequirements, +softwareVersion, +storageRequirements, +supportingData, +author,authors +citation, +contributor, +copyrightHolder, +copyrightYear, +dateCreated, +dateModified, +datePublished, +editor, +encoding, +fileFormat, +funder, +keywords,tags +license,license/licenseUrl +producer, +provider, +publisher, +sponsor, +version,version +isAccessibleForFree, +isPartOf, +hasPart, +position, +description,description/summary +identifier, +name,name +sameAs, +url,projectUrl +relatedLink, +givenName, +familyName, +email, +affiliation, +identifier,id +name, +address, +type, +id, +softwareSuggestions, +maintainer, +contIntegration, +buildInstructions, +developmentStatus, +embargoDate, +funding, +issueTracker, +referencePublication, +readme, diff --git a/swh/indexer/metadata_dictionary/__init__.py b/swh/indexer/metadata_dictionary/__init__.py --- a/swh/indexer/metadata_dictionary/__init__.py +++ b/swh/indexer/metadata_dictionary/__init__.py @@ -8,7 +8,7 @@ import click -from . import cff, codemeta, composer, dart, github, maven, npm, python, ruby +from . 
# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import os.path
import re
from typing import Any, Dict, List, Optional
import xml.parsers.expat

import xmltodict

from swh.indexer.codemeta import _DATA_DIR, SCHEMA_URI, _read_crosstable
from swh.indexer.storage.interface import Sha1

from .base import DictMapping, DirectoryLsEntry, SingleFileIntrinsicMapping

NUGET_TABLE_PATH = os.path.join(_DATA_DIR, "nuget.csv")

with open(NUGET_TABLE_PATH) as fd:
    (CODEMETA_TERMS, NUGET_TABLE) = _read_crosstable(fd)

# License expressions using any of these constructs are not translated.
_UNSUPPORTED_LICENSE_SYNTAX = re.compile(r" with |\(|\)| and ", re.IGNORECASE)
# Separator between alternative licenses in an expression.
_LICENSE_ALTERNATIVE = re.compile(r" or ", re.IGNORECASE)


class NuGetMapping(DictMapping, SingleFileIntrinsicMapping):
    """Mapping and translation of NuGet (``.nuspec``) metadata.

    The manifest is parsed with ``xmltodict``; the content of its
    ``package``/``metadata`` element is then handed to ``_translate_dict``,
    which relies on the ``normalize_*`` and ``translate_*`` methods below.
    """

    name = "nuget"
    # Build a new dict instead of adding keys to NUGET_TABLE["NuGet"], so the
    # module-level crosswalk table is left exactly as read from the CSV file.
    mapping = {
        **NUGET_TABLE["NuGet"],
        "copyright": "http://schema.org/copyrightNotice",
        "language": "http://schema.org/inLanguage",
    }
    string_fields = [
        "description",
        "version",
        "projectUrl",
        "name",
        "tags",
        "license",
        "licenseUrl",
        "summary",
        "copyright",
        "language",
    ]

    @classmethod
    def detect_metadata_files(cls, file_entries: List[DirectoryLsEntry]) -> List[Sha1]:
        """Return the sha1 of the first entry whose name ends with
        ``.nuspec``, as a one-element list, or an empty list if none does."""
        for entry in file_entries:
            if entry["name"].endswith(b".nuspec"):
                return [entry["sha1"]]
        return []

    def translate(self, content: bytes) -> Optional[Dict[str, Any]]:
        """Translate the raw bytes of a ``.nuspec`` file.

        Returns the result of ``_translate_dict`` on the ``package/metadata``
        element, or ``None`` (after logging a warning) when the content is
        not well-formed XML or does not have the expected structure.
        """
        try:
            document = xmltodict.parse(content.strip(b" \n "))
        except xml.parsers.expat.ExpatError:
            self.log.warning("Skipping ill-formed XML content: %s", content)
            return None

        # Walk down one level at a time: an empty or text-only <package>
        # element is parsed as None or str, on which .get() would fail.
        d: Any = document
        for element_name in ("package", "metadata"):
            if not isinstance(d, dict):
                break
            d = d.get(element_name, {})

        if not isinstance(d, dict):
            self.log.warning("Skipping ill-formed XML content: %s", content)
            return None

        return self._translate_dict(d)

    def normalize_projectUrl(self, s):
        """Wrap a URL string as an ``{"@id": ...}`` node; other values are
        ignored (``None`` is returned)."""
        if isinstance(s, str):
            return {"@id": s}

    def translate_repository(self, translated_metadata, v):
        """Store the ``url`` attribute of the ``repository`` element under the
        term mapped from ``repository.url``.

        Nothing is written when the element has no attributes or no string
        ``url`` attribute.
        """
        if isinstance(v, dict) and isinstance(v.get("@url"), str):
            codemeta_key = self.mapping["repository.url"]
            translated_metadata[codemeta_key] = {"@id": v["@url"]}

    def normalize_license(self, v):
        """Translate a ``<license type="expression">`` element to SPDX URIs.

        Only plain identifiers, optionally joined by ``or``, are supported:
        each alternative becomes ``{"@id": "https://spdx.org/licenses/<id>"}``.
        Expressions containing ``with``, ``and`` or parentheses, elements of
        another type, and elements without text all yield ``None``.
        """
        if not isinstance(v, dict) or v.get("@type") != "expression":
            return None
        license_string = v.get("#text")
        if not isinstance(license_string, str):
            # e.g. an empty <license type="expression"/> element
            return None
        if _UNSUPPORTED_LICENSE_SYNTAX.search(license_string):
            return None
        return [
            {"@id": "https://spdx.org/licenses/" + license_type.strip()}
            for license_type in _LICENSE_ALTERNATIVE.split(license_string)
            if license_type.strip()
        ]

    def normalize_licenseUrl(self, s):
        """Wrap a URL string as an ``{"@id": ...}`` node; other values are
        ignored (``None`` is returned)."""
        if isinstance(s, str):
            return {"@id": s}

    def normalize_authors(self, s):
        """Turn a comma-separated string of names into an ordered list of
        ``Person`` nodes.

        Empty names (stray or trailing commas) are dropped; ``None`` is
        returned when the input is not a string or no name remains.
        """
        if not isinstance(s, str):
            return None
        author_names = [a.strip() for a in s.split(",")]
        authors = [
            {"@type": SCHEMA_URI + "Person", SCHEMA_URI + "name": name}
            for name in author_names
            if name
        ]
        if not authors:
            return None
        return {"@list": authors}

    def translate_releaseNotes(self, translated_metadata, s):
        """Append release notes given as a string to the
        ``http://schema.org/releaseNotes`` list of ``translated_metadata``."""
        if isinstance(s, str):
            translated_metadata.setdefault("http://schema.org/releaseNotes", []).append(
                s
            )

    def normalize_tags(self, s):
        """Split a whitespace-delimited tag string into a list of keywords.

        Splitting on any run of whitespace avoids producing empty keywords
        when tags are separated by several spaces or by newlines. ``None``
        is returned when the input is not a string or contains no tag.
        """
        if isinstance(s, str):
            return s.split() or None
# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import pytest

from swh.indexer.metadata_detector import detect_metadata
from swh.indexer.metadata_dictionary import MAPPINGS

# NOTE(review): the XML markup of the fixtures below was reconstructed from
# the expected results and from the element/attribute names read by
# NuGetMapping (package/metadata, repository@url, license@type="expression").
# Confirm against the upstream fixtures, which may contain additional
# elements that the mapping ignores.


def test_compute_metadata_nuget():
    """A complete manifest is translated to the expected CodeMeta document."""
    raw_content = b"""<?xml version="1.0" encoding="utf-8"?>
    <package>
        <metadata>
            <id>sample</id>
            <version>1.2.3</version>
            <authors>Kim Abercrombie, Franck Halmaert</authors>
            <description>Sample exists only to show a sample .nuspec file.</description>
            <summary>Summary is being deprecated. Use description instead.</summary>
            <projectUrl>http://example.org/</projectUrl>
            <repository url="https://github.com/NuGet/NuGet.Client.git"/>
            <license type="expression">MIT</license>
            <licenseUrl>https://raw.github.com/timrwood/moment/master/LICENSE</licenseUrl>
            <releaseNotes>
            See the [changelog](https://github.com/httpie/httpie/releases/tag/3.2.0).
            </releaseNotes>
            <tags>python3 java cpp search-tag</tags>
        </metadata>
    </package>"""
    result = MAPPINGS["NuGetMapping"]().translate(raw_content)
    expected = {
        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
        "type": "SoftwareSourceCode",
        "author": [
            {"type": "Person", "name": "Kim Abercrombie"},
            {"type": "Person", "name": "Franck Halmaert"},
        ],
        "codeRepository": "https://github.com/NuGet/NuGet.Client.git",
        "description": [
            "Sample exists only to show a sample .nuspec file.",
            "Summary is being deprecated. Use description instead.",
        ],
        "license": [
            "https://spdx.org/licenses/MIT",
            "https://raw.github.com/timrwood/moment/master/LICENSE",
        ],
        "url": "http://example.org/",
        "version": "1.2.3",
        "schema:releaseNotes": (
            "See the [changelog](https://github.com/httpie/httpie/releases/tag/3.2.0)."
        ),
        "keywords": [
            "python3",
            "java",
            "cpp",
            "search-tag",
        ],
    }

    assert result == expected


@pytest.mark.parametrize(
    "filename",
    [b"package_name.nuspec", b"number_5.nuspec", b"CAPS.nuspec", b"\x8anan.nuspec"],
)
def test_detect_metadata_package_nuspec(filename):
    """Any file name ending in ``.nuspec`` is detected as NuGet metadata."""
    df = [
        {
            "sha1_git": b"abc",
            "name": b"example.json",
            "target": b"abc",
            "length": 897,
            "status": "visible",
            "type": "file",
            "perms": 33188,
            "dir_id": b"dir_a",
            "sha1": b"bcd",
        },
        {
            "sha1_git": b"aab",
            "name": filename,
            "target": b"aab",
            "length": 712,
            "status": "visible",
            "type": "file",
            "perms": 33188,
            "dir_id": b"dir_a",
            "sha1": b"cde",
        },
    ]
    results = detect_metadata(df)

    expected_results = {"NuGetMapping": [b"cde"]}
    assert expected_results == results


def test_normalize_license_multiple_licenses_or_delimiter():
    """Alternatives joined by ``or`` each become an SPDX license URI."""
    raw_content = b"""<?xml version="1.0" encoding="utf-8"?>
    <package>
        <metadata>
            <license type="expression">BitTorrent-1.0 or GPL-3.0-with-GCC-exception</license>
        </metadata>
    </package>"""
    result = MAPPINGS["NuGetMapping"]().translate(raw_content)
    expected = {
        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
        "type": "SoftwareSourceCode",
        "license": [
            "https://spdx.org/licenses/BitTorrent-1.0",
            "https://spdx.org/licenses/GPL-3.0-with-GCC-exception",
        ],
    }

    assert result == expected


def test_normalize_license_unsupported_delimiter():
    """A parenthesised license expression is not translated."""
    raw_content = b"""<?xml version="1.0" encoding="utf-8"?>
    <package>
        <metadata>
            <license type="expression">(MIT)</license>
        </metadata>
    </package>"""
    result = MAPPINGS["NuGetMapping"]().translate(raw_content)
    expected = {
        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
        "type": "SoftwareSourceCode",
    }

    assert result == expected


def test_copyrightNotice_absolute_uri_property():
    """``copyright`` and ``language`` map to their schema.org properties."""
    raw_content = b"""<?xml version="1.0" encoding="utf-8"?>
    <package>
        <metadata>
            <copyright>Copyright 2017-2022</copyright>
            <language>en-us</language>
        </metadata>
    </package>"""
    result = MAPPINGS["NuGetMapping"]().translate(raw_content)
    expected = {
        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
        "type": "SoftwareSourceCode",
        "schema:copyrightNotice": "Copyright 2017-2022",
        "schema:inLanguage": "en-us",
    }

    assert result == expected