diff --git a/swh/indexer/data/nuget.csv b/swh/indexer/data/nuget.csv new file mode 100644 --- /dev/null +++ b/swh/indexer/data/nuget.csv @@ -0,0 +1,67 @@ +Property,NuGet +codeRepository,repository.url +programmingLanguage, +runtimePlatform, +targetProduct, +applicationCategory, +applicationSubCategory, +downloadUrl, +fileSize, +installUrl, +memoryRequirements, +operatingSystem, +permissions, +processorRequirements, +releaseNotes, +softwareHelp, +softwareRequirements, +softwareVersion, +storageRequirements, +supportingData, +author,authors +citation, +contributor, +copyrightHolder, +copyrightYear, +dateCreated, +dateModified, +datePublished, +editor, +encoding, +fileFormat, +funder, +keywords, +license,license +producer, +provider, +publisher, +sponsor, +version,version +isAccessibleForFree, +isPartOf, +hasPart, +position, +description,description +identifier, +name,name +sameAs, +url,projectUrl +relatedLink, +givenName, +familyName, +email, +affiliation, +identifier,id +name, +address, +type, +id, +softwareSuggestions, +maintainer, +contIntegration, +buildInstructions, +developmentStatus, +embargoDate, +funding, +issueTracker, +referencePublication, diff --git a/swh/indexer/metadata_dictionary/__init__.py b/swh/indexer/metadata_dictionary/__init__.py --- a/swh/indexer/metadata_dictionary/__init__.py +++ b/swh/indexer/metadata_dictionary/__init__.py @@ -8,7 +8,7 @@ import click -from . import cff, codemeta, composer, dart, github, maven, npm, python, ruby +from . 
class NuGetMapping(DictMapping, SingleFileIntrinsicMapping):
    """Mapping and translation of NuGet (``.nuspec``) manifests to CodeMeta.

    The crosswalk between ``.nuspec`` fields and CodeMeta terms is read from
    the ``NuGet`` column of ``nuget.csv``.  The ``normalize_<field>`` and
    ``translate_<field>`` methods below are per-field hooks; they are
    presumably dispatched by ``DictMapping._translate_dict`` based on the
    field name (confirm against the base class, which is not shown here).
    Every hook returns ``None`` (or leaves the output untouched) when the
    value does not have the expected shape, so a single odd field never
    aborts the translation of the whole manifest.
    """

    name = "nuget"
    mapping = NUGET_TABLE["NuGet"]
    string_fields = [
        "description",
        "version",
        "projectUrl",
        "name",
    ]

    @classmethod
    def detect_metadata_files(cls, file_entries: List[DirectoryLsEntry]) -> List[Sha1]:
        """Return the sha1 of the first entry named ``*.nuspec``.

        Args:
            file_entries: directory listing entries; each one is read through
                its ``name`` (bytes) and ``sha1`` keys.

        Returns:
            A one-element list with the sha1 of the first matching entry, or
            an empty list when no entry name ends with ``.nuspec``.
        """
        for entry in file_entries:
            if entry["name"].endswith(b".nuspec"):
                return [entry["sha1"]]
        return []

    def translate(self, content: bytes) -> Optional[Dict[str, Any]]:
        """Translate the raw bytes of a ``.nuspec`` file.

        Args:
            content: raw content of the manifest.

        Returns:
            The result of ``_translate_dict`` applied to the
            ``<package><metadata>`` element, or ``None`` when the content is
            not well-formed XML or does not have the expected structure.
        """
        # Local import: keeps this class self-contained without touching the
        # module-level import block.
        from xml.parsers.expat import ExpatError

        try:
            document = xmltodict.parse(content.strip(b" \n "))
        except ExpatError:
            # Malformed (or empty) XML must not crash the indexer.
            self.log.warning("Skipping ill-formed XML content: %s", content)
            return None

        # xmltodict maps an empty element (e.g. ``<package/>``) to None and a
        # text-only element to a str, so every level has to be checked before
        # calling ``.get`` on it.
        package = document.get("package", {})
        if not isinstance(package, dict):
            self.log.warning("Skipping ill-formed XML content: %s", content)
            return None

        metadata = package.get("metadata", {})
        if not isinstance(metadata, dict):
            self.log.warning("Skipping ill-formed XML content: %s", content)
            return None

        return self._translate_dict(metadata)

    def normalize_projectUrl(self, s):
        """Wrap the project URL in a JSON-LD ``@id`` node.

        Returns ``None`` when ``s`` is not a string.
        """
        if isinstance(s, str):
            return {"@id": s}
        return None

    def translate_repository(self, translated_metadata, v):
        """Store the ``url`` attribute of ``<repository>`` in the output.

        ``translated_metadata`` is mutated in place, under the CodeMeta key
        mapped from ``repository.url``.  Nothing is written when the element
        has no string ``url`` attribute (xmltodict exposes attributes with an
        ``@`` prefix).
        """
        if not isinstance(v, dict):
            return
        url = v.get("@url")  # the attribute is optional: do not index blindly
        if isinstance(url, str):
            codemeta_key = self.mapping["repository.url"]
            translated_metadata[codemeta_key] = {"@id": url}

    def normalize_license(self, v):
        """Turn ``<license type="expression">X</license>`` into an SPDX URL.

        Returns ``None`` for any other shape: a plain-text license, a
        different ``type`` attribute, a missing ``type``, or an empty
        expression.
        """
        if not isinstance(v, dict) or v.get("@type") != "expression":
            return None
        expression = v.get("#text")
        if not isinstance(expression, str) or not expression.strip():
            return None
        # NOTE(review): compound SPDX expressions ("MIT OR Apache-2.0") are
        # concatenated verbatim, as before; they do not yield a valid URL.
        return {"@id": "https://spdx.org/licenses/" + expression}

    def normalize_authors(self, s):
        """Split a comma-separated author string into ``Person`` nodes.

        Empty names (trailing or doubled commas, blank input) are dropped.
        Returns ``None`` when ``s`` is not a string or contains no name.
        """
        if not isinstance(s, str):
            return None
        stripped_names = (chunk.strip() for chunk in s.split(","))
        authors = [
            {"@type": SCHEMA_URI + "Person", SCHEMA_URI + "name": author_name}
            for author_name in stripped_names
            if author_name
        ]
        if not authors:
            return None
        return {"@list": authors}
@pytest.mark.parametrize(
    "filename",
    [b"package_name.nuspec", b"number_5.nuspec", b"CAPS.nuspec", b"\x8anan.nuspec"],
)
def test_detect_metadata_package_nuspec(filename):
    """A ``*.nuspec`` entry is attributed to NuGetMapping; other files are not."""
    # Fields that are identical for both directory entries.
    common_fields = {
        "status": "visible",
        "type": "file",
        "perms": 33188,
        "dir_id": b"dir_a",
    }
    unrelated_entry = dict(
        common_fields,
        sha1_git=b"abc",
        name=b"example.json",
        target=b"abc",
        length=897,
        sha1=b"bcd",
    )
    nuspec_entry = dict(
        common_fields,
        sha1_git=b"aab",
        name=filename,
        target=b"aab",
        length=712,
        sha1=b"cde",
    )

    detected = detect_metadata([unrelated_entry, nuspec_entry])

    assert {"NuGetMapping": [b"cde"]} == detected