# NOTE(review): reconstructed from a whitespace-collapsed git diff; this is the
# post-patch ("+") state of the two files the diff touches, one section each.

# ---------------------------------------------------------------------------
# swh/indexer/metadata_dictionary/nuget.py
# ---------------------------------------------------------------------------

# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import os.path
import re
from typing import Any, Dict, List, Optional
import xml.parsers.expat

import xmltodict

from swh.indexer.codemeta import _DATA_DIR, _read_crosstable
from swh.indexer.namespaces import SCHEMA
from swh.indexer.storage.interface import Sha1

from .base import BaseIntrinsicMapping, DictMapping, DirectoryLsEntry

NUGET_TABLE_PATH = os.path.join(_DATA_DIR, "nuget.csv")

with open(NUGET_TABLE_PATH) as fd:
    (CODEMETA_TERMS, NUGET_TABLE) = _read_crosstable(fd)


class NuGetMapping(DictMapping, BaseIntrinsicMapping):
    """Mapping and translation of NuGet (``.nuspec``) package metadata."""

    name = "nuget"
    mapping = NUGET_TABLE["NuGet"]
    # Explicit schema.org URIs for these two fields. ``mapping`` is the very
    # dict stored in NUGET_TABLE, so these assignments update the table too.
    mapping["copyright"] = "http://schema.org/copyrightNotice"
    mapping["language"] = "http://schema.org/inLanguage"
    string_fields = [
        "description",
        "version",
        "projectUrl",
        "name",
        "tags",
        "license",
        "licenseUrl",
        "summary",
        "copyright",
        "language",
    ]

    @classmethod
    def detect_metadata_files(cls, file_entries: List[DirectoryLsEntry]) -> List[Sha1]:
        """Return the sha1 of the first entry whose name ends in ``.nuspec``.

        Returns an empty list when no entry matches.
        """
        for entry in file_entries:
            if entry["name"].endswith(b".nuspec"):
                return [entry["sha1"]]
        return []

    def translate(self, content: bytes) -> Optional[Dict[str, Any]]:
        """Translate the raw bytes of a ``.nuspec`` file.

        The ``package/metadata`` element is handed to ``_translate_dict``.
        Returns ``None`` (after logging a warning) when the content is not
        well-formed XML, or when ``package`` or ``metadata`` is not a mapping.
        """
        try:
            root = xmltodict.parse(content)
        except xml.parsers.expat.ExpatError:
            self.log.warning("Skipping ill-formed XML content: %s", content)
            return None

        # ``<package/>`` parses to None and a text-only root to a str; neither
        # has ``.get``, so check the type before descending.
        package = root.get("package", {})
        d = package.get("metadata", {}) if isinstance(package, dict) else None
        if not isinstance(d, dict):
            self.log.warning("Skipping ill-formed XML content: %s", content)
            return None

        return self._translate_dict(d)

    def normalize_projectUrl(self, s):
        """Wrap a string project URL as ``{"@id": s}``; ignore other values."""
        if isinstance(s, str):
            return {"@id": s}

    def translate_repository(self, translated_metadata, v):
        """Store the ``url`` attribute of ``<repository>`` under the term
        mapped from ``repository.url``.

        Does nothing when ``v`` is not a dict or has no string ``@url``.
        """
        if not isinstance(v, dict):
            return
        url = v.get("@url")  # the attribute is optional: avoid a KeyError
        if isinstance(url, str):
            codemeta_key = self.mapping["repository.url"]
            translated_metadata[codemeta_key] = {"@id": url}

    def normalize_license(self, v):
        """Turn ``<license type="expression">`` into SPDX license URIs.

        Only expressions made of identifiers joined by ``or`` are handled.
        Anything containing `` with ``, `` and `` or parentheses yields
        ``None``, as does a non-expression or empty element.
        """
        if not isinstance(v, dict) or v.get("@type") != "expression":
            return None
        license_string = v.get("#text")
        if not isinstance(license_string, str):
            return None
        if re.search(r" with |\(|\)| and ", license_string, re.IGNORECASE):
            return None
        return [
            {"@id": "https://spdx.org/licenses/" + license_type.strip()}
            for license_type in re.split(r" or ", license_string, flags=re.IGNORECASE)
        ]

    def normalize_licenseUrl(self, s):
        """Wrap a string license URL as ``{"@id": s}``; ignore other values."""
        if isinstance(s, str):
            return {"@id": s}

    def normalize_authors(self, s):
        """Split a comma-separated author string into a ``@list`` of Persons.

        Blank names (e.g. from a trailing comma) are dropped. Returns ``None``
        when ``s`` is not a string or no name is left.
        """
        if not isinstance(s, str):
            return None
        author_names = [a.strip() for a in s.split(",")]
        authors = [
            {"@type": SCHEMA.Person, SCHEMA.name: name}
            for name in author_names
            if name
        ]
        if not authors:
            return None
        return {"@list": authors}

    def translate_releaseNotes(self, translated_metadata, s):
        """Append string release notes under ``http://schema.org/releaseNotes``."""
        if isinstance(s, str):
            translated_metadata.setdefault("http://schema.org/releaseNotes", []).append(
                s
            )

    def normalize_tags(self, s):
        """Split a whitespace-delimited tag string into a list of tags.

        Runs of whitespace count as one separator, so no empty tag is
        produced. Returns ``None`` when ``s`` is not a string or has no tag.
        """
        if not isinstance(s, str):
            return None
        return s.split() or None


# ---------------------------------------------------------------------------
# swh/indexer/tests/metadata_dictionary/test_nuget.py
# ---------------------------------------------------------------------------

# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import pytest

from swh.indexer.metadata_detector import detect_metadata
from swh.indexer.metadata_dictionary import MAPPINGS

# NOTE(review): the XML markup of every fixture below was lost when this diff
# was extracted (only the text nodes survived). Element and attribute names
# are reconstructed from those text nodes and the expected dicts; confirm them
# against the upstream test file before relying on these tests.


def test_compute_metadata_nuget():
    raw_content = b"""<?xml version="1.0" encoding="utf-8"?>
    <package>
        <metadata>
            <id>sample</id>
            <version>1.2.3</version>
            <authors>Kim Abercrombie, Franck Halmaert</authors>
            <description>Sample exists only to show a sample .nuspec file.</description>
            <summary>Summary is being deprecated. Use description instead.</summary>
            <projectUrl>http://example.org/</projectUrl>
            <repository url="https://github.com/NuGet/NuGet.Client.git"/>
            <license type="expression">MIT</license>
            <licenseUrl>https://raw.github.com/timrwood/moment/master/LICENSE</licenseUrl>
            <releaseNotes>
            See the [changelog](https://github.com/httpie/httpie/releases/tag/3.2.0).
            </releaseNotes>
            <tags>python3 java cpp search-tag</tags>
        </metadata>
    </package>"""
    result = MAPPINGS["NuGetMapping"]().translate(raw_content)
    expected = {
        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
        "type": "SoftwareSourceCode",
        "author": [
            {"type": "Person", "name": "Kim Abercrombie"},
            {"type": "Person", "name": "Franck Halmaert"},
        ],
        "codeRepository": "https://github.com/NuGet/NuGet.Client.git",
        "description": [
            "Sample exists only to show a sample .nuspec file.",
            "Summary is being deprecated. Use description instead.",
        ],
        "license": [
            "https://spdx.org/licenses/MIT",
            "https://raw.github.com/timrwood/moment/master/LICENSE",
        ],
        "url": "http://example.org/",
        "version": "1.2.3",
        "schema:releaseNotes": (
            "See the [changelog](https://github.com/httpie/httpie/releases/tag/3.2.0)."
        ),
        "keywords": [
            "python3",
            "java",
            "cpp",
            "search-tag",
        ],
    }

    assert result == expected


@pytest.mark.parametrize(
    "filename",
    [b"package_name.nuspec", b"number_5.nuspec", b"CAPS.nuspec", b"\x8anan.nuspec"],
)
def test_detect_metadata_package_nuspec(filename):
    df = [
        {
            "sha1_git": b"abc",
            "name": b"example.json",
            "target": b"abc",
            "length": 897,
            "status": "visible",
            "type": "file",
            "perms": 33188,
            "dir_id": b"dir_a",
            "sha1": b"bcd",
        },
        {
            "sha1_git": b"aab",
            "name": filename,
            "target": b"aab",
            "length": 712,
            "status": "visible",
            "type": "file",
            "perms": 33188,
            "dir_id": b"dir_a",
            "sha1": b"cde",
        },
    ]
    results = detect_metadata(df)

    expected_results = {"NuGetMapping": [b"cde"]}
    assert expected_results == results


def test_normalize_license_multiple_licenses_or_delimiter():
    raw_content = b"""<?xml version="1.0" encoding="utf-8"?>
    <package>
        <metadata>
            <license type="expression">BitTorrent-1.0 or GPL-3.0-with-GCC-exception</license>
        </metadata>
    </package>"""
    result = MAPPINGS["NuGetMapping"]().translate(raw_content)
    expected = {
        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
        "type": "SoftwareSourceCode",
        "license": [
            "https://spdx.org/licenses/BitTorrent-1.0",
            "https://spdx.org/licenses/GPL-3.0-with-GCC-exception",
        ],
    }
    assert result == expected


def test_normalize_license_unsupported_delimiter():
    raw_content = b"""<?xml version="1.0" encoding="utf-8"?>
    <package>
        <metadata>
            <license type="expression">(MIT)</license>
        </metadata>
    </package>"""
    result = MAPPINGS["NuGetMapping"]().translate(raw_content)
    expected = {
        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
        "type": "SoftwareSourceCode",
    }
    assert result == expected


def test_copyrightNotice_absolute_uri_property():
    raw_content = b"""<?xml version="1.0" encoding="utf-8"?>
    <package>
        <metadata>
            <copyright>Copyright 2017-2022</copyright>
            <language>en-us</language>
        </metadata>
    </package>"""
    result = MAPPINGS["NuGetMapping"]().translate(raw_content)
    expected = {
        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
        "type": "SoftwareSourceCode",
        "schema:copyrightNotice": "Copyright 2017-2022",
        "schema:inLanguage": "en-us",
    }
    assert result == expected