diff --git a/swh/indexer/metadata_dictionary/nuget.py b/swh/indexer/metadata_dictionary/nuget.py
index b2578ed..05b95d4 100644
--- a/swh/indexer/metadata_dictionary/nuget.py
+++ b/swh/indexer/metadata_dictionary/nuget.py
@@ -1,109 +1,105 @@
# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import os.path
import re
from typing import Any, Dict, List, Optional
import xmltodict
from swh.indexer.codemeta import _DATA_DIR, _read_crosstable
from swh.indexer.namespaces import SCHEMA
from swh.indexer.storage.interface import Sha1
from .base import BaseIntrinsicMapping, DictMapping, DirectoryLsEntry
# Location of the NuGet table ("nuget.csv") inside the codemeta data directory.
NUGET_TABLE_PATH = os.path.join(_DATA_DIR, "nuget.csv")
# The table is read once, when this module is imported. _read_crosstable
# returns a pair, unpacked here into CODEMETA_TERMS and NUGET_TABLE; the latter
# is indexed by source name (see NUGET_TABLE["NuGet"] in the mapping class).
with open(NUGET_TABLE_PATH) as fd:
(CODEMETA_TERMS, NUGET_TABLE) = _read_crosstable(fd)
class NuGetMapping(DictMapping, BaseIntrinsicMapping):
"""
Mapping that detects NuGet manifests (``.nuspec`` files) and translates
the metadata they contain.
"""
# Identifier of this mapping.
name = "nuget"
# "NuGet" entry of the table loaded from nuget.csv.
mapping = NUGET_TABLE["NuGet"]
# Two extra fields are mapped to absolute schema.org URIs.
# NOTE(review): these assignments write into the object held by NUGET_TABLE
# itself (no copy is taken here) - confirm that sharing is intended.
mapping["copyright"] = "http://schema.org/copyrightNotice"
mapping["language"] = "http://schema.org/inLanguage"
# NOTE(review): presumably the fields the base mapping copies over as plain
# strings - confirm against DictMapping.
string_fields = [
"description",
"version",
"projectUrl",
"name",
"tags",
"license",
"licenseUrl",
"summary",
"copyright",
"language",
]
@classmethod
def detect_metadata_files(cls, file_entries: List[DirectoryLsEntry]) -> List[Sha1]:
    """Return the sha1 of the first ``.nuspec`` entry of a directory listing.

    Args:
        file_entries: directory entries; each one is read through its
            ``name`` (bytes) and ``sha1`` keys.

    Returns:
        A one-element list with the ``sha1`` of the first entry whose name
        ends with ``.nuspec``, or an empty list when no entry matches.
    """
    nuspec_hashes = (
        candidate["sha1"]
        for candidate in file_entries
        if candidate["name"].endswith(b".nuspec")
    )
    # Only the first match is reported; iteration stops as soon as it is found.
    for sha1 in nuspec_hashes:
        return [sha1]
    return []
def translate(self, content: bytes) -> Optional[Dict[str, Any]]:
    """Translate the raw bytes of a ``.nuspec`` file.

    The document is parsed with ``xmltodict`` and the ``package/metadata``
    element is handed to ``self._translate_dict``.

    Args:
        content: raw content of the ``.nuspec`` file. It is parsed as-is, so
            the XML declaration (if any) must be the very first bytes.

    Returns:
        The result of ``self._translate_dict`` on the metadata element, or
        ``None`` (after logging a warning) when the content is not
        well-formed XML or does not have the expected structure.
    """
    # Local import: keeps this method self-contained without touching the
    # module's import block.
    from xml.parsers.expat import ExpatError

    try:
        parsed = xmltodict.parse(content)
    except ExpatError:
        # Malformed XML used to propagate as an exception; treat it like any
        # other ill-formed manifest instead.
        self.log.warning("Skipping ill-formed XML content: %s", content)
        return None

    # Walk package -> metadata. A missing element defaults to {} as before,
    # but an element that is present and is not a mapping (empty or text-only
    # element) stops the walk instead of raising AttributeError.
    d = parsed
    for element in ("package", "metadata"):
        if not isinstance(d, dict):
            break
        d = d.get(element, {})

    if not isinstance(d, dict):
        self.log.warning("Skipping ill-formed XML content: %s", content)
        return None
    return self._translate_dict(d)
def normalize_projectUrl(self, s):
    """Wrap the project URL in a JSON-LD node reference.

    Returns ``{"@id": s}`` when ``s`` is a string and ``None`` otherwise.
    """
    if not isinstance(s, str):
        return None
    return {"@id": s}
def translate_repository(self, translated_metadata, v):
    """Record the repository URL of a parsed ``repository`` element.

    Args:
        translated_metadata: output dictionary, updated in place.
        v: parsed ``repository`` element; only a dict carrying a string
            ``@url`` entry is used.

    When ``v`` qualifies, ``{"@id": <url>}`` is stored under the key that
    ``self.mapping`` gives for ``repository.url``. Any other value, including
    an element without a ``url`` attribute, is ignored.
    """
    if not isinstance(v, dict):
        return
    # .get(): an element without the attribute must be skipped, not raise
    # KeyError.
    url = v.get("@url")
    if isinstance(url, str):
        codemeta_key = self.mapping["repository.url"]
        translated_metadata[codemeta_key] = {"@id": url}
def normalize_license(self, v):
    """Turn a parsed ``license`` element into SPDX license references.

    Args:
        v: parsed ``license`` element. Only a dict whose ``@type`` is
            ``"expression"`` and whose ``#text`` is a string is handled.

    Returns:
        A list of ``{"@id": "https://spdx.org/licenses/<id>"}`` items, one per
        identifier separated by ``or`` (case-insensitive), or ``None`` when
        the element is not such an expression or when the expression uses
        ``with``, ``and`` or parentheses, which are not supported.
    """
    # .get(): a missing attribute or missing text must be skipped rather than
    # raise KeyError.
    if not isinstance(v, dict) or v.get("@type") != "expression":
        return None
    license_string = v.get("#text")
    if not isinstance(license_string, str):
        return None
    # Only flat "A or B" expressions can be expressed as a list of licenses.
    if re.search(r" with |\(|\)| and ", license_string, re.IGNORECASE):
        return None
    return [
        {"@id": "https://spdx.org/licenses/" + license_type.strip()}
        for license_type in re.split(r" or ", license_string, flags=re.IGNORECASE)
    ]
def normalize_licenseUrl(self, s):
    """Wrap the license URL in a JSON-LD node reference.

    A string ``s`` becomes ``{"@id": s}``; any other value gives ``None``.
    """
    return {"@id": s} if isinstance(s, str) else None
def normalize_authors(self, s):
    """Turn the comma-separated ``authors`` string into a list of persons.

    Args:
        s: value of the ``authors`` element; only strings are handled.

    Returns:
        ``{"@list": [...]}`` with one ``SCHEMA.Person`` node per non-empty
        name, in document order, or ``None`` when ``s`` is not a string or
        contains no name at all.
    """
    if not isinstance(s, str):
        return None
    # Drop empty chunks (trailing or doubled commas, blank value) so that no
    # person with an empty name is emitted.
    author_names = [name for name in (chunk.strip() for chunk in s.split(",")) if name]
    if not author_names:
        return None
    authors = [{"@type": SCHEMA.Person, SCHEMA.name: name} for name in author_names]
    return {"@list": authors}
def translate_releaseNotes(self, translated_metadata, s):
    """Append release notes to the translated metadata.

    Args:
        translated_metadata: output dictionary, updated in place.
        s: release notes; anything that is not a string is ignored.

    The text is appended to the list stored under
    ``http://schema.org/releaseNotes``, which is created when absent.
    """
    if not isinstance(s, str):
        return
    notes = translated_metadata.setdefault("http://schema.org/releaseNotes", [])
    notes.append(s)
def normalize_tags(self, s):
    """Split the whitespace-delimited ``tags`` string into keywords.

    Args:
        s: value of the ``tags`` element; only strings are handled.

    Returns:
        The list of tags, or ``None`` when ``s`` is not a string.
    """
    if isinstance(s, str):
        # split() without argument: repeated spaces, newlines and surrounding
        # whitespace must not produce empty or padded keywords.
        return s.split()
    return None
diff --git a/swh/indexer/tests/metadata_dictionary/test_nuget.py b/swh/indexer/tests/metadata_dictionary/test_nuget.py
index 58e01bc..f34c550 100644
--- a/swh/indexer/tests/metadata_dictionary/test_nuget.py
+++ b/swh/indexer/tests/metadata_dictionary/test_nuget.py
@@ -1,171 +1,167 @@
# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import pytest
from swh.indexer.metadata_detector import detect_metadata
from swh.indexer.metadata_dictionary import MAPPINGS
def test_compute_metadata_nuget():
# Translates one manifest through the NuGet mapping and compares the whole
# result with the expected document.
# NOTE(review): the payload below shows no XML markup in this view - confirm
# against the repository that the literal is intact.
- raw_content = b"""
-
+ raw_content = b"""
sample
1.2.3
Kim Abercrombie, Franck Halmaert
Sample exists only to show a sample .nuspec file.
Summary is being deprecated. Use description instead.
http://example.org/
MIT
https://raw.github.com/timrwood/moment/master/LICENSE
See the [changelog](https://github.com/httpie/httpie/releases/tag/3.2.0).
python3 java cpp search-tag
"""
result = MAPPINGS["NuGetMapping"]().translate(raw_content)
# The two description entries and the two license entries each come from two
# separate lines of the payload above.
# NOTE(review): the codeRepository value does not appear in the visible
# payload; presumably it was carried by an XML attribute - verify.
expected = {
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"type": "SoftwareSourceCode",
"author": [
{"type": "Person", "name": "Kim Abercrombie"},
{"type": "Person", "name": "Franck Halmaert"},
],
"codeRepository": "https://github.com/NuGet/NuGet.Client.git",
"description": [
"Sample exists only to show a sample .nuspec file.",
"Summary is being deprecated. Use description instead.",
],
"license": [
"https://spdx.org/licenses/MIT",
"https://raw.github.com/timrwood/moment/master/LICENSE",
],
"url": "http://example.org/",
"version": "1.2.3",
"schema:releaseNotes": (
"See the [changelog](https://github.com/httpie/httpie/releases/tag/3.2.0)."
),
"keywords": [
"python3",
"java",
"cpp",
"search-tag",
],
}
assert result == expected
@pytest.mark.parametrize(
    "filename",
    [b"package_name.nuspec", b"number_5.nuspec", b"CAPS.nuspec", b"\x8anan.nuspec"],
)
def test_detect_metadata_package_nuspec(filename):
    """A ``.nuspec`` file is detected whatever precedes the extension.

    The listing holds an unrelated JSON file and the parametrized ``.nuspec``
    file; only the sha1 of the latter must be reported for the NuGet mapping.
    """
    # Fields that are the same for both directory entries.
    shared_fields = {
        "status": "visible",
        "type": "file",
        "perms": 33188,
        "dir_id": b"dir_a",
    }
    unrelated_entry = {
        **shared_fields,
        "sha1_git": b"abc",
        "name": b"example.json",
        "target": b"abc",
        "length": 897,
        "sha1": b"bcd",
    }
    nuspec_entry = {
        **shared_fields,
        "sha1_git": b"aab",
        "name": filename,
        "target": b"aab",
        "length": 712,
        "sha1": b"cde",
    }

    detected = detect_metadata([unrelated_entry, nuspec_entry])

    assert {"NuGetMapping": [b"cde"]} == detected
def test_normalize_license_multiple_licenses_or_delimiter():
    """Identifiers joined by ``or`` are expected to yield one SPDX URL each."""
    # NOTE(review): the payload shows no XML markup in this view - confirm
    # against the repository that the literal is intact.
    # Single assignment: the former "raw_content = raw_content = ..." chain
    # bound the same name twice.
    raw_content = b"""
BitTorrent-1.0 or GPL-3.0-with-GCC-exception
"""
    result = MAPPINGS["NuGetMapping"]().translate(raw_content)
    expected = {
        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
        "type": "SoftwareSourceCode",
        "license": [
            "https://spdx.org/licenses/BitTorrent-1.0",
            "https://spdx.org/licenses/GPL-3.0-with-GCC-exception",
        ],
    }
    assert result == expected
def test_normalize_license_unsupported_delimiter():
    """A parenthesised license expression is expected to yield no license."""
    # NOTE(review): the payload shows no XML markup in this view - confirm
    # against the repository that the literal is intact.
    # Single assignment: the former "raw_content = raw_content = ..." chain
    # bound the same name twice.
    raw_content = b"""
(MIT)
"""
    result = MAPPINGS["NuGetMapping"]().translate(raw_content)
    expected = {
        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
        "type": "SoftwareSourceCode",
    }
    assert result == expected
def test_copyrightNotice_absolute_uri_property():
    """Copyright and language are expected under their ``schema:`` names."""
    # NOTE(review): the payload shows no XML markup in this view - confirm
    # against the repository that the literal is intact.
    # Single assignment: the former "raw_content = raw_content = ..." chain
    # bound the same name twice.
    raw_content = b"""
Copyright 2017-2022
en-us
"""
    result = MAPPINGS["NuGetMapping"]().translate(raw_content)
    expected = {
        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
        "type": "SoftwareSourceCode",
        "schema:copyrightNotice": "Copyright 2017-2022",
        "schema:inLanguage": "en-us",
    }
    assert result == expected