diff --git a/swh/indexer/data/nuget.csv b/swh/indexer/data/nuget.csv
new file mode 100644
--- /dev/null
+++ b/swh/indexer/data/nuget.csv
@@ -0,0 +1,69 @@
+Property,NuGet
+codeRepository,repository.url
+programmingLanguage,
+runtimePlatform,
+targetProduct,
+applicationCategory,
+applicationSubCategory,
+downloadUrl,
+fileSize,
+installUrl,
+memoryRequirements,
+operatingSystem,
+permissions,
+processorRequirements,
+releaseNotes,
+softwareHelp,
+softwareRequirements,
+softwareVersion,
+storageRequirements,
+supportingData,
+author,authors
+citation,
+contributor,
+copyrightHolder,
+copyrightYear,
+dateCreated,
+dateModified,
+datePublished,
+editor,
+encoding,
+fileFormat,
+funder,
+keywords,
+license,license
+producer,
+provider,
+publisher,
+sponsor,
+version,version
+isAccessibleForFree,
+isPartOf,
+hasPart,
+position,
+description,description
+identifier,
+name,name
+sameAs,
+url,projectUrl
+relatedLink,
+givenName,
+familyName,
+email,
+affiliation,
+identifier,id
+name,
+address,
+type,
+id,
+softwareSuggestions,
+maintainer,
+contIntegration,
+buildInstructions,
+developmentStatus,
+embargoDate,
+funding,
+issueTracker,
+referencePublication,
+readme,readme
+language,language
diff --git a/swh/indexer/metadata_dictionary/__init__.py b/swh/indexer/metadata_dictionary/__init__.py
--- a/swh/indexer/metadata_dictionary/__init__.py
+++ b/swh/indexer/metadata_dictionary/__init__.py
@@ -8,7 +8,7 @@
import click
-from . import cff, codemeta, composer, dart, github, maven, npm, python, ruby
+from . import cff, codemeta, composer, dart, github, maven, npm, nuget, python, ruby
from .base import BaseExtrinsicMapping, BaseIntrinsicMapping, BaseMapping
INTRINSIC_MAPPINGS: Dict[str, Type[BaseIntrinsicMapping]] = {
@@ -20,6 +20,7 @@
"PubMapping": dart.PubspecMapping,
"PythonPkginfoMapping": python.PythonPkginfoMapping,
"ComposerMapping": composer.ComposerMapping,
+ "NuGetMapping": nuget.NuGetMapping,
}
EXTRINSIC_MAPPINGS: Dict[str, Type[BaseExtrinsicMapping]] = {
diff --git a/swh/indexer/metadata_dictionary/base.py b/swh/indexer/metadata_dictionary/base.py
--- a/swh/indexer/metadata_dictionary/base.py
+++ b/swh/indexer/metadata_dictionary/base.py
@@ -6,8 +6,10 @@
import json
import logging
from typing import Any, Callable, Dict, List, Optional, Tuple, TypeVar
+import xml.parsers.expat
from typing_extensions import TypedDict
+import xmltodict
import yaml
from swh.indexer.codemeta import SCHEMA_URI, compact, merge_values
@@ -268,3 +270,115 @@
return self._translate_dict(content_dict)
return None
+
+
+class XmlMapping(DictMapping, SingleFileIntrinsicMapping):
+ # Base mapping for XML metadata files: the document is parsed with
+ # xmltodict, translated through the crosswalk table, and then completed
+ # with the repository and license entries computed by the helpers below.
+ def translate(self, content: bytes) -> Optional[Dict[str, Any]]:
+ # Returns the normalized translation of `content`, or None when the
+ # document cannot be parsed or is not shaped as expected.
+ try:
+ # Only the content of the root <project> element is used; a document
+ # with any other root yields an empty dict.
+ # NOTE(review): <project> is the Maven POM root. A .nuspec file
+ # presumably uses a different root element, in which case
+ # NuGetMapping would always translate {} -- confirm.
+ d = xmltodict.parse(content).get("project") or {}
+ except xml.parsers.expat.ExpatError:
+ self.log.warning("Error parsing XML from %s", self.log_suffix)
+ return None
+ except UnicodeDecodeError:
+ self.log.warning("Error unidecoding XML from %s", self.log_suffix)
+ return None
+ except (LookupError, ValueError):
+ # unknown encoding or multi-byte encoding
+ self.log.warning("Error detecting XML encoding from %s", self.log_suffix)
+ return None
+ if not isinstance(d, dict):
+ # Note that the whole raw document is written to the log here.
+ self.log.warning("Skipping ill-formed XML content: %s", content)
+ return None
+ # Normalization is deferred (normalize=False) so that the two entries
+ # added below go through normalize_translation with the rest.
+ metadata = self._translate_dict(d, normalize=False)
+ metadata[SCHEMA_URI + "codeRepository"] = self.parse_repositories(d)
+ metadata[SCHEMA_URI + "license"] = self.parse_licenses(d)
+ return self.normalize_translation(metadata)
+
+ def parse_repositories(self, d):
+ """https://maven.apache.org/pom.html#Repositories
+
+ >>> import xmltodict
+ >>> from pprint import pprint
+ >>> d = xmltodict.parse('''
+ ... <repositories>
+ ...   <repository>
+ ...     <id>codehausSnapshots</id>
+ ...     <name>Codehaus Snapshots</name>
+ ...     <url>http://snapshots.maven.codehaus.org/maven2</url>
+ ...     <layout>default</layout>
+ ...   </repository>
+ ... </repositories>
+ ... ''')
+ >>> MavenMapping().parse_repositories(d)
+ """
+ # NOTE(review): self.parse_repository and self._default_repository are
+ # not defined in this class; MavenMapping supplies parse_repository.
+ # A subclass lacking either one would presumably fail here -- confirm.
+ repositories = d.get("repositories")
+ if not repositories:
+ # No <repositories> element: fall back to the default repository.
+ results = [self.parse_repository(d, self._default_repository)]
+ elif isinstance(repositories, dict):
+ repositories = repositories.get("repository") or []
+ # A single <repository> child is not a list; wrap it so both cases
+ # are handled by the same comprehension.
+ if not isinstance(repositories, list):
+ repositories = [repositories]
+ results = [self.parse_repository(d, repo) for repo in repositories]
+ else:
+ results = []
+ # Drop falsy results; return None rather than an empty list.
+ return [res for res in results if res] or None
+
+ def parse_licenses(self, d):
+ """https://maven.apache.org/pom.html#Licenses
+
+ >>> import xmltodict
+ >>> import json
+ >>> d = xmltodict.parse('''
+ ... <licenses>
+ ...   <license>
+ ...     <name>Apache License, Version 2.0</name>
+ ...     <url>https://www.apache.org/licenses/LICENSE-2.0.txt</url>
+ ...   </license>
+ ... </licenses>
+ ... ''')
+ >>> print(json.dumps(d, indent=4))
+ {
+ "licenses": {
+ "license": {
+ "name": "Apache License, Version 2.0",
+ "url": "https://www.apache.org/licenses/LICENSE-2.0.txt"
+ }
+ }
+ }
+ >>> MavenMapping().parse_licenses(d)
+ [{'@id': 'https://www.apache.org/licenses/LICENSE-2.0.txt'}]
+
+ or, if there are more than one license:
+
+ >>> import xmltodict
+ >>> from pprint import pprint
+ >>> d = xmltodict.parse('''
+ ... <licenses>
+ ...   <license>
+ ...     <name>Apache License, Version 2.0</name>
+ ...     <url>https://www.apache.org/licenses/LICENSE-2.0.txt</url>
+ ...   </license>
+ ...   <license>
+ ...     <name>MIT License</name>
+ ...     <url>https://opensource.org/licenses/MIT</url>
+ ...   </license>
+ ... </licenses>
+ ... ''')
+ >>> pprint(MavenMapping().parse_licenses(d))
+ [{'@id': 'https://www.apache.org/licenses/LICENSE-2.0.txt'},
+ {'@id': 'https://opensource.org/licenses/MIT'}]
+ """
+
+ licenses = d.get("licenses")
+ if not isinstance(licenses, dict):
+ return
+ licenses = licenses.get("license")
+ # A single <license> is parsed as a dict (see the first example above);
+ # wrap it so one and many licenses are handled alike.
+ if isinstance(licenses, dict):
+ licenses = [licenses]
+ elif not isinstance(licenses, list):
+ return
+ # Keep only entries that carry a string <url>; None if none remain.
+ return [
+ {"@id": license["url"]}
+ for license in licenses
+ if isinstance(license, dict) and isinstance(license.get("url"), str)
+ ] or None
diff --git a/swh/indexer/metadata_dictionary/maven.py b/swh/indexer/metadata_dictionary/maven.py
--- a/swh/indexer/metadata_dictionary/maven.py
+++ b/swh/indexer/metadata_dictionary/maven.py
@@ -3,18 +3,15 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-import os
-from typing import Any, Dict, Optional
-import xml.parsers.expat
-import xmltodict
+import os
-from swh.indexer.codemeta import CROSSWALK_TABLE, SCHEMA_URI
+from swh.indexer.codemeta import CROSSWALK_TABLE
-from .base import DictMapping, SingleFileIntrinsicMapping
+from .base import XmlMapping
-class MavenMapping(DictMapping, SingleFileIntrinsicMapping):
+class MavenMapping(XmlMapping):
"""
dedicated class for Maven (pom.xml) mapping and translation
"""
@@ -23,58 +20,16 @@
filename = b"pom.xml"
mapping = CROSSWALK_TABLE["Java (Maven)"]
string_fields = ["name", "version", "description", "email"]
-
- def translate(self, content: bytes) -> Optional[Dict[str, Any]]:
- try:
- d = xmltodict.parse(content).get("project") or {}
- except xml.parsers.expat.ExpatError:
- self.log.warning("Error parsing XML from %s", self.log_suffix)
- return None
- except UnicodeDecodeError:
- self.log.warning("Error unidecoding XML from %s", self.log_suffix)
- return None
- except (LookupError, ValueError):
- # unknown encoding or multi-byte encoding
- self.log.warning("Error detecting XML encoding from %s", self.log_suffix)
- return None
- if not isinstance(d, dict):
- self.log.warning("Skipping ill-formed XML content: %s", content)
- return None
- metadata = self._translate_dict(d, normalize=False)
- metadata[SCHEMA_URI + "codeRepository"] = self.parse_repositories(d)
- metadata[SCHEMA_URI + "license"] = self.parse_licenses(d)
- return self.normalize_translation(metadata)
-
_default_repository = {"url": "https://repo.maven.apache.org/maven2/"}
- def parse_repositories(self, d):
- """https://maven.apache.org/pom.html#Repositories
+ def normalize_groupId(self, id_):
+ """https://maven.apache.org/pom.html#Maven_Coordinates
- >>> import xmltodict
- >>> from pprint import pprint
- >>> d = xmltodict.parse('''
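- ... <repositories>
- ...   <repository>
- ...     <id>codehausSnapshots</id>
- ...     <name>Codehaus Snapshots</name>
- ...     <url>http://snapshots.maven.codehaus.org/maven2</url>
- ...     <layout>default</layout>
- ...   </repository>
- ... </repositories>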
- ... ''')
- >>> MavenMapping().parse_repositories(d)
+ >>> MavenMapping().normalize_groupId('org.example')
+ {'@id': 'org.example'}
"""
- repositories = d.get("repositories")
- if not repositories:
- results = [self.parse_repository(d, self._default_repository)]
- elif isinstance(repositories, dict):
- repositories = repositories.get("repository") or []
- if not isinstance(repositories, list):
- repositories = [repositories]
- results = [self.parse_repository(d, repo) for repo in repositories]
- else:
- results = []
- return [res for res in results if res] or None
+ if isinstance(id_, str):
+ return {"@id": id_}
def parse_repository(self, d, repo):
if not isinstance(repo, dict):
@@ -91,72 +46,3 @@
):
repo = os.path.join(url, *group_id.split("."), artifact_id)
return {"@id": repo}
-
- def normalize_groupId(self, id_):
- """https://maven.apache.org/pom.html#Maven_Coordinates
-
- >>> MavenMapping().normalize_groupId('org.example')
- {'@id': 'org.example'}
- """
- if isinstance(id_, str):
- return {"@id": id_}
-
- def parse_licenses(self, d):
- """https://maven.apache.org/pom.html#Licenses
-
- >>> import xmltodict
- >>> import json
- >>> d = xmltodict.parse('''
- ... <licenses>
- ...   <license>
- ...     <name>Apache License, Version 2.0</name>
- ...     <url>https://www.apache.org/licenses/LICENSE-2.0.txt</url>
- ...   </license>
- ... </licenses>
- ... ''')
- >>> print(json.dumps(d, indent=4))
- {
- "licenses": {
- "license": {
- "name": "Apache License, Version 2.0",
- "url": "https://www.apache.org/licenses/LICENSE-2.0.txt"
- }
- }
- }
- >>> MavenMapping().parse_licenses(d)
- [{'@id': 'https://www.apache.org/licenses/LICENSE-2.0.txt'}]
-
- or, if there are more than one license:
-
- >>> import xmltodict
- >>> from pprint import pprint
- >>> d = xmltodict.parse('''
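- ... <licenses>
- ...   <license>
- ...     <name>Apache License, Version 2.0</name>
- ...     <url>https://www.apache.org/licenses/LICENSE-2.0.txt</url>
- ...   </license>
- ...   <license>
- ...     <name>MIT License</name>
- ...     <url>https://opensource.org/licenses/MIT</url>
- ...   </license>
- ... </licenses>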
- ... ''')
- >>> pprint(MavenMapping().parse_licenses(d))
- [{'@id': 'https://www.apache.org/licenses/LICENSE-2.0.txt'},
- {'@id': 'https://opensource.org/licenses/MIT'}]
- """
-
- licenses = d.get("licenses")
- if not isinstance(licenses, dict):
- return
- licenses = licenses.get("license")
- if isinstance(licenses, dict):
- licenses = [licenses]
- elif not isinstance(licenses, list):
- return
- return [
- {"@id": license["url"]}
- for license in licenses
- if isinstance(license, dict) and isinstance(license.get("url"), str)
- ] or None
diff --git a/swh/indexer/metadata_dictionary/nuget.py b/swh/indexer/metadata_dictionary/nuget.py
new file mode 100644
--- /dev/null
+++ b/swh/indexer/metadata_dictionary/nuget.py
@@ -0,0 +1,34 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import os.path
+
+from swh.indexer.codemeta import _DATA_DIR, _read_crosstable
+
+from .base import XmlMapping
+
+# Location of the NuGet crosswalk table inside the package data directory.
+NUGET_TABLE_PATH = os.path.join(_DATA_DIR, "nuget.csv")
+
+# The table is read once, when this module is imported. _read_crosstable
+# returns a pair, unpacked here into the CodeMeta terms and the crosswalk
+# table (indexed below by the "NuGet" column name).
+with open(NUGET_TABLE_PATH) as fd:
+ (CODEMETA_TERMS, NUGET_TABLE) = _read_crosstable(fd)
+
+
+class NuGetMapping(XmlMapping):
+ """
+ dedicated class for NuGet (.nuspec) mapping and translation
+ """
+
+ name = "nuget"
+ # NOTE(review): this is a single fixed file name; .nuspec files are
+ # presumably named after their package (e.g. sample.nuspec), so an exact
+ # match on b".nuspec" may never be found -- confirm how the base class
+ # uses `filename`.
+ filename = b".nuspec"
+ mapping = NUGET_TABLE["NuGet"]
+ # Fields listed here are source keys from the NuGet column of nuget.csv.
+ # NOTE(review): "version" is mapped in nuget.csv but is not listed here --
+ # confirm it is still translated.
+ string_fields = [
+ "license",
+ "description",
+ "name",
+ "projectUrl",
+ "id",
+ "readme",
+ ]
+ # NOTE(review): this is the Maven Central URL, apparently copied from
+ # MavenMapping; it is read by XmlMapping.parse_repositories -- confirm
+ # what, if anything, the default should be for NuGet.
+ _default_repository = {"url": "https://repo.maven.apache.org/maven2/"}
diff --git a/swh/indexer/tests/metadata_dictionary/test_nuget.py b/swh/indexer/tests/metadata_dictionary/test_nuget.py
new file mode 100644
--- /dev/null
+++ b/swh/indexer/tests/metadata_dictionary/test_nuget.py
@@ -0,0 +1,49 @@
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.indexer.metadata_dictionary import MAPPINGS
+
+
+def test_compute_metadata_nuget():
+ # Translates a sample .nuspec document with the NuGet mapping and compares
+ # the result with the expected CodeMeta document.
+ # NOTE(review): the XML markup of raw_content appears to have been
+ # stripped (only the element text remains below); the tags need to be
+ # restored before this test can exercise the parser.
+ raw_content = """
+
+
+
+ sample
+ 1.2.3
+ Kim Abercrombie, Franck Halmaert
+ Sample exists only to show a sample .nuspec file.
+ en-US
+ http://example.org/
+ MIT
+
+
+
+
+
+
+
+
+
+ """.encode(
+ "utf-8"
+ )
+
+ # MAPPINGS is indexed by mapping class name.
+ result = MAPPINGS["NuGetMapping"]().translate(raw_content)
+
+ # NOTE(review): nuget.csv maps `id` to `identifier`, yet no identifier is
+ # expected for the "sample" id -- confirm this is intended.
+ expected = {
+ "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+ "type": "SoftwareSourceCode",
+ "author": [
+ {"type": "Person", "name": "Kim Abercrombie"},
+ {"type": "Person", "name": "Franck Halmaert"},
+ ],
+ "description": "Sample exists only to show a sample .nuspec file.",
+ "license": "https://spdx.org/licenses/MIT",
+ "url": "http://example.org/",
+ "version": "1.2.3",
+ }
+
+ assert result == expected
diff --git a/swh/indexer/tests/test_cli.py b/swh/indexer/tests/test_cli.py
--- a/swh/indexer/tests/test_cli.py
+++ b/swh/indexer/tests/test_cli.py
@@ -100,6 +100,7 @@
"github",
"maven",
"npm",
+ "nuget",
"pkg-info",
"pubspec",
"",