diff --git a/swh/indexer/data/nuget.csv b/swh/indexer/data/nuget.csv new file mode 100644 --- /dev/null +++ b/swh/indexer/data/nuget.csv @@ -0,0 +1,69 @@ +Property,NuGet +codeRepository,repository.url +programmingLanguage, +runtimePlatform, +targetProduct, +applicationCategory, +applicationSubCategory, +downloadUrl, +fileSize, +installUrl, +memoryRequirements, +operatingSystem, +permissions, +processorRequirements, +releaseNotes, +softwareHelp, +softwareRequirements, +softwareVersion, +storageRequirements, +supportingData, +author,authors +citation, +contributor, +copyrightHolder, +copyrightYear, +dateCreated, +dateModified, +datePublished, +editor, +encoding, +fileFormat, +funder, +keywords, +license,license +producer, +provider, +publisher, +sponsor, +version,version +isAccessibleForFree, +isPartOf, +hasPart, +position, +description,description +identifier, +name,name +sameAs, +url,projectUrl +relatedLink, +givenName, +familyName, +email, +affiliation, +identifier,id +name, +address, +type, +id, +softwareSuggestions, +maintainer, +contIntegration, +buildInstructions, +developmentStatus, +embargoDate, +funding, +issueTracker, +referencePublication, +readme,readme +language,language diff --git a/swh/indexer/metadata_dictionary/__init__.py b/swh/indexer/metadata_dictionary/__init__.py --- a/swh/indexer/metadata_dictionary/__init__.py +++ b/swh/indexer/metadata_dictionary/__init__.py @@ -8,7 +8,7 @@ import click -from . import cff, codemeta, composer, dart, github, maven, npm, python, ruby +from . 
import cff, codemeta, composer, dart, github, maven, npm, nuget, python, ruby from .base import BaseExtrinsicMapping, BaseIntrinsicMapping, BaseMapping INTRINSIC_MAPPINGS: Dict[str, Type[BaseIntrinsicMapping]] = { @@ -20,6 +20,7 @@ "PubMapping": dart.PubspecMapping, "PythonPkginfoMapping": python.PythonPkginfoMapping, "ComposerMapping": composer.ComposerMapping, + "NuGetMapping": nuget.NuGetMapping, } EXTRINSIC_MAPPINGS: Dict[str, Type[BaseExtrinsicMapping]] = { diff --git a/swh/indexer/metadata_dictionary/base.py b/swh/indexer/metadata_dictionary/base.py --- a/swh/indexer/metadata_dictionary/base.py +++ b/swh/indexer/metadata_dictionary/base.py @@ -6,8 +6,10 @@ import json import logging from typing import Any, Callable, Dict, List, Optional, Tuple, TypeVar +import xml.parsers.expat from typing_extensions import TypedDict +import xmltodict import yaml from swh.indexer.codemeta import SCHEMA_URI, compact, merge_values @@ -268,3 +270,115 @@ return self._translate_dict(content_dict) return None + + +class XmlMapping(DictMapping, SingleFileIntrinsicMapping): + def translate(self, content: bytes) -> Optional[Dict[str, Any]]: + try: + d = xmltodict.parse(content).get("project") or {} + except xml.parsers.expat.ExpatError: + self.log.warning("Error parsing XML from %s", self.log_suffix) + return None + except UnicodeDecodeError: + self.log.warning("Error unidecoding XML from %s", self.log_suffix) + return None + except (LookupError, ValueError): + # unknown encoding or multi-byte encoding + self.log.warning("Error detecting XML encoding from %s", self.log_suffix) + return None + if not isinstance(d, dict): + self.log.warning("Skipping ill-formed XML content: %s", content) + return None + metadata = self._translate_dict(d, normalize=False) + metadata[SCHEMA_URI + "codeRepository"] = self.parse_repositories(d) + metadata[SCHEMA_URI + "license"] = self.parse_licenses(d) + return self.normalize_translation(metadata) + + def parse_repositories(self, d): + 
"""https://maven.apache.org/pom.html#Repositories + + >>> import xmltodict + >>> from pprint import pprint + >>> d = xmltodict.parse(''' + ... + ... + ... codehausSnapshots + ... Codehaus Snapshots + ... http://snapshots.maven.codehaus.org/maven2 + ... default + ... + ... + ... ''') + >>> MavenMapping().parse_repositories(d) + """ + repositories = d.get("repositories") + if not repositories: + results = [self.parse_repository(d, self._default_repository)] + elif isinstance(repositories, dict): + repositories = repositories.get("repository") or [] + if not isinstance(repositories, list): + repositories = [repositories] + results = [self.parse_repository(d, repo) for repo in repositories] + else: + results = [] + return [res for res in results if res] or None + + def parse_licenses(self, d): + """https://maven.apache.org/pom.html#Licenses + + >>> import xmltodict + >>> import json + >>> d = xmltodict.parse(''' + ... + ... + ... Apache License, Version 2.0 + ... https://www.apache.org/licenses/LICENSE-2.0.txt + ... + ... + ... ''') + >>> print(json.dumps(d, indent=4)) + { + "licenses": { + "license": { + "name": "Apache License, Version 2.0", + "url": "https://www.apache.org/licenses/LICENSE-2.0.txt" + } + } + } + >>> MavenMapping().parse_licenses(d) + [{'@id': 'https://www.apache.org/licenses/LICENSE-2.0.txt'}] + + or, if there are more than one license: + + >>> import xmltodict + >>> from pprint import pprint + >>> d = xmltodict.parse(''' + ... + ... + ... Apache License, Version 2.0 + ... https://www.apache.org/licenses/LICENSE-2.0.txt + ... + ... + ... MIT License + ... https://opensource.org/licenses/MIT + ... + ... + ... 
''') + >>> pprint(MavenMapping().parse_licenses(d)) + [{'@id': 'https://www.apache.org/licenses/LICENSE-2.0.txt'}, + {'@id': 'https://opensource.org/licenses/MIT'}] + """ + + licenses = d.get("licenses") + if not isinstance(licenses, dict): + return + licenses = licenses.get("license") + if isinstance(licenses, dict): + licenses = [licenses] + elif not isinstance(licenses, list): + return + return [ + {"@id": license["url"]} + for license in licenses + if isinstance(license, dict) and isinstance(license.get("url"), str) + ] or None diff --git a/swh/indexer/metadata_dictionary/maven.py b/swh/indexer/metadata_dictionary/maven.py --- a/swh/indexer/metadata_dictionary/maven.py +++ b/swh/indexer/metadata_dictionary/maven.py @@ -3,18 +3,15 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import os -from typing import Any, Dict, Optional -import xml.parsers.expat -import xmltodict +import os -from swh.indexer.codemeta import CROSSWALK_TABLE, SCHEMA_URI +from swh.indexer.codemeta import CROSSWALK_TABLE -from .base import DictMapping, SingleFileIntrinsicMapping +from .base import XmlMapping -class MavenMapping(DictMapping, SingleFileIntrinsicMapping): +class MavenMapping(XmlMapping): """ dedicated class for Maven (pom.xml) mapping and translation """ @@ -23,58 +20,16 @@ filename = b"pom.xml" mapping = CROSSWALK_TABLE["Java (Maven)"] string_fields = ["name", "version", "description", "email"] - - def translate(self, content: bytes) -> Optional[Dict[str, Any]]: - try: - d = xmltodict.parse(content).get("project") or {} - except xml.parsers.expat.ExpatError: - self.log.warning("Error parsing XML from %s", self.log_suffix) - return None - except UnicodeDecodeError: - self.log.warning("Error unidecoding XML from %s", self.log_suffix) - return None - except (LookupError, ValueError): - # unknown encoding or multi-byte encoding - self.log.warning("Error detecting XML encoding from %s", self.log_suffix) - 
return None - if not isinstance(d, dict): - self.log.warning("Skipping ill-formed XML content: %s", content) - return None - metadata = self._translate_dict(d, normalize=False) - metadata[SCHEMA_URI + "codeRepository"] = self.parse_repositories(d) - metadata[SCHEMA_URI + "license"] = self.parse_licenses(d) - return self.normalize_translation(metadata) - _default_repository = {"url": "https://repo.maven.apache.org/maven2/"} - def parse_repositories(self, d): - """https://maven.apache.org/pom.html#Repositories + def normalize_groupId(self, id_): + """https://maven.apache.org/pom.html#Maven_Coordinates - >>> import xmltodict - >>> from pprint import pprint - >>> d = xmltodict.parse(''' - ... - ... - ... codehausSnapshots - ... Codehaus Snapshots - ... http://snapshots.maven.codehaus.org/maven2 - ... default - ... - ... - ... ''') - >>> MavenMapping().parse_repositories(d) + >>> MavenMapping().normalize_groupId('org.example') + {'@id': 'org.example'} """ - repositories = d.get("repositories") - if not repositories: - results = [self.parse_repository(d, self._default_repository)] - elif isinstance(repositories, dict): - repositories = repositories.get("repository") or [] - if not isinstance(repositories, list): - repositories = [repositories] - results = [self.parse_repository(d, repo) for repo in repositories] - else: - results = [] - return [res for res in results if res] or None + if isinstance(id_, str): + return {"@id": id_} def parse_repository(self, d, repo): if not isinstance(repo, dict): @@ -91,72 +46,3 @@ ): repo = os.path.join(url, *group_id.split("."), artifact_id) return {"@id": repo} - - def normalize_groupId(self, id_): - """https://maven.apache.org/pom.html#Maven_Coordinates - - >>> MavenMapping().normalize_groupId('org.example') - {'@id': 'org.example'} - """ - if isinstance(id_, str): - return {"@id": id_} - - def parse_licenses(self, d): - """https://maven.apache.org/pom.html#Licenses - - >>> import xmltodict - >>> import json - >>> d = 
xmltodict.parse(''' - ... - ... - ... Apache License, Version 2.0 - ... https://www.apache.org/licenses/LICENSE-2.0.txt - ... - ... - ... ''') - >>> print(json.dumps(d, indent=4)) - { - "licenses": { - "license": { - "name": "Apache License, Version 2.0", - "url": "https://www.apache.org/licenses/LICENSE-2.0.txt" - } - } - } - >>> MavenMapping().parse_licenses(d) - [{'@id': 'https://www.apache.org/licenses/LICENSE-2.0.txt'}] - - or, if there are more than one license: - - >>> import xmltodict - >>> from pprint import pprint - >>> d = xmltodict.parse(''' - ... - ... - ... Apache License, Version 2.0 - ... https://www.apache.org/licenses/LICENSE-2.0.txt - ... - ... - ... MIT License - ... https://opensource.org/licenses/MIT - ... - ... - ... ''') - >>> pprint(MavenMapping().parse_licenses(d)) - [{'@id': 'https://www.apache.org/licenses/LICENSE-2.0.txt'}, - {'@id': 'https://opensource.org/licenses/MIT'}] - """ - - licenses = d.get("licenses") - if not isinstance(licenses, dict): - return - licenses = licenses.get("license") - if isinstance(licenses, dict): - licenses = [licenses] - elif not isinstance(licenses, list): - return - return [ - {"@id": license["url"]} - for license in licenses - if isinstance(license, dict) and isinstance(license.get("url"), str) - ] or None diff --git a/swh/indexer/metadata_dictionary/nuget.py b/swh/indexer/metadata_dictionary/nuget.py new file mode 100644 --- /dev/null +++ b/swh/indexer/metadata_dictionary/nuget.py @@ -0,0 +1,34 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import os.path + +from swh.indexer.codemeta import _DATA_DIR, _read_crosstable + +from .base import XmlMapping + +NUGET_TABLE_PATH = os.path.join(_DATA_DIR, "nuget.csv") + +with open(NUGET_TABLE_PATH) as fd: + (CODEMETA_TERMS, NUGET_TABLE) = _read_crosstable(fd) 
+ + +class NuGetMapping(XmlMapping): + """ + dedicated class for NuGet (.nuspec) mapping and translation + """ + + name = "nuget" + filename = b".nuspec" + mapping = NUGET_TABLE["NuGet"] + string_fields = [ + "license", + "description", + "name", + "projectUrl", + "id", + "readme", + ] + _default_repository = {"url": "https://repo.maven.apache.org/maven2/"} diff --git a/swh/indexer/tests/metadata_dictionary/test_nuget.py b/swh/indexer/tests/metadata_dictionary/test_nuget.py new file mode 100644 --- /dev/null +++ b/swh/indexer/tests/metadata_dictionary/test_nuget.py @@ -0,0 +1,49 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from swh.indexer.metadata_dictionary import MAPPINGS + + +def test_compute_metadata_nuget(): + raw_content = """ + + + + sample + 1.2.3 + Kim Abercrombie, Franck Halmaert + Sample exists only to show a sample .nuspec file. + en-US + http://example.org/ + MIT + + + + + + + + + + """.encode( + "utf-8" + ) + + result = MAPPINGS["NuGetMapping"]().translate(raw_content) + + expected = { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "type": "SoftwareSourceCode", + "author": [ + {"type": "Person", "name": "Kim Abercrombie"}, + {"type": "Person", "name": "Franck Halmaert"}, + ], + "description": "Sample exists only to show a sample .nuspec file.", + "license": "https://spdx.org/licenses/MIT", + "url": "http://example.org/", + "version": "1.2.3", + } + + assert result == expected diff --git a/swh/indexer/tests/test_cli.py b/swh/indexer/tests/test_cli.py --- a/swh/indexer/tests/test_cli.py +++ b/swh/indexer/tests/test_cli.py @@ -100,6 +100,7 @@ "github", "maven", "npm", + "nuget", "pkg-info", "pubspec", "",