Changeset View
Changeset View
Standalone View
Standalone View
swh/indexer/metadata_dictionary/maven.py
# Copyright (C) 2018-2019 The Software Heritage developers | # Copyright (C) 2018-2019 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import os | import os | ||||
from typing import Any, Dict, List, Optional | |||||
import xml.parsers.expat | import xml.parsers.expat | ||||
import xmltodict | import xmltodict | ||||
from swh.indexer.codemeta import CROSSWALK_TABLE, SCHEMA_URI | from swh.indexer.codemeta import CROSSWALK_TABLE, SCHEMA_URI | ||||
from .base import DictMapping, SingleFileMapping | from .base import DictMapping, SingleFileMapping | ||||
class MavenMapping(DictMapping, SingleFileMapping): | class MavenMapping(DictMapping, SingleFileMapping): | ||||
""" | """ | ||||
dedicated class for Maven (pom.xml) mapping and translation | dedicated class for Maven (pom.xml) mapping and translation | ||||
""" | """ | ||||
name = "maven" | name = "maven" | ||||
filename = b"pom.xml" | filename = b"pom.xml" | ||||
mapping = CROSSWALK_TABLE["Java (Maven)"] | mapping = CROSSWALK_TABLE["Java (Maven)"] | ||||
string_fields = ["name", "version", "description", "email"] | string_fields = ["name", "version", "description", "email"] | ||||
def translate(self, content: bytes) -> Optional[Dict[str, Any]]:
    """Translate the raw content of a ``pom.xml`` file into metadata.

    Args:
        content: raw bytes of the ``pom.xml`` file.

    Returns:
        The value returned by ``normalize_translation`` for the translated
        metadata, or ``None`` when the content cannot be parsed as XML,
        cannot be decoded, declares an unusable encoding, or when its
        ``project`` element is not a mapping.
    """
    try:
        # A missing or empty <project> element is treated as an empty dict.
        d = xmltodict.parse(content).get("project") or {}
    except xml.parsers.expat.ExpatError:
        self.log.warning("Error parsing XML from %s", self.log_suffix)
        return None
    except UnicodeDecodeError:
        self.log.warning("Error unidecoding XML from %s", self.log_suffix)
        return None
    except (LookupError, ValueError):
        # unknown encoding or multi-byte encoding
        self.log.warning("Error detecting XML encoding from %s", self.log_suffix)
        return None

    if not isinstance(d, dict):
        # xmltodict maps an element holding only text (for instance
        # ``<project>foo</project>``) to a plain string; the code below
        # calls ``d.get(...)`` and would raise AttributeError on it.
        self.log.warning("Skipping ill-formed XML content from %s", self.log_suffix)
        return None

    metadata = self._translate_dict(d, normalize=False)
    metadata[SCHEMA_URI + "codeRepository"] = self.parse_repositories(d)
    metadata[SCHEMA_URI + "license"] = self.parse_licenses(d)
    return self.normalize_translation(metadata)
_default_repository = {"url": "https://repo.maven.apache.org/maven2/"} | _default_repository = {"url": "https://repo.maven.apache.org/maven2/"} | ||||
def parse_repositories(self, d): | def parse_repositories(self, d: Dict[str, Any]) -> Optional[List[Any]]: | ||||
"""https://maven.apache.org/pom.html#Repositories | """https://maven.apache.org/pom.html#Repositories | ||||
>>> import xmltodict | >>> import xmltodict | ||||
>>> from pprint import pprint | >>> from pprint import pprint | ||||
>>> d = xmltodict.parse(''' | >>> d = xmltodict.parse(''' | ||||
... <repositories> | ... <repositories> | ||||
... <repository> | ... <repository> | ||||
... <id>codehausSnapshots</id> | ... <id>codehausSnapshots</id> | ||||
Show All 12 Lines | def parse_repositories(self, d: Dict[str, Any]) -> Optional[List[Any]]: | ||||
repositories = repositories.get("repository") or [] | repositories = repositories.get("repository") or [] | ||||
if not isinstance(repositories, list): | if not isinstance(repositories, list): | ||||
repositories = [repositories] | repositories = [repositories] | ||||
results = [self.parse_repository(d, repo) for repo in repositories] | results = [self.parse_repository(d, repo) for repo in repositories] | ||||
else: | else: | ||||
results = [] | results = [] | ||||
return [res for res in results if res] or None | return [res for res in results if res] or None | ||||
def parse_repository(self, d: Dict[str, Any], repo: Dict[str, Any]) -> Any:
    """Build the identifier of the project inside one Maven repository.

    Args:
        d: parsed ``project`` element; its ``groupId`` and ``artifactId``
            entries are read.
        repo: one parsed ``repository`` entry; its ``layout`` and ``url``
            entries are read.

    Returns:
        ``{"@id": path}`` where ``path`` joins the repository URL, the
        dot-separated parts of the group id and the artifact id.
        ``None`` when ``repo`` is not a dict, when its layout is not
        ``"default"``, or when the URL, group id or artifact id is not a
        string.
    """
    if not isinstance(repo, dict):
        return None

    layout = repo.get("layout", "default")
    if layout != "default":
        return None  # TODO ?

    coordinates = (repo.get("url"), d.get("groupId"), d.get("artifactId"))
    if not all(isinstance(value, str) for value in coordinates):
        return None

    base_url, group, artifact = coordinates
    return {"@id": os.path.join(base_url, *group.split("."), artifact)}
def normalize_groupId(self, id_: Any) -> Optional[Dict[str, str]]:
    """https://maven.apache.org/pom.html#Maven_Coordinates

    Wrap a string group id in an ``@id`` mapping; any value that is not
    a string yields ``None``.

    >>> MavenMapping().normalize_groupId('org.example')
    {'@id': 'org.example'}
    """
    if not isinstance(id_, str):
        # Non-string values are ignored rather than wrapped.
        return None
    return {"@id": id_}
def parse_licenses(self, d): | def parse_licenses(self, d: Dict[str, Any]) -> Optional[List[Dict[str, str]]]: | ||||
"""https://maven.apache.org/pom.html#Licenses | """https://maven.apache.org/pom.html#Licenses | ||||
>>> import xmltodict | >>> import xmltodict | ||||
>>> import json | >>> import json | ||||
>>> d = xmltodict.parse(''' | >>> d = xmltodict.parse(''' | ||||
... <licenses> | ... <licenses> | ||||
... <license> | ... <license> | ||||
... <name>Apache License, Version 2.0</name> | ... <name>Apache License, Version 2.0</name> | ||||
Show All 31 Lines | def parse_licenses(self, d: Dict[str, Any]) -> Optional[List[Dict[str, str]]]: | ||||
... ''') | ... ''') | ||||
>>> pprint(MavenMapping().parse_licenses(d)) | >>> pprint(MavenMapping().parse_licenses(d)) | ||||
[{'@id': 'https://www.apache.org/licenses/LICENSE-2.0.txt'}, | [{'@id': 'https://www.apache.org/licenses/LICENSE-2.0.txt'}, | ||||
{'@id': 'https://opensource.org/licenses/MIT'}] | {'@id': 'https://opensource.org/licenses/MIT'}] | ||||
""" | """ | ||||
licenses = d.get("licenses") | licenses = d.get("licenses") | ||||
if not isinstance(licenses, dict): | if not isinstance(licenses, dict): | ||||
return | return None | ||||
licenses = licenses.get("license") | licenses = licenses.get("license") | ||||
if isinstance(licenses, dict): | if isinstance(licenses, dict): | ||||
licenses = [licenses] | licenses = [licenses] | ||||
elif not isinstance(licenses, list): | elif not isinstance(licenses, list): | ||||
return | return None | ||||
return [ | return [ | ||||
{"@id": license["url"]} | {"@id": license["url"]} | ||||
for license in licenses | for license in licenses | ||||
if isinstance(license, dict) and isinstance(license.get("url"), str) | if isinstance(license, dict) and isinstance(license.get("url"), str) | ||||
] or None | ] or None |