Changeset View
Changeset View
Standalone View
Standalone View
swh/indexer/metadata_dictionary/maven.py
# Copyright (C) 2018-2019 The Software Heritage developers | # Copyright (C) 2018-2019 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import os | import os | ||||
from typing import Any, Dict, List, Optional | from typing import Any, Dict, List, Optional, Tuple, Union | ||||
import xml.parsers.expat | import xml.parsers.expat | ||||
import xmltodict | import xmltodict | ||||
from swh.indexer.codemeta import CROSSWALK_TABLE, SCHEMA_URI | from swh.indexer.codemeta import CROSSWALK_TABLE, SCHEMA_URI | ||||
from .base import DictMapping, SingleFileMapping | from .base import DictMapping, SingleFileMapping | ||||
class MavenMapping(DictMapping, SingleFileMapping): | class MavenMapping(DictMapping, SingleFileMapping): | ||||
""" | """ | ||||
dedicated class for Maven (pom.xml) mapping and translation | dedicated class for Maven (pom.xml) mapping and translation | ||||
""" | """ | ||||
name = "maven" | name = "maven" | ||||
filename = b"pom.xml" | filename = b"pom.xml" | ||||
mapping = CROSSWALK_TABLE["Java (Maven)"] | mapping = CROSSWALK_TABLE["Java (Maven)"] | ||||
string_fields = ["name", "version", "description", "email"] | string_fields = ["name", "version", "description", "email"] | ||||
def translate(
    self, content: bytes
) -> Optional[Dict[str, Union[str, List[Any], Dict[str, Any], Tuple[str]]]]:
    """Translate the raw bytes of a ``pom.xml`` file into metadata.

    Args:
        content: raw content of the ``pom.xml`` file.

    Returns:
        The result of ``self.normalize_translation`` applied to the
        translated metadata, or ``None`` when the content cannot be parsed
        or its ``<project>`` element is not a mapping.
    """
    try:
        # An absent or empty <project> element falls back to an empty dict.
        d = xmltodict.parse(content).get("project") or {}
    except xml.parsers.expat.ExpatError:
        self.log.warning("Error parsing XML from %s", self.log_suffix)
        return None
    except UnicodeDecodeError:
        self.log.warning("Error unidecoding XML from %s", self.log_suffix)
        return None
    except (LookupError, ValueError):
        # unknown encoding or multi-byte encoding
        self.log.warning("Error detecting XML encoding from %s", self.log_suffix)
        return None
    if not isinstance(d, dict):
        # A <project> element holding only text is parsed to a plain string;
        # everything below expects a mapping, so treat it as ill-formed
        # input rather than crashing.
        self.log.warning("Skipping ill-formed XML content from %s", self.log_suffix)
        return None
    metadata = self._translate_dict(d, normalize=False)
    # Repositories and licenses need dedicated parsing, so they are filled in
    # separately from the generic field translation above.
    metadata[SCHEMA_URI + "codeRepository"] = self.parse_repositories(d)
    metadata[SCHEMA_URI + "license"] = self.parse_licenses(d)
    return self.normalize_translation(metadata)
_default_repository = {"url": "https://repo.maven.apache.org/maven2/"} | _default_repository = {"url": "https://repo.maven.apache.org/maven2/"} | ||||
def parse_repositories(self, d: Dict[str, Any]) -> Optional[List[Any]]: | def parse_repositories(self, d: Dict[str, Any]) -> Optional[List[Dict[str, str]]]: | ||||
"""https://maven.apache.org/pom.html#Repositories | """https://maven.apache.org/pom.html#Repositories | ||||
>>> import xmltodict | >>> import xmltodict | ||||
>>> from pprint import pprint | >>> from pprint import pprint | ||||
>>> d = xmltodict.parse(''' | >>> d = xmltodict.parse(''' | ||||
... <repositories> | ... <repositories> | ||||
... <repository> | ... <repository> | ||||
... <id>codehausSnapshots</id> | ... <id>codehausSnapshots</id> | ||||
Show All 12 Lines | def parse_repositories(self, d: Dict[str, Any]) -> Optional[List[Dict[str, str]]]: | ||||
repositories = repositories.get("repository") or [] | repositories = repositories.get("repository") or [] | ||||
if not isinstance(repositories, list): | if not isinstance(repositories, list): | ||||
repositories = [repositories] | repositories = [repositories] | ||||
results = [self.parse_repository(d, repo) for repo in repositories] | results = [self.parse_repository(d, repo) for repo in repositories] | ||||
else: | else: | ||||
results = [] | results = [] | ||||
return [res for res in results if res] or None | return [res for res in results if res] or None | ||||
def parse_repository(
    self, d: Dict[str, Any], repo: Dict[str, Any]
) -> Optional[Dict[str, str]]:
    """Build the repository URL of the project inside one repository.

    Args:
        d: the parsed ``<project>`` element; its ``groupId`` and
            ``artifactId`` entries are read.
        repo: one parsed ``<repository>`` element; its ``layout`` and
            ``url`` entries are read.

    Returns:
        ``{"@id": <url>/<group id with dots as slashes>/<artifact id>}``,
        or ``None`` when ``repo`` is not a dict, uses a non-default layout,
        or any of the url, group id or artifact id is not a string.
    """
    if not isinstance(repo, dict):
        return None
    if repo.get("layout", "default") != "default":
        return None  # TODO ?
    url = repo.get("url")
    group_id = d.get("groupId")
    artifact_id = d.get("artifactId")
    if not (
        isinstance(url, str)
        and isinstance(group_id, str)
        and isinstance(artifact_id, str)
    ):
        return None
    # This is a URL, not a filesystem path: always join with "/" regardless
    # of platform, and never let a segment starting with "/" discard the
    # repository URL (which os.path.join would do).
    parts = [url.rstrip("/")]
    parts.extend(
        segment.strip("/") for segment in (*group_id.split("."), artifact_id)
    )
    return {"@id": "/".join(part for part in parts if part)}
def normalize_groupId(self, id_: str) -> Dict[str, str]: | def normalize_groupId(self, id_: str) -> Dict[str, str]: | ||||
"""https://maven.apache.org/pom.html#Maven_Coordinates | """https://maven.apache.org/pom.html#Maven_Coordinates | ||||
>>> MavenMapping().normalize_groupId('org.example') | >>> MavenMapping().normalize_groupId('org.example') | ||||
{'@id': 'org.example'} | {'@id': 'org.example'} | ||||
""" | """ | ||||
if isinstance(id_, str): | if isinstance(id_, str): | ||||
▲ Show 20 Lines • Show All 61 Lines • Show Last 20 Lines |