Changeset View
Changeset View
Standalone View
Standalone View
swh/indexer/metadata_dictionary.py
Show First 20 Lines • Show All 429 Lines • ▼ Show 20 Lines | def translate(self, content): | ||||
self.log.warning('Error unidecoding XML from %s', self.log_suffix) | self.log.warning('Error unidecoding XML from %s', self.log_suffix) | ||||
return None | return None | ||||
except (LookupError, ValueError): | except (LookupError, ValueError): | ||||
# unknown encoding or multi-byte encoding | # unknown encoding or multi-byte encoding | ||||
self.log.warning('Error detecting XML encoding from %s', | self.log.warning('Error detecting XML encoding from %s', | ||||
self.log_suffix) | self.log_suffix) | ||||
return None | return None | ||||
d = self.schema.to_dict(tree, validation='skip') | res = self.schema.to_dict(tree, validation='lax') | ||||
d = d or {} # it may be None if the document is empty but for the root | if res is None: | ||||
d = {} | |||||
else: | |||||
(d, errors) = res # TODO: do something with these errors | |||||
# d may be None if the document is empty but for the root | |||||
d = d or {} | |||||
metadata = self._translate_dict(d, normalize=False) | metadata = self._translate_dict(d, normalize=False) | ||||
metadata[SCHEMA_URI+'codeRepository'] = self.parse_repositories(d) | metadata[SCHEMA_URI+'codeRepository'] = self.parse_repositories(d) | ||||
metadata[SCHEMA_URI+'license'] = self.parse_licenses(d) | metadata[SCHEMA_URI+'license'] = self.parse_licenses(d) | ||||
return self.normalize_translation(metadata) | return self.normalize_translation(metadata) | ||||
_default_repository = { | _default_repository = { | ||||
POM_PREFIX + 'url': 'https://repo.maven.apache.org/maven2/'} | POM_PREFIX + 'url': 'https://repo.maven.apache.org/maven2/'} | ||||
Show All 13 Lines | def parse_repositories(self, d): | ||||
... </project> | ... </project> | ||||
... ''') | ... ''') | ||||
>>> d = MavenMapping.schema.to_dict(tree) | >>> d = MavenMapping.schema.to_dict(tree) | ||||
>>> MavenMapping().parse_repositories(d) | >>> MavenMapping().parse_repositories(d) | ||||
""" | """ | ||||
repositories = d.get(POM_PREFIX + 'repositories') | repositories = d.get(POM_PREFIX + 'repositories') | ||||
if not repositories: | if not repositories: | ||||
results = [self.parse_repository(d, self._default_repository)] | results = [self.parse_repository(d, self._default_repository)] | ||||
elif isinstance(repositories, dict): | else: | ||||
repositories = repositories.get(POM_PREFIX + 'repository') or [] | repositories = repositories.get(POM_PREFIX + 'repository') or [] | ||||
if not isinstance(repositories, list): | |||||
repositories = [repositories] | |||||
results = [self.parse_repository(d, repo) | results = [self.parse_repository(d, repo) | ||||
for repo in repositories] | for repo in repositories] | ||||
else: | |||||
results = [] | |||||
return [res for res in results if res] or None | return [res for res in results if res] or None | ||||
def parse_repository(self, d, repo): | def parse_repository(self, d, repo): | ||||
if not isinstance(repo, dict): | if not repo: | ||||
return | return | ||||
if repo.get('layout', 'default') != 'default': | if repo.get('layout', 'default') != 'default': | ||||
return # TODO ? | return # TODO ? | ||||
url = repo.get(POM_PREFIX + 'url') | url = repo.get(POM_PREFIX + 'url') | ||||
group_id = d.get(POM_PREFIX + 'groupId') | group_id = d.get(POM_PREFIX + 'groupId') | ||||
artifact_id = d.get(POM_PREFIX + 'artifactId') | artifact_id = d.get(POM_PREFIX + 'artifactId') | ||||
if (isinstance(url, str) and isinstance(group_id, str) | if url and group_id and artifact_id: | ||||
and isinstance(artifact_id, str)): | |||||
repo = os.path.join(url, *group_id.split('.'), artifact_id) | repo = os.path.join(url, *group_id.split('.'), artifact_id) | ||||
return {"@id": repo} | return {"@id": repo} | ||||
def normalize_groupId(self, id_): | def normalize_groupId(self, id_): | ||||
"""https://maven.apache.org/pom.html#Maven_Coordinates | """https://maven.apache.org/pom.html#Maven_Coordinates | ||||
>>> MavenMapping().normalize_groupId('org.example') | >>> MavenMapping().normalize_groupId('org.example') | ||||
{'@id': 'org.example'} | {'@id': 'org.example'} | ||||
""" | """ | ||||
if isinstance(id_, str): | if id_: | ||||
return {"@id": id_} | return {"@id": id_} | ||||
def parse_licenses(self, d): | def parse_licenses(self, d): | ||||
"""https://maven.apache.org/pom.html#Licenses | """https://maven.apache.org/pom.html#Licenses | ||||
>>> import json | >>> import json | ||||
>>> tree = defusedxml.ElementTree.fromstring(''' | >>> tree = defusedxml.ElementTree.fromstring(''' | ||||
... <project xmlns="http://maven.apache.org/POM/4.0.0"> | ... <project xmlns="http://maven.apache.org/POM/4.0.0"> | ||||
Show All 38 Lines | def parse_licenses(self, d): | ||||
... </project> | ... </project> | ||||
... ''') | ... ''') | ||||
>>> d = MavenMapping.schema.to_dict(tree) | >>> d = MavenMapping.schema.to_dict(tree) | ||||
>>> pprint(MavenMapping().parse_licenses(d)) | >>> pprint(MavenMapping().parse_licenses(d)) | ||||
[{'@id': 'https://www.apache.org/licenses/LICENSE-2.0.txt'}, | [{'@id': 'https://www.apache.org/licenses/LICENSE-2.0.txt'}, | ||||
{'@id': 'https://opensource.org/licenses/MIT'}] | {'@id': 'https://opensource.org/licenses/MIT'}] | ||||
""" # noqa: E501 | """ # noqa: E501 | ||||
licenses = d.get(POM_PREFIX + 'licenses') | licenses = d.get(POM_PREFIX + 'licenses') or {} | ||||
if not isinstance(licenses, dict): | licenses = licenses.get(POM_PREFIX + 'license') or [] | ||||
return | |||||
licenses = licenses.get(POM_PREFIX + 'license') | |||||
if isinstance(licenses, dict): | |||||
licenses = [licenses] | |||||
elif not isinstance(licenses, list): | |||||
return | |||||
return [{"@id": license[POM_PREFIX + 'url']} | return [{"@id": license[POM_PREFIX + 'url']} | ||||
for license in licenses | for license in licenses | ||||
if isinstance(license, dict) | if license[POM_PREFIX + 'url']] or None | ||||
and isinstance(license.get(POM_PREFIX + 'url'), str)] or None | |||||
_normalize_pkginfo_key = str.lower | _normalize_pkginfo_key = str.lower | ||||
class LinebreakPreservingEmailPolicy(email.policy.EmailPolicy): | class LinebreakPreservingEmailPolicy(email.policy.EmailPolicy): | ||||
def header_fetch_parse(self, name, value): | def header_fetch_parse(self, name, value): | ||||
if hasattr(value, 'name'): | if hasattr(value, 'name'): | ||||
▲ Show 20 Lines • Show All 167 Lines • Show Last 20 Lines |