diff --git a/swh/indexer/metadata_dictionary.py b/swh/indexer/metadata_dictionary.py --- a/swh/indexer/metadata_dictionary.py +++ b/swh/indexer/metadata_dictionary.py @@ -435,8 +435,14 @@ self.log_suffix) return None - d = self.schema.to_dict(tree, validation='skip') - d = d or {} # it may be None if the document is empty but for the root + res = self.schema.to_dict(tree, validation='lax') + if res is None: + d = {} + else: + (d, errors) = res # TODO: do something with these errors + # d may be None if the document is empty but for the root + d = d or {} + metadata = self._translate_dict(d, normalize=False) metadata[SCHEMA_URI+'codeRepository'] = self.parse_repositories(d) metadata[SCHEMA_URI+'license'] = self.parse_licenses(d) @@ -466,26 +472,21 @@ repositories = d.get(POM_PREFIX + 'repositories') if not repositories: results = [self.parse_repository(d, self._default_repository)] - elif isinstance(repositories, dict): + else: repositories = repositories.get(POM_PREFIX + 'repository') or [] - if not isinstance(repositories, list): - repositories = [repositories] results = [self.parse_repository(d, repo) for repo in repositories] - else: - results = [] return [res for res in results if res] or None def parse_repository(self, d, repo): - if not isinstance(repo, dict): + if not repo: return if repo.get('layout', 'default') != 'default': return # TODO ? url = repo.get(POM_PREFIX + 'url') group_id = d.get(POM_PREFIX + 'groupId') artifact_id = d.get(POM_PREFIX + 'artifactId') - if (isinstance(url, str) and isinstance(group_id, str) - and isinstance(artifact_id, str)): + if url and group_id and artifact_id: repo = os.path.join(url, *group_id.split('.'), artifact_id) return {"@id": repo} @@ -495,7 +496,7 @@ >>> MavenMapping().normalize_groupId('org.example') {'@id': 'org.example'} """ - if isinstance(id_, str): + if id_: return {"@id": id_} def parse_licenses(self, d): @@ -550,18 +551,11 @@ {'@id': 'https://opensource.org/licenses/MIT'}] """ # noqa: E501 - licenses = d.get(POM_PREFIX + 'licenses') - if not isinstance(licenses, dict): - return - licenses = licenses.get(POM_PREFIX + 'license') - if isinstance(licenses, dict): - licenses = [licenses] - elif not isinstance(licenses, list): - return + licenses = d.get(POM_PREFIX + 'licenses') or {} + licenses = licenses.get(POM_PREFIX + 'license') or [] return [{"@id": license[POM_PREFIX + 'url']} for license in licenses - if isinstance(license, dict) - and isinstance(license.get(POM_PREFIX + 'url'), str)] or None + if license[POM_PREFIX + 'url']] or None _normalize_pkginfo_key = str.lower