diff --git a/swh/indexer/metadata_dictionary/base.py b/swh/indexer/metadata_dictionary/base.py --- a/swh/indexer/metadata_dictionary/base.py +++ b/swh/indexer/metadata_dictionary/base.py @@ -159,9 +159,7 @@ return simple_terms | complex_terms - def _translate_dict( - self, content_dict: Dict, *, normalize: bool = True - ) -> Dict[str, str]: + def _translate_dict(self, content_dict: Dict) -> Dict[str, str]: """ Translates content by parsing content from a dict object and translating with the appropriate mapping @@ -209,10 +207,15 @@ else: translated_metadata[codemeta_key] = v - if normalize: - return self.normalize_translation(translated_metadata) - else: - return translated_metadata + self.extra_translation(translated_metadata, content_dict) + + return self.normalize_translation(translated_metadata) + + def extra_translation(self, translated_metadata: Dict[str, Any], d: Dict[str, Any]): + """Called at the end of the translation process, and may add arbitrary keys + to ``translated_metadata`` based on the input dictionary (passed as ``d``). + """ + pass class JsonMapping(DictMapping): diff --git a/swh/indexer/metadata_dictionary/maven.py b/swh/indexer/metadata_dictionary/maven.py --- a/swh/indexer/metadata_dictionary/maven.py +++ b/swh/indexer/metadata_dictionary/maven.py @@ -41,13 +41,15 @@ if not isinstance(d, dict): self.log.warning("Skipping ill-formed XML content: %s", content) return None - metadata = self._translate_dict(d, normalize=False) - metadata[SCHEMA.codeRepository] = self.parse_repositories(d) - metadata[SCHEMA.license] = self.parse_licenses(d) - return self.normalize_translation(metadata) + return self._translate_dict(d) _default_repository = {"url": "https://repo.maven.apache.org/maven2/"} + def extra_translation(self, translated_metadata, d): + repositories = self.parse_repositories(d) + if repositories: + translated_metadata[SCHEMA.codeRepository] = repositories + def parse_repositories(self, d): """https://maven.apache.org/pom.html#Repositories @@ -102,7 +104,12 @@ if isinstance(id_, str): return {"@id": id_} - def parse_licenses(self, d): + def translate_licenses(self, translated_metadata, d): + licenses = self.parse_licenses(d) + if licenses: + translated_metadata[SCHEMA.license] = licenses + + def parse_licenses(self, licenses): """https://maven.apache.org/pom.html#Licenses >>> import xmltodict @@ -124,7 +131,7 @@ } } } - >>> MavenMapping().parse_licenses(d) + >>> MavenMapping().parse_licenses(d["licenses"]) [{'@id': 'https://www.apache.org/licenses/LICENSE-2.0.txt'}] or, if there are more than one license: @@ -143,12 +150,11 @@ ... ... ... ''') - >>> pprint(MavenMapping().parse_licenses(d)) + >>> pprint(MavenMapping().parse_licenses(d["licenses"])) [{'@id': 'https://www.apache.org/licenses/LICENSE-2.0.txt'}, {'@id': 'https://opensource.org/licenses/MIT'}] """ - licenses = d.get("licenses") if not isinstance(licenses, dict): return licenses = licenses.get("license") diff --git a/swh/indexer/metadata_dictionary/python.py b/swh/indexer/metadata_dictionary/python.py --- a/swh/indexer/metadata_dictionary/python.py +++ b/swh/indexer/metadata_dictionary/python.py @@ -52,12 +52,13 @@ key = _normalize_pkginfo_key(key) if value != "UNKNOWN": d.setdefault(key, []).append(value) - metadata = self._translate_dict(d, normalize=False) + return self._translate_dict(d) - author_name = metadata.pop(SCHEMA.author, None) - author_email = metadata.pop(SCHEMA.email, None) + def extra_translation(self, translated_metadata, d): + author_name = translated_metadata.pop(SCHEMA.author, None) + author_email = translated_metadata.pop(SCHEMA.email, None) if author_name or author_email: - metadata[SCHEMA.author] = { + translated_metadata[SCHEMA.author] = { "@list": [ { "@type": SCHEMA.Person, @@ -66,7 +67,6 @@ } ] } - return self.normalize_translation(metadata) def normalize_home_page(self, urls): return [{"@id": url} for url in urls]