Changeset View
Standalone View
swh/indexer/metadata_dictionary/nuget.py
- This file was added.
# Copyright (C) 2022 The Software Heritage developers | ||||||||||||
# See the AUTHORS file at the top-level directory of this distribution | ||||||||||||
# License: GNU General Public License version 3, or any later version | ||||||||||||
# See top-level LICENSE file for more information | ||||||||||||
import os.path | ||||||||||||
import re | ||||||||||||
from typing import Any, Dict, List, Optional | ||||||||||||
import xmltodict | ||||||||||||
from swh.indexer.codemeta import _DATA_DIR, SCHEMA_URI, _read_crosstable | ||||||||||||
from swh.indexer.storage.interface import Sha1 | ||||||||||||
from .base import DictMapping, DirectoryLsEntry, SingleFileIntrinsicMapping | ||||||||||||
NUGET_TABLE_PATH = os.path.join(_DATA_DIR, "nuget.csv") | ||||||||||||
with open(NUGET_TABLE_PATH) as fd: | ||||||||||||
(CODEMETA_TERMS, NUGET_TABLE) = _read_crosstable(fd) | ||||||||||||
class NuGetMapping(DictMapping, SingleFileIntrinsicMapping): | ||||||||||||
KShivendu: I think this should be `... for Nuget (.nuspec) mapping ...` | ||||||||||||
""" | ||||||||||||
dedicated class for NuGet (.nuspec) mapping and translation | ||||||||||||
""" | ||||||||||||
Done Inline ActionsIsn't .nuspec the extension? vlorentz: Isn't `.nuspec` the extension? | ||||||||||||
Done Inline Actionsoh, yes. Do I use a regex then? VickyMerzOwn: oh, yes. Do I use a regex then? | ||||||||||||
Done Inline Actionsyou can't use SingleFileIntrinsicMapping, so you have to reimplement its file-detection method vlorentz: you can't use `SingleFileIntrinsicMapping`, so you have to reimplement its file-detection method | ||||||||||||
name = "nuget" | ||||||||||||
mapping = NUGET_TABLE["NuGet"] | ||||||||||||
mapping["copyright"] = "http://schema.org/copyrightNotice" | ||||||||||||
mapping["language"] = "http://schema.org/inLanguage" | ||||||||||||
string_fields = [ | ||||||||||||
"description", | ||||||||||||
"version", | ||||||||||||
Done Inline Actionsnot anymore vlorentz: not anymore | ||||||||||||
"projectUrl", | ||||||||||||
"name", | ||||||||||||
"tags", | ||||||||||||
"license", | ||||||||||||
Done Inline Actions.get("package") defaults to None instead of {} so .get("metadata") won't work if package isn't found in the parsed dictionary. It's interesting that mypy didn't point out this. Please use .get("package", {}) instead. Also, why did we drop the try/except blocks? KShivendu: `.get("package")` defaults to `None` instead of `{}` so `.get("metadata")` won't work if… | ||||||||||||
Done Inline ActionsI think you mean .get("package").get("metadata", {}) to simplify using .get()'s parameter VickyMerzOwn: I think you mean `.get("package").get("metadata", {})` to simplify using `.get()`'s parameter… | ||||||||||||
Done Inline Actionsif xmltodict.parse(content.strip(b" \n ")) returns a dict without the package key, then .get("package") returns None, so you call None.get("metadata", {}) which doesn't exist. vlorentz: if `xmltodict.parse(content.strip(b" \n "))` returns a dict without the `package` key, then `. | ||||||||||||
"licenseUrl", | ||||||||||||
"summary", | ||||||||||||
"copyright", | ||||||||||||
Done Inline Actions
Unlike maven, we aren't performing any operation after _translate_dict. So we can directly use normalization (i.e. jsonld compaction) You can just directly use return self._translate_dict(d) (since normalize=True is the default) Skim through the code of self._translate_dict to learn more. KShivendu: Unlike maven, we aren't performing any operation after `_translate_dict`. So we can directly… | ||||||||||||
Done Inline Actionsoh, yeah! makes sense. Updating it.. VickyMerzOwn: oh, yeah! makes sense. Updating it..
| ||||||||||||
"language", | ||||||||||||
Not Done Inline Actionsplease add a test for that vlorentz: please add a test for that | ||||||||||||
Not Done Inline ActionsI'm not sure how I can test for this. Thinking about it VickyMerzOwn: I'm not sure how I can test for this. Thinking about it | ||||||||||||
Not Done Inline Actionstry using unitest test suites. it provides you assertLogs which you can use like this: python from unitest import TestCase class SomeTests(TestCase): def test_something(): with self.assertLogs(level="DEBUG") as captured: assert captured.records[0] == "..." there are other ways of doing the same thing. iirc, I did it once with a cleaner approach. so feel free to explore further. KShivendu: try using `unitest` test suites. it provides you `assertLogs` which you can use like this… | ||||||||||||
Not Done Inline Actionsno, we use pytest instead of unittest now. See https://docs.pytest.org/en/6.2.x/logging.html for testing logs. vlorentz: no, we use pytest instead of unittest now. See https://docs.pytest.org/en/6.2.x/logging.html… | ||||||||||||
Done Inline ActionsI'm still not clear how xmltodict.parse can output a non dict object. I added the check initially from MavenMapping's translate method. VickyMerzOwn: I'm still not clear how xmltodict.parse can output a non dict object. I added the check… | ||||||||||||
] | ||||||||||||
@classmethod | ||||||||||||
def detect_metadata_files(cls, file_entries: List[DirectoryLsEntry]) -> List[Sha1]: | ||||||||||||
for entry in file_entries: | ||||||||||||
if entry["name"].endswith(b".nuspec"): | ||||||||||||
Done Inline ActionsWe use isinstance(v, dict) everywhere. Please follow that. KShivendu: We use `isinstance(v, dict)` everywhere. Please follow that. | ||||||||||||
return [entry["sha1"]] | ||||||||||||
return [] | ||||||||||||
def translate(self, content: bytes) -> Optional[Dict[str, Any]]: | ||||||||||||
d = ( | ||||||||||||
xmltodict.parse(content.strip(b" \n ")) | ||||||||||||
Done Inline ActionsWe use isinstance(v, dict) everywhere. Please follow that. KShivendu: We use `isinstance(v, dict)` everywhere. Please follow that. | ||||||||||||
.get("package", {}) | ||||||||||||
Done Inline ActionsCan @url be absent? vlorentz: Can `@url` be absent? | ||||||||||||
Done Inline ActionsYes. I'll change that so the translation will happen only if @url is present. VickyMerzOwn: Yes. I'll change that so the translation will happen only if `@url` is present. | ||||||||||||
.get("metadata", {}) | ||||||||||||
) | ||||||||||||
if not isinstance(d, dict): | ||||||||||||
self.log.warning("Skipping ill-formed XML content: %s", content) | ||||||||||||
return None | ||||||||||||
Done Inline Actionsnitpick: You don't need to pass " " to strip. It's the default value. KShivendu: nitpick: You don't need to pass `" "` to `strip`. It's the default value. | ||||||||||||
Done Inline ActionsIt isn't: >>> "foo bar ".split() ['foo', 'bar'] >>> "foo bar ".split(" ") ['foo', '', 'bar', ''] See https://docs.python.org/3/library/stdtypes.html#str.split vlorentz: It isn't:
```
>>> "foo bar ".split()
['foo', 'bar']
>>> "foo bar ".split(" ")
['foo', ''… | ||||||||||||
Done Inline ActionsI think there's a small misunderstanding here. he's using strip not split I also got confused when I first saw it :p KShivendu: I think there's a small misunderstanding here.
he's using `strip` not `split`
I also got… | ||||||||||||
Done Inline Actionsah yes, my bad. strip removes all whitespaces by default though; but that's probably a good behavior anyway. vlorentz: ah yes, my bad. `strip` removes all whitespaces by default though; but that's probably a good… | ||||||||||||
Done Inline Actionscan #text be absent? vlorentz: can `#text` be absent? | ||||||||||||
Done Inline ActionsNo. if v.get("@type") == "expression" then #text is always present. VickyMerzOwn: No. if `v.get("@type") == "expression"` then `#text` is always present. | ||||||||||||
Done Inline Actionsthen use ["text"] instead of .get("text"). (and elsewhere too) vlorentz: then use `["text"]` instead of `.get("text")`. (and elsewhere too) | ||||||||||||
return self._translate_dict(d) | ||||||||||||
def normalize_projectUrl(self, s): | ||||||||||||
if isinstance(s, str): | ||||||||||||
return {"@id": s} | ||||||||||||
def translate_repository(self, translated_metadata, v): | ||||||||||||
if isinstance(v, dict) and isinstance(v["@url"], str): | ||||||||||||
codemeta_key = self.mapping["repository.url"] | ||||||||||||
Done Inline Actionsyou can use a list comprehension to rewrite this as a single line vlorentz: you can use a list comprehension to rewrite this as a single line | ||||||||||||
translated_metadata[codemeta_key] = {"@id": v["@url"]} | ||||||||||||
Not Done Inline Actionsremove this statement, the return value is not used. vlorentz: remove this statement, the return value is not used. | ||||||||||||
def normalize_license(self, v): | ||||||||||||
if isinstance(v, dict) and v["@type"] == "expression": | ||||||||||||
license_string = v["#text"] | ||||||||||||
if not bool( | ||||||||||||
re.search(r" with |\(|\)| and ", license_string, re.IGNORECASE) | ||||||||||||
): | ||||||||||||
return [ | ||||||||||||
{"@id": "https://spdx.org/licenses/" + license_type.strip()} | ||||||||||||
for license_type in re.split( | ||||||||||||
r" or ", license_string, flags=re.IGNORECASE | ||||||||||||
) | ||||||||||||
] | ||||||||||||
else: | ||||||||||||
return None | ||||||||||||
def normalize_licenseUrl(self, s): | ||||||||||||
if isinstance(s, str): | ||||||||||||
return {"@id": s} | ||||||||||||
def normalize_authors(self, s): | ||||||||||||
if isinstance(s, str): | ||||||||||||
author_names = [a.strip() for a in s.split(",")] | ||||||||||||
authors = [ | ||||||||||||
{"@type": SCHEMA_URI + "Person", SCHEMA_URI + "name": name} | ||||||||||||
for name in author_names | ||||||||||||
Not Done Inline ActionsThis is incorrect as s is not an URL. Unfortunately, until https://github.com/codemeta/codemeta/pull/280 gets merged, this means we cannot use codemeta's restriction of http://schema.org/releaseNotes, so we have to use http://schema.org/releaseNotes directly. I'll spare you the back-and-forth, this is how it should be done: def translate_releaseNotes(self, translated_metadata, s): if isinstance(s, str): translated_metadata.setdefault("http://schema.org/releaseNotes", []).append(s) and also, you can remove releaseNotes from string_fields above, it's redundant when either normalize_releaseNotes or translate_releaseNotes is present. vlorentz: This is incorrect as `s` is not an URL.
Unfortunately, until https://github. | ||||||||||||
] | ||||||||||||
return {"@list": authors} | ||||||||||||
def translate_releaseNotes(self, translated_metadata, s): | ||||||||||||
if isinstance(s, str): | ||||||||||||
translated_metadata.setdefault("http://schema.org/releaseNotes", []).append( | ||||||||||||
s | ||||||||||||
) | ||||||||||||
def normalize_tags(self, s): | ||||||||||||
if isinstance(s, str): | ||||||||||||
return s.split(" ") |
I think this should be ... for Nuget (.nuspec) mapping ...