Changeset View
Standalone View
swh/loader/package/hackage/loader.py
- This file was added.
# Copyright (C) 2022 The Software Heritage developers | |||||
# See the AUTHORS file at the top-level directory of this distribution | |||||
# License: GNU General Public License version 3, or any later version | |||||
# See top-level LICENSE file for more information | |||||
from distutils.version import LooseVersion | |||||
import json | |||||
from pathlib import Path | |||||
from typing import Any, Dict, Iterator, Optional, Sequence, Tuple | |||||
import attr | |||||
from swh.loader.package.loader import BasePackageInfo, PackageLoader | |||||
from swh.loader.package.utils import Person, get_url_body, release_name | |||||
anlambert: `api_info` has been renamed to `get_url_body` | |||||
from swh.model.model import ObjectType, Release, Sha1Git, TimestampWithTimezone | |||||
from swh.storage.interface import StorageInterface | |||||
@attr.s | |||||
class HackagePackageInfo(BasePackageInfo): | |||||
name = attr.ib(type=str) | |||||
"""Name of the package""" | |||||
version = attr.ib(type=str) | |||||
"""Current version""" | |||||
last_modified = attr.ib(type=str) | |||||
"""File last modified date as release date""" | |||||
def extract_intrinsic_metadata(dir_path: Path, pkgname: str) -> Dict[str, Any]: | |||||
"""Extract intrinsic metadata from {pkgname}.cabal file at dir_path. | |||||
Each Haskell package version has a {pkgname}.cabal file at the root of the archive. | |||||
See https://cabal.readthedocs.io/en/3.4/cabal-package.html#package-properties for | |||||
package properties specifications. | |||||
Args: | |||||
dir_path: A directory on disk where a {pkgname}.cabal must be present | |||||
Returns: | |||||
A dict mapping with 'name', 'version', 'synopsis' and 'author' keys | |||||
""" | |||||
cabal_path = dir_path / f"{pkgname}.cabal" | |||||
content = cabal_path.read_text() | |||||
lines = content.splitlines() | |||||
result: dict = {} | |||||
keys = ["name:", "version:", "synopsis:", "author:"] | |||||
for line in lines: | |||||
for key in keys: | |||||
if line.startswith(key): | |||||
k = key.split(":")[0] | |||||
v = line.split(key)[1] | |||||
result[k] = v.lstrip().rstrip() | |||||
keys.remove(key) | |||||
# Cabal file keys may be capitalized | |||||
elif line.startswith(key.capitalize()): | |||||
k = key.capitalize().split(":")[0] | |||||
v = line.split(key.capitalize())[1] | |||||
result[k.lower()] = v.lstrip().rstrip() | |||||
keys.remove(key) | |||||
if not keys: | |||||
break | |||||
return result | |||||
Done Inline ActionsI don't understand why this loop is designed this way. First, it assumes field names are either lower-case or capitalized, but "Case is not significant in field names, but is significant in field values." according to https://cabal.readthedocs.io/en/3.4/cabal-package.html#package-descriptions Also I find the logic of checking keys one by one with both forms, and using them to split is confusing. I think you can simplify the loop to something like this: keys = ["name", "version", "synopsis", "author", "description"] for line in lines: (key, value) = line.split(":", 1) key = key.lower() if key in keys: result[key] = value.strip() keys.remove(key) Finally, it does not support multi-line values, which seem to be common for descriptions (and are valid as synopsis too, I assume) vlorentz: I don't understand why this loop is designed this way.
First, it assumes field names are… | |||||
Done Inline Actions
ok, simpler. I added a check against ":" if not it failed on comment and multilines
Last commit fix the issues. By the way please note that "synopsis" is the field used on the forge to display a title and description is more like a README content. The current implementation use synopsis and fallback to description if any, and description may be multiline. Can you confirm its ok this way or do you prefer we use description as a default? Also description is the only multiline field that we need to manage. franckbret: > I don't understand why this loop is designed this way.
>
> First, it assumes field names are… | |||||
Done Inline ActionsThat code still does not parse multiline values, for instance we would miss full description by parsing that cabal file. You should add tests for that function to cover the edge cases. anlambert: That code still does not parse multiline values, for instance we would miss full description by… | |||||
class HackageLoader(PackageLoader[HackagePackageInfo]): | |||||
visit_type = "hackage" | |||||
def __init__( | |||||
self, | |||||
storage: StorageInterface, | |||||
url: str, | |||||
**kwargs, | |||||
): | |||||
super().__init__(storage=storage, url=url, **kwargs) | |||||
self.url = url | |||||
def _raw_info(self, url: str, **extra_params) -> bytes: | |||||
return get_url_body(url=url, **extra_params) | |||||
Done Inline ActionsUse this instead: def _raw_info(self, url: str, **extra_params) -> bytes: return get_url_body(url=url, **extra_params) anlambert: Use this instead:
```lang=python
def _raw_info(self, url: str, **extra_params) -> bytes… | |||||
def info_versions(self) -> Dict: | |||||
"""Return the package versions (fetched from | |||||
https://hackage.haskell.org/package/{pkgname}) | |||||
Api documentation https://hackage.haskell.org/api | |||||
""" | |||||
return json.loads( | |||||
self._raw_info(url=self.url, headers={"Accept": "application/json"}) | |||||
) | |||||
Done Inline Actionsself._raw_info( url=self.url, headers={"Accept": "application/json"} ) anlambert: ```lang=python
self._raw_info(
url=self.url, headers={"Accept": "application/json"}
)
``` | |||||
def info_revisions(self, url) -> Dict: | |||||
"""Return the package version revisions (fetched from | |||||
https://hackage.haskell.org/package/{pkgname}-{version}/revisions/) | |||||
Api documentation https://hackage.haskell.org/api | |||||
""" | |||||
return json.loads( | |||||
self._raw_info(url=url, headers={"Accept": "application/json"}) | |||||
) | |||||
def get_versions(self) -> Sequence[str]: | |||||
"""Get all released versions of an Haskell package | |||||
Done Inline Actionsditto anlambert: ditto | |||||
Returns: | |||||
A sequence of versions | |||||
Example:: | |||||
["0.1.1", "0.10.2"] | |||||
""" | |||||
versions = list(self.info_versions().keys()) | |||||
versions.sort(key=LooseVersion) | |||||
return versions | |||||
def get_default_version(self) -> str: | |||||
"""Get the newest release version of an Haskell package | |||||
Returns: | |||||
A string representing a version | |||||
Example:: | |||||
"0.10.2" | |||||
""" | |||||
return self.get_versions()[-1] | |||||
def get_package_info( | |||||
self, version: str | |||||
) -> Iterator[Tuple[str, HackagePackageInfo]]: | |||||
"""Get release name and package information from version | |||||
Args: | |||||
version: Package version (e.g: "0.1.0") | |||||
Returns: | |||||
Iterator of tuple (release_name, p_info) | |||||
""" | |||||
pkgname: str = self.url.split("/")[-1] | |||||
url: str = ( | |||||
f"https://hackage.haskell.org/package/" | |||||
f"{pkgname}-{version}/{pkgname}-{version}.tar.gz" | |||||
) | |||||
filename: str = url.split("/")[-1] | |||||
# Retrieve version revisions | |||||
revisions_url: str = ( | |||||
f"https://hackage.haskell.org/package/{pkgname}-{version}/revisions/" | |||||
) | |||||
revisions = self.info_revisions(revisions_url) | |||||
revisions_time = [item["time"] for item in revisions] | |||||
last_modified = max(revisions_time) | |||||
Done Inline ActionsYou can concatenate the f-strings anlambert: You can concatenate the f-strings | |||||
p_info = HackagePackageInfo( | |||||
name=pkgname, | |||||
filename=filename, | |||||
url=url, | |||||
version=version, | |||||
last_modified=last_modified, | |||||
) | |||||
Not Done Inline Actionscould be simplified to: last_modified = max(item["time"] for item in revisions) anlambert: could be simplified to:
```lang=python
last_modified = max(item["time"] for item in revisions)… | |||||
yield release_name(version), p_info | |||||
def build_release( | |||||
self, p_info: HackagePackageInfo, uncompressed_path: str, directory: Sha1Git | |||||
) -> Optional[Release]: | |||||
# Extract intrinsic metadata from uncompressed_path/{pkgname}-{version}.cabal | |||||
intrinsic_metadata = extract_intrinsic_metadata( | |||||
Path(uncompressed_path) / f"{p_info.name}-{p_info.version}", p_info.name | |||||
) | |||||
version: str = intrinsic_metadata["version"] | |||||
assert version == p_info.version | |||||
anlambertUnsubmitted Done Inline ActionsHow about using p_info.version directly here and stop asserting ? There's good chances that some intrinsic_metadata contain mistakes as some humans wrote them, anlambert: How about using `p_info.version` directly here and stop asserting ?
There's good chances that… | |||||
franckbretAuthorUnsubmitted Done Inline ActionsLast commit goes this way and the last commented test pass. Even if the cabal file is not parsed correctly, we have acceptable data to load the archive. franckbret: Last commit goes this way and the last commented test pass. Even if the cabal file is not… | |||||
Done Inline ActionsGot the following warnings when testing the loader: docker-swh-loader-1 | [2022-10-19 15:43:19,789: DEBUG/ForkPoolWorker-32] Fetching https://hackage.haskell.org/package/hoodle docker-swh-loader-1 | [2022-10-19 15:43:19,928: DEBUG/ForkPoolWorker-32] Fetching https://hackage.haskell.org/package/hoodle-0.1/revisions/ docker-swh-loader-1 | [2022-10-19 15:43:20,211: WARNING/ForkPoolWorker-32] Can not decode md5 checksum b'y\x0b\x141S\x14\x93%\xa8\xe5s\x87!W_\xad' for 'https://hackage.haskell.org/package/hoodle-0.1/hoodle-0.1.tar.gz' You need to use this instead to get the hexadecimal representation of the md5 hash: from swh.model.hashutil import hash_to_hex checksums = {"md5": hash_to_hex(md5)} anlambert: Got the following warnings when testing the loader:
```
docker-swh-loader-1 | [2022-10-19 15… | |||||
Done Inline ActionsThanks I was not sure about how to deal with that franckbret: Thanks I was not sure about how to deal with that | |||||
author = Person.from_fullname(intrinsic_metadata["author"].encode()) | |||||
description: str = intrinsic_metadata["synopsis"] | |||||
message = ( | |||||
f"Synthetic release for Haskell source package {p_info.name} " | |||||
f"version {version}\n\n" | |||||
f"{description}\n" | |||||
) | |||||
return Release( | |||||
name=version.encode(), | |||||
author=author, | |||||
date=TimestampWithTimezone.from_iso8601(p_info.last_modified), | |||||
message=message.encode(), | |||||
target_type=ObjectType.DIRECTORY, | |||||
target=directory, | |||||
synthetic=True, | |||||
) | |||||
Done Inline ActionsNitpicks for shorter code and empty description handling. author_str = intrinsic_metdata.get("author") or p_info.author author = Person.from_fullname(author_str.encode()) description = intrinsic_metadata.get("synopsis", intrinsic_metadata.get("description")) message = ( f"Synthetic release for Haskell source package {p_info.name} " f"version {p_info.version}" ) if description: message += f"\n\n{description}" anlambert: Nitpicks for shorter code and empty description handling.
```lang=python
author_str =… | |||||
Done Inline ActionsDo agree for description part, but for author the problem is that p_info.author is of type Person, so I let it like this. franckbret: Do agree for description part, but for author the problem is that p_info.author is of type… | |||||
Done Inline ActionsAh right, use this instead then: author_str = intrinsic_metdata.get("author") author = Person.from_fullname(author_str.encode()) if author_str else p_info.author description = intrinsic_metadata.get("synopsis", intrinsic_metadata.get("description")) message = ( f"Synthetic release for Haskell source package {p_info.name} " f"version {p_info.version}\n" ) if description: message += f"\n{description}" anlambert: Ah right, use this instead then:
```lang=python
author_str = intrinsic_metdata.get("author")… | |||||
Done Inline Actionsdescription has been removed franckbret: description has been removed |
api_info has been renamed to get_url_body