Changeset View
Changeset View
Standalone View
Standalone View
swh/loader/package/conda/loader.py
- This file was added.
# Copyright (C) 2022 The Software Heritage developers | |||||||||||||||||||
# See the AUTHORS file at the top-level directory of this distribution | |||||||||||||||||||
# License: GNU General Public License version 3, or any later version | |||||||||||||||||||
# See top-level LICENSE file for more information | |||||||||||||||||||
import json | |||||||||||||||||||
from pathlib import Path | |||||||||||||||||||
from typing import Any, Dict, Iterator, List, Optional, Sequence, Tuple | |||||||||||||||||||
import attr | |||||||||||||||||||
import iso8601 | |||||||||||||||||||
from packaging.version import parse as parse_version | |||||||||||||||||||
import yaml | |||||||||||||||||||
from swh.loader.package.loader import BasePackageInfo, PackageLoader | |||||||||||||||||||
from swh.loader.package.utils import EMPTY_AUTHOR, Person, get_url_body, release_name | |||||||||||||||||||
from swh.model.model import ObjectType, Release, Sha1Git, TimestampWithTimezone | |||||||||||||||||||
from swh.storage.interface import StorageInterface | |||||||||||||||||||
@attr.s
class CondaPackageInfo(BasePackageInfo):
    """Description of a single Conda artifact to load.

    Adds four required string fields on top of the ones inherited from
    ``BasePackageInfo``.
    """

    name = attr.ib(type=str)
    """Name of the package"""

    # NOTE(review): the attribute docstring below says "tar.gz", but the
    # artifacts this loader was exercised with are named ``*.tar.bz2`` --
    # confirm which archive formats are expected and adjust the wording.
    filename = attr.ib(type=str)
    """Archive (tar.gz) file name"""

    # This value is the key of ``CondaLoader.artifacts`` and is encoded as the
    # name of the release built for this artifact.
    version = attr.ib(type=str)
    """Complete version and distribution name. Ex: 'linux-64/0.1.1-py37'
    """

    # ISO 8601 formatted string: it is produced with ``.isoformat()`` and
    # parsed back with ``TimestampWithTimezone.from_iso8601``.
    last_modified = attr.ib(type=str)
    """File last modified date as release date"""
def extract_intrinsic_metadata(dir_path: Path) -> Dict[str, Any]:
    """Extract intrinsic metadata from an uncompressed Conda package.

    Each Conda package version may have an ``info/about.json`` file in the
    archive. If it is missing, or cannot be parsed into a non-empty mapping,
    we try to get metadata from ``info/recipe/meta.yaml`` instead. The first
    file that yields metadata wins: ``meta.yaml`` is not read when
    ``about.json`` already provided a result.

    See https://docs.conda.io/projects/conda/en/latest/user-guide/concepts/pkg-specs.html?highlight=meta.yaml#info-about-json  # noqa: B950
    for package specifications.

    Args:
        dir_path: A directory on disk where a metadata file can be found

    Returns:
        A dict of the parsed metadata, or an empty dict when no metadata file
        exists or none of them could be parsed into a non-empty mapping.
    """
    about_json_path = dir_path / "info" / "about.json"
    meta_yaml_path = dir_path / "info" / "recipe" / "meta.yaml"

    if about_json_path.is_file():
        try:
            parsed = json.loads(about_json_path.read_text())
        except (json.JSONDecodeError, UnicodeDecodeError):
            # A broken about.json must not abort the load: fall through to
            # the recipe file.
            parsed = None
        # Callers use ``.get`` on the result, so only a mapping is acceptable.
        if parsed and isinstance(parsed, dict):
            return parsed

    if meta_yaml_path.is_file():
        try:
            parsed = yaml.safe_load(meta_yaml_path.read_text())
        except (yaml.YAMLError, UnicodeDecodeError):
            parsed = None
        # ``safe_load`` returns None for an empty document and may return a
        # scalar or a list; none of those are usable metadata.
        if parsed and isinstance(parsed, dict):
            return parsed

    return {}
Done Inline Actions
anlambert: We should return the first parsed metadata file and handle parsing errors here:
```lang=python
metadata: Dict[str, Any] = {}
meta_json_path = dir_path / "info" / "about.json"
meta_yml_path = dir_path / "info" / "recipe" / "meta.yaml"
if meta_json_path.exists():
    try:
        metadata = json.loads(meta_json_path.read_text())
    except json.JSONDecodeError:
        pass
if meta_yml_path.exists() and not metadata:
    try:
        metadata = yaml.safe_load(meta_yml_path.read_text())
    except yaml.YAMLError:
        pass
return metadata
```
class CondaLoader(PackageLoader[CondaPackageInfo]):
    """Package loader for Conda origins: builds one release per listed artifact."""

    visit_type = "conda"

    def __init__(
        self,
        storage: StorageInterface,
        url: str,
        artifacts: List[Dict[str, Any]],
        **kwargs,
    ):
        """Initialize the loader.

        Args:
            storage: Storage instance, forwarded to the base loader
            url: Origin url; its last path segment is used as package name
            artifacts: One dict per artifact to load. The ``version``, ``url``
                and ``filename`` keys are required; ``date`` and ``checksums``
                are used when present
            **kwargs: Extra arguments forwarded to the base loader
        """
        super().__init__(storage=storage, url=url, **kwargs)
        self.url = url
        # Artifacts are indexed by their version string; if two artifacts
        # share the same version the last one listed wins.
        self.artifacts: Dict[str, Dict] = {
            artifact["version"]: artifact for artifact in artifacts
        }

    def _raw_info(self, url: str, **extra_params) -> bytes:
        """Return the body fetched from ``url`` through ``get_url_body``."""
        return get_url_body(url=url, **extra_params)

    def get_versions(self) -> Sequence[str]:
        """Get all released versions of a Conda package

        Returns:
            A sequence of versions, sorted in ascending order using
            ``packaging.version.parse`` as sort key

        Example::

            ["0.1.1", "0.10.2"]
        """
        # NOTE(review): artifact versions can look like 'linux-64/0.1.1-py37';
        # confirm that the installed ``packaging`` release accepts such
        # strings in ``parse``.
        versions = list(self.artifacts.keys())
        versions.sort(key=parse_version)
        return versions

    def get_default_version(self) -> str:
        """Get the newest release version of a Conda package

        Returns:
            A string representing a version

        Raises:
            IndexError: when the loader was created without any artifact

        Example::

            "0.10.2"
        """
        return self.get_versions()[-1]

    def get_package_info(self, version: str) -> Iterator[Tuple[str, CondaPackageInfo]]:
        """Get release name and package information from version

        Args:
            version: Package version (e.g: "0.1.0")

        Returns:
            Iterator of tuple (release_name, p_info)

        Raises:
            KeyError: when ``version`` is not one of the known artifacts, or
                the artifact lacks its ``url`` or ``filename``
        """
        data = self.artifacts[version]
        pkgname: str = self.url.split("/")[-1]
        url: str = data["url"]
        filename: str = data["filename"]

        # The date can be missing from listed artifacts. ``last_modified`` is
        # declared as ``str`` on CondaPackageInfo, so a missing date is
        # carried as an empty string that build_release() maps to "no date".
        raw_date = data.get("date")
        last_modified = iso8601.parse_date(raw_date).isoformat() if raw_date else ""

        # Forward the checksums when the artifact provides them, so that they
        # are available to check the integrity of the downloaded tarball;
        # otherwise keep the default of the base package info.
        optional_fields: Dict[str, Any] = {}
        if data.get("checksums"):
            optional_fields["checksums"] = data["checksums"]

        p_info = CondaPackageInfo(
            name=pkgname,
            filename=filename,
            url=url,
            version=version,
            last_modified=last_modified,
            **optional_fields,
        )

        yield release_name(version), p_info

    def build_release(
        self, p_info: CondaPackageInfo, uncompressed_path: str, directory: Sha1Git
    ) -> Optional[Release]:
        """Build the synthetic release for one uncompressed artifact.

        Args:
            p_info: Package information of the artifact
            uncompressed_path: Directory where the archive was uncompressed
            directory: Identifier of the loaded directory, used as target

        Returns:
            A release named after ``p_info.version`` and targeting
            ``directory``
        """
        # Extract intrinsic metadata from archive to get description and author
        metadata = extract_intrinsic_metadata(Path(uncompressed_path))

        # "about" and "extra" may be present with a null value, hence the
        # isinstance checks rather than ``.get(key, {})``.
        about = metadata.get("about")
        nested_summary = about.get("summary") if isinstance(about, dict) else None
        description = metadata.get("summary") or nested_summary or ""

        author = EMPTY_AUTHOR
        extra = metadata.get("extra")
        maintainers = extra.get("recipe-maintainers") if isinstance(extra, dict) else None
        # Require a real list: indexing a plain string would silently use its
        # first character as author.
        if isinstance(maintainers, list):
            # TODO: here we have a list of author, see T3887
            # Entries can be None, so take the first usable name instead of
            # blindly using maintainers[0].
            first_maintainer = next(
                (m for m in maintainers if m and isinstance(m, str)), None
            )
            if first_maintainer is not None:
                author = Person.from_fullname(first_maintainer.encode())

        message = (
            f"Synthetic release for Conda source package {p_info.name} "
            f"version {p_info.version}"
        )
        # Only add the blank line and the description when there is one.
        if description:
            message += f"\n\n{description}"
        message += "\n"

        # NOTE(review): a release without date relies on ``Release`` accepting
        # ``date=None`` -- confirm against swh.model.
        date = (
            TimestampWithTimezone.from_iso8601(p_info.last_modified)
            if p_info.last_modified
            else None
        )

        return Release(
            name=p_info.version.encode(),
            author=author,
            date=date,
            message=message.encode(),
            target_type=ObjectType.DIRECTORY,
            target=directory,
            synthetic=True,
        )
You should return the first metadata file content you find here: