Changeset View
Changeset View
Standalone View
Standalone View
swh/loader/package/pubdev/loader.py
# Copyright (C) 2022 The Software Heritage developers | # Copyright (C) 2022 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import json | import json | ||||
from pathlib import Path | from typing import Dict, Iterator, Optional, Sequence, Tuple | ||||
from typing import Any, Dict, Iterator, Optional, Sequence, Tuple | |||||
import attr | import attr | ||||
from packaging.version import parse as parse_version | from packaging.version import parse as parse_version | ||||
import yaml | |||||
from swh.loader.package.loader import BasePackageInfo, PackageLoader | from swh.loader.package.loader import BasePackageInfo, PackageLoader | ||||
from swh.loader.package.utils import ( | from swh.loader.package.utils import ( | ||||
EMPTY_AUTHOR, | EMPTY_AUTHOR, | ||||
Person, | Person, | ||||
cached_method, | cached_method, | ||||
get_url_body, | get_url_body, | ||||
release_name, | release_name, | ||||
Show All 12 Lines | class PubDevPackageInfo(BasePackageInfo): | ||||
"""Current version""" | """Current version""" | ||||
last_modified = attr.ib(type=str) | last_modified = attr.ib(type=str) | ||||
"""Last modified date as release date""" | """Last modified date as release date""" | ||||
author = attr.ib(type=Person) | author = attr.ib(type=Person) | ||||
"""Author""" | """Author""" | ||||
description = attr.ib(type=str) | |||||
"""Description""" | |||||
def extract_intrinsic_metadata(dir_path: Path) -> Dict[str, Any]: | |||||
"""Extract intrinsic metadata from pubspec.yaml file at dir_path. | |||||
Each pub.dev package version has a pubspec.yaml file at the root of the archive. | |||||
See https://dart.dev/tools/pub/pubspec for pubspec specifications. | |||||
Args: | |||||
dir_path: A directory on disk where a pubspec.yaml must be present | |||||
Returns: | |||||
A dict mapping from yaml parser | |||||
""" | |||||
pubspec_path = dir_path / "pubspec.yaml" | |||||
return yaml.safe_load(pubspec_path.read_text()) | |||||
class PubDevLoader(PackageLoader[PubDevPackageInfo]): | class PubDevLoader(PackageLoader[PubDevPackageInfo]): | ||||
visit_type = "pubdev" | visit_type = "pubdev" | ||||
PUBDEV_BASE_URL = "https://pub.dev/" | PUBDEV_BASE_URL = "https://pub.dev/" | ||||
def __init__( | def __init__( | ||||
self, | self, | ||||
storage: StorageInterface, | storage: StorageInterface, | ||||
url: str, | url: str, | ||||
**kwargs, | **kwargs, | ||||
): | ): | ||||
super().__init__(storage=storage, url=url, **kwargs) | super().__init__(storage=storage, url=url, **kwargs) | ||||
self.url = url | self.url = url | ||||
assert url.startswith(self.PUBDEV_BASE_URL) | assert url.startswith(self.PUBDEV_BASE_URL) | ||||
self.package_info_url = url.replace( | self.package_info_url = url.replace( | ||||
self.PUBDEV_BASE_URL, f"{self.PUBDEV_BASE_URL}api/" | self.PUBDEV_BASE_URL, f"{self.PUBDEV_BASE_URL}api/" | ||||
) | ) | ||||
def _raw_info(self) -> bytes: | |||||
return get_url_body(self.package_info_url) | |||||
@cached_method | @cached_method | ||||
def info(self) -> Dict: | def info(self) -> Dict: | ||||
"""Return the project metadata information (fetched from pub.dev registry)""" | """Return the project metadata information (fetched from pub.dev registry)""" | ||||
# Use strict=False in order to correctly manage case where \n is present in a string | # Use strict=False in order to correctly manage case where \n is present in a string | ||||
info = json.loads(self._raw_info(), strict=False) | info = json.loads(get_url_body(self.package_info_url), strict=False) | ||||
# Arrange versions list as a new dict with `version` as key | # Arrange versions list as a new dict with `version` as key | ||||
versions = {v["version"]: v for v in info["versions"]} | versions = {v["version"]: v for v in info["versions"]} | ||||
info["versions"] = versions | info["versions"] = versions | ||||
return info | return info | ||||
def get_versions(self) -> Sequence[str]: | def get_versions(self) -> Sequence[str]: | ||||
"""Get all released versions of a PubDev package | """Get all released versions of a PubDev package | ||||
Show All 27 Lines | def get_package_info(self, version: str) -> Iterator[Tuple[str, PubDevPackageInfo]]: | ||||
Package info comes from extrinsic metadata (from self.info()) | Package info comes from extrinsic metadata (from self.info()) | ||||
Args: | Args: | ||||
version: Package version (e.g: "0.1.0") | version: Package version (e.g: "0.1.0") | ||||
Returns: | Returns: | ||||
Iterator of tuple (release_name, p_info) | Iterator of tuple (release_name, p_info) | ||||
""" | """ | ||||
v = self.info()["versions"][version] | v = self.info()["versions"][version] | ||||
anlambert: As we are querying the pubdev Web API for getting package info, I think we should store the… | |||||
Done Inline Actions Yep sure, will add raw_extrinsic metadata in another patch. franckbret: Yep sure, will add raw_extrinsic metadata in another patch. | |||||
assert v["version"] == version | assert v["version"] == version | ||||
url = v["archive_url"] | url = v["archive_url"] | ||||
name = v["pubspec"]["name"] | name = v["pubspec"]["name"] | ||||
filename = f"{name}-{version}.tar.gz" | filename = f"{name}-{version}.tar.gz" | ||||
last_modified = v["published"] | last_modified = v["published"] | ||||
checksums = {"sha256": v["archive_sha256"]} if v.get("archive_sha256") else {} | |||||
if "authors" in v["pubspec"]: | authors = v.get("pubspec", {}).get("authors") | ||||
if authors and isinstance(authors, list): | |||||
# TODO: here we have a list of author, see T3887 | # TODO: here we have a list of author, see T3887 | ||||
Done Inline Actionswhat is the purpose of any(authors)? is it possible to have empty author names in the list? vlorentz: what is the purpose of `any(authors)`? is it possible to have empty author names in the list? | |||||
Done Inline Actions Can't remember, maybe when doing docker I met a list with an empty string. franckbret: Can't remember, maybe when doing docker I met a list with an empty string. | |||||
Not Done Inline Actionsthen you shouldn't unconditionally take the first string if it may be empty. (and please add a regression test) vlorentz: then you shouldn't unconditionally take the first string if it may be empty.
(and please add a… | |||||
Done Inline Actions I've run a script to check the whole data set and did not find a case where authors is not a list or is a list with empty strings. DEBUG:10/11/2022 07:05:37 AM:Found 34924 packages, 298689 versions, 82619 author, 11462 authors, 204608 empty author So using `any` here is useless for sure. franckbret: I've run a script to check the whole data set and did not find a case where authors is not… | |||||
author = Person.from_fullname(v["pubspec"]["authors"][0].encode()) | author = Person.from_fullname(authors[0].encode()) | ||||
elif "author" in v["pubspec"] and v["pubspec"]["author"] is not None: | elif v.get("pubspec", {}).get("author"): | ||||
author = Person.from_fullname(v["pubspec"]["author"].encode()) | author = Person.from_fullname(v["pubspec"]["author"].encode()) | ||||
else: | else: | ||||
author = EMPTY_AUTHOR | author = EMPTY_AUTHOR | ||||
description = v["pubspec"]["description"] | |||||
p_info = PubDevPackageInfo( | p_info = PubDevPackageInfo( | ||||
name=name, | name=name, | ||||
filename=filename, | filename=filename, | ||||
url=url, | url=url, | ||||
version=version, | version=version, | ||||
last_modified=last_modified, | last_modified=last_modified, | ||||
author=author, | author=author, | ||||
description=description, | checksums=checksums, | ||||
checksums={"sha256": v["archive_sha256"]}, | |||||
) | ) | ||||
yield release_name(version), p_info | yield release_name(version), p_info | ||||
def build_release( | def build_release( | ||||
self, p_info: PubDevPackageInfo, uncompressed_path: str, directory: Sha1Git | self, p_info: PubDevPackageInfo, uncompressed_path: str, directory: Sha1Git | ||||
) -> Optional[Release]: | ) -> Optional[Release]: | ||||
# Extract intrinsic metadata from uncompressed_path/pubspec.yaml | |||||
intrinsic_metadata = extract_intrinsic_metadata(Path(uncompressed_path)) | |||||
name: str = intrinsic_metadata["name"] | |||||
version: str = intrinsic_metadata["version"] | |||||
assert version == p_info.version | |||||
# author from intrinsic_metadata should not take precedence over the one | |||||
# returned by the api, see https://dart.dev/tools/pub/pubspec#authorauthors | |||||
author: Person = p_info.author | |||||
if "description" in intrinsic_metadata and intrinsic_metadata["description"]: | |||||
description = intrinsic_metadata["description"] | |||||
else: | |||||
description = p_info.description | |||||
message = ( | message = ( | ||||
f"Synthetic release for pub.dev source package {name} " | f"Synthetic release for pub.dev source package {p_info.name} " | ||||
f"version {version}\n\n" | f"version {p_info.version}\n" | ||||
f"{description}\n" | |||||
) | ) | ||||
return Release( | return Release( | ||||
name=version.encode(), | name=p_info.version.encode(), | ||||
author=author, | author=p_info.author, | ||||
date=TimestampWithTimezone.from_iso8601(p_info.last_modified), | date=TimestampWithTimezone.from_iso8601(p_info.last_modified), | ||||
message=message.encode(), | message=message.encode(), | ||||
target_type=ObjectType.DIRECTORY, | target_type=ObjectType.DIRECTORY, | ||||
target=directory, | target=directory, | ||||
synthetic=True, | synthetic=True, | ||||
) | ) |
As we are querying the pubdev Web API to get package info, I think we should store the JSON data associated with a specific version as extrinsic metadata.
Nevertheless, this is out of scope for that diff and should be handled in a new one.