Changeset View
Changeset View
Standalone View
Standalone View
swh/loader/package/pubdev/loader.py
| # Copyright (C) 2022 The Software Heritage developers | # Copyright (C) 2022 The Software Heritage developers | ||||
| # See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
| # License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
| # See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
| import json | import json | ||||
| from pathlib import Path | from typing import Dict, Iterator, Optional, Sequence, Tuple | ||||
| from typing import Any, Dict, Iterator, Optional, Sequence, Tuple | |||||
| import attr | import attr | ||||
| from packaging.version import parse as parse_version | from packaging.version import parse as parse_version | ||||
| import yaml | |||||
| from swh.loader.package.loader import BasePackageInfo, PackageLoader | from swh.loader.package.loader import BasePackageInfo, PackageLoader | ||||
| from swh.loader.package.utils import ( | from swh.loader.package.utils import ( | ||||
| EMPTY_AUTHOR, | EMPTY_AUTHOR, | ||||
| Person, | Person, | ||||
| cached_method, | cached_method, | ||||
| get_url_body, | get_url_body, | ||||
| release_name, | release_name, | ||||
| Show All 12 Lines | class PubDevPackageInfo(BasePackageInfo): | ||||
| """Current version""" | """Current version""" | ||||
| last_modified = attr.ib(type=str) | last_modified = attr.ib(type=str) | ||||
| """Last modified date as release date""" | """Last modified date as release date""" | ||||
| author = attr.ib(type=Person) | author = attr.ib(type=Person) | ||||
| """Author""" | """Author""" | ||||
| description = attr.ib(type=str) | |||||
| """Description""" | |||||
| def extract_intrinsic_metadata(dir_path: Path) -> Dict[str, Any]: | |||||
| """Extract intrinsic metadata from pubspec.yaml file at dir_path. | |||||
| Each pub.dev package version has a pubspec.yaml file at the root of the archive. | |||||
| See https://dart.dev/tools/pub/pubspec for pubspec specifications. | |||||
| Args: | |||||
| dir_path: A directory on disk where a pubspec.yaml must be present | |||||
| Returns: | |||||
| A dict mapping from yaml parser | |||||
| """ | |||||
| pubspec_path = dir_path / "pubspec.yaml" | |||||
| return yaml.safe_load(pubspec_path.read_text()) | |||||
| class PubDevLoader(PackageLoader[PubDevPackageInfo]): | class PubDevLoader(PackageLoader[PubDevPackageInfo]): | ||||
| visit_type = "pubdev" | visit_type = "pubdev" | ||||
| PUBDEV_BASE_URL = "https://pub.dev/" | PUBDEV_BASE_URL = "https://pub.dev/" | ||||
| def __init__( | def __init__( | ||||
| self, | self, | ||||
| storage: StorageInterface, | storage: StorageInterface, | ||||
| url: str, | url: str, | ||||
| **kwargs, | **kwargs, | ||||
| ): | ): | ||||
| super().__init__(storage=storage, url=url, **kwargs) | super().__init__(storage=storage, url=url, **kwargs) | ||||
| self.url = url | self.url = url | ||||
| assert url.startswith(self.PUBDEV_BASE_URL) | assert url.startswith(self.PUBDEV_BASE_URL) | ||||
| self.package_info_url = url.replace( | self.package_info_url = url.replace( | ||||
| self.PUBDEV_BASE_URL, f"{self.PUBDEV_BASE_URL}api/" | self.PUBDEV_BASE_URL, f"{self.PUBDEV_BASE_URL}api/" | ||||
| ) | ) | ||||
| def _raw_info(self) -> bytes: | |||||
| return get_url_body(self.package_info_url) | |||||
| @cached_method | @cached_method | ||||
| def info(self) -> Dict: | def info(self) -> Dict: | ||||
| """Return the project metadata information (fetched from pub.dev registry)""" | """Return the project metadata information (fetched from pub.dev registry)""" | ||||
| # Use strict=False in order to correctly manage case where \n is present in a string | # Use strict=False in order to correctly manage case where \n is present in a string | ||||
| info = json.loads(self._raw_info(), strict=False) | info = json.loads(get_url_body(self.package_info_url), strict=False) | ||||
| # Arrange versions list as a new dict with `version` as key | # Arrange versions list as a new dict with `version` as key | ||||
| versions = {v["version"]: v for v in info["versions"]} | versions = {v["version"]: v for v in info["versions"]} | ||||
| info["versions"] = versions | info["versions"] = versions | ||||
| return info | return info | ||||
| def get_versions(self) -> Sequence[str]: | def get_versions(self) -> Sequence[str]: | ||||
| """Get all released versions of a PubDev package | """Get all released versions of a PubDev package | ||||
| Show All 27 Lines | def get_package_info(self, version: str) -> Iterator[Tuple[str, PubDevPackageInfo]]: | ||||
| Package info comes from extrinsic metadata (from self.info()) | Package info comes from extrinsic metadata (from self.info()) | ||||
| Args: | Args: | ||||
| version: Package version (e.g: "0.1.0") | version: Package version (e.g: "0.1.0") | ||||
| Returns: | Returns: | ||||
| Iterator of tuple (release_name, p_info) | Iterator of tuple (release_name, p_info) | ||||
| """ | """ | ||||
| v = self.info()["versions"][version] | v = self.info()["versions"][version] | ||||
anlambert: As we are querying the pubdev Web API for getting package info, I think we should store the… | |||||
Done Inline Actions Yep sure, will add raw_extrinsic metadata in another patch. franckbret: Yep sure, will add raw_extrinsic metadata in another patch. | |||||
| assert v["version"] == version | assert v["version"] == version | ||||
| url = v["archive_url"] | url = v["archive_url"] | ||||
| name = v["pubspec"]["name"] | name = v["pubspec"]["name"] | ||||
| filename = f"{name}-{version}.tar.gz" | filename = f"{name}-{version}.tar.gz" | ||||
| last_modified = v["published"] | last_modified = v["published"] | ||||
| checksums = {"sha256": v["archive_sha256"]} if v.get("archive_sha256") else {} | |||||
| if "authors" in v["pubspec"]: | authors = v.get("pubspec", {}).get("authors") | ||||
| if authors and isinstance(authors, list): | |||||
| # TODO: here we have a list of author, see T3887 | # TODO: here we have a list of author, see T3887 | ||||
Done Inline Actions What is the purpose of `any(authors)`? Is it possible to have empty author names in the list? vlorentz: What is the purpose of `any(authors)`? Is it possible to have empty author names in the list? | |||||
Done Inline Actions Can't remember, maybe when doing docker I met a list with an empty string. franckbret: Can't remember, maybe when doing docker I met a list with an empty string. | |||||
Not Done Inline Actions Then you shouldn't unconditionally take the first string if it may be empty. (And please add a regression test.) vlorentz: Then you shouldn't unconditionally take the first string if it may be empty.
(And please add a… | |||||
Done Inline Actions I ran a script to check the whole data set and did not find a case where authors is not a list or is a list with empty strings. DEBUG:10/11/2022 07:05:37 AM:Found 34924 packages, 298689 versions, 82619 author, 11462 authors, 204608 empty author — so using `any()` here is useless for sure. franckbret: I ran a script to check the whole data set and did not find a case where authors is not… | |||||
| author = Person.from_fullname(v["pubspec"]["authors"][0].encode()) | author = Person.from_fullname(authors[0].encode()) | ||||
| elif "author" in v["pubspec"] and v["pubspec"]["author"] is not None: | elif v.get("pubspec", {}).get("author"): | ||||
| author = Person.from_fullname(v["pubspec"]["author"].encode()) | author = Person.from_fullname(v["pubspec"]["author"].encode()) | ||||
| else: | else: | ||||
| author = EMPTY_AUTHOR | author = EMPTY_AUTHOR | ||||
| description = v["pubspec"]["description"] | |||||
| p_info = PubDevPackageInfo( | p_info = PubDevPackageInfo( | ||||
| name=name, | name=name, | ||||
| filename=filename, | filename=filename, | ||||
| url=url, | url=url, | ||||
| version=version, | version=version, | ||||
| last_modified=last_modified, | last_modified=last_modified, | ||||
| author=author, | author=author, | ||||
| description=description, | checksums=checksums, | ||||
| checksums={"sha256": v["archive_sha256"]}, | |||||
| ) | ) | ||||
| yield release_name(version), p_info | yield release_name(version), p_info | ||||
| def build_release( | def build_release( | ||||
| self, p_info: PubDevPackageInfo, uncompressed_path: str, directory: Sha1Git | self, p_info: PubDevPackageInfo, uncompressed_path: str, directory: Sha1Git | ||||
| ) -> Optional[Release]: | ) -> Optional[Release]: | ||||
| # Extract intrinsic metadata from uncompressed_path/pubspec.yaml | |||||
| intrinsic_metadata = extract_intrinsic_metadata(Path(uncompressed_path)) | |||||
| name: str = intrinsic_metadata["name"] | |||||
| version: str = intrinsic_metadata["version"] | |||||
| assert version == p_info.version | |||||
| # author from intrinsic_metadata should not take precedence over the one | |||||
| # returned by the api, see https://dart.dev/tools/pub/pubspec#authorauthors | |||||
| author: Person = p_info.author | |||||
| if "description" in intrinsic_metadata and intrinsic_metadata["description"]: | |||||
| description = intrinsic_metadata["description"] | |||||
| else: | |||||
| description = p_info.description | |||||
| message = ( | message = ( | ||||
| f"Synthetic release for pub.dev source package {name} " | f"Synthetic release for pub.dev source package {p_info.name} " | ||||
| f"version {version}\n\n" | f"version {p_info.version}\n" | ||||
| f"{description}\n" | |||||
| ) | ) | ||||
| return Release( | return Release( | ||||
| name=version.encode(), | name=p_info.version.encode(), | ||||
| author=author, | author=p_info.author, | ||||
| date=TimestampWithTimezone.from_iso8601(p_info.last_modified), | date=TimestampWithTimezone.from_iso8601(p_info.last_modified), | ||||
| message=message.encode(), | message=message.encode(), | ||||
| target_type=ObjectType.DIRECTORY, | target_type=ObjectType.DIRECTORY, | ||||
| target=directory, | target=directory, | ||||
| synthetic=True, | synthetic=True, | ||||
| ) | ) | ||||
As we are querying the pubdev Web API for getting package info, I think we should store the JSON data associated to a specific version as extrinsic metadata.
Nevertheless, this is out of scope for that diff and should be handled in a new one.