diff --git a/swh/loader/cli.py b/swh/loader/cli.py --- a/swh/loader/cli.py +++ b/swh/loader/cli.py @@ -93,5 +93,11 @@ registry_entry = LOADERS[type].load()() loader_cls = registry_entry["loader"] doc = inspect.getdoc(loader_cls).strip() - signature = inspect.signature(loader_cls) - click.echo(f"Loader: {doc}\nsignature: {signature}") + + # Hack to get the signature of the class even though it subclasses + # Generic, which reimplements __new__. + # See + signature = inspect.signature(loader_cls.__init__) + signature_str = str(signature).replace("self, ", "") + + click.echo(f"Loader: {doc}\nsignature: {signature_str}") diff --git a/swh/loader/package/archive/loader.py b/swh/loader/package/archive/loader.py --- a/swh/loader/package/archive/loader.py +++ b/swh/loader/package/archive/loader.py @@ -3,13 +3,14 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import iso8601 import logging - from os import path -from typing import Any, Dict, Generator, Mapping, Optional, Sequence, Tuple +from typing import Any, Dict, Iterator, Mapping, Optional, Sequence, Tuple + +import attr +import iso8601 -from swh.loader.package.loader import PackageLoader +from swh.loader.package.loader import PackageLoader, BasePackageInfo from swh.loader.package.utils import release_name, artifact_identity from swh.model.model import ( Sha1Git, @@ -29,7 +30,11 @@ REVISION_MESSAGE = b"swh-loader-package: synthetic revision message" -class ArchiveLoader(PackageLoader): +class ArchivePackageInfo(BasePackageInfo): + raw = attr.ib(type=Dict[str, Any]) + + +class ArchiveLoader(PackageLoader[ArchivePackageInfo]): """Load archive origin's artifact files into swh archive """ @@ -84,17 +89,17 @@ def get_package_info( self, version: str - ) -> Generator[Tuple[str, Mapping[str, Any]], None, None]: + ) -> Iterator[Tuple[str, ArchivePackageInfo]]: for a_metadata in self.artifacts: url = a_metadata["url"] package_version = a_metadata["version"] if version == package_version: filename = a_metadata.get("filename") - p_info = { - "url": url, - "filename": filename if filename else path.split(url)[-1], - "raw": a_metadata, - } + p_info = ArchivePackageInfo( + url=url, + filename=filename if filename else path.split(url)[-1], + raw=a_metadata, + ) # FIXME: this code assumes we have only 1 artifact per # versioned package yield release_name(version), p_info diff --git a/swh/loader/package/cran/loader.py b/swh/loader/package/cran/loader.py --- a/swh/loader/package/cran/loader.py +++ b/swh/loader/package/cran/loader.py @@ -5,17 +5,17 @@ import dateutil.parser import datetime +from datetime import timezone import os +from os import path import logging import re +from typing import Any, Dict, Iterator, List, Mapping, Optional, Tuple -from datetime import timezone -from os import path -from typing import Any, Generator, Dict, List, Mapping, Optional, Tuple - +import attr from debian.deb822 import Deb822 -from swh.loader.package.loader import PackageLoader +from swh.loader.package.loader import BasePackageInfo, PackageLoader from swh.loader.package.utils import release_name, artifact_identity from swh.model.model import ( Person, @@ -32,7 +32,11 @@ DATE_PATTERN = re.compile(r"^(?P\d{4})-(?P\d{2})$") -class CRANLoader(PackageLoader): +class CRANPackageInfo(BasePackageInfo): + raw = attr.ib(type=Dict[str, Any]) + + +class CRANLoader(PackageLoader[CRANPackageInfo]): visit_type = "cran" def __init__(self, url: str, artifacts: List[Dict]): @@ -57,18 +61,14 @@ def get_default_version(self) -> str: return self.artifacts[-1]["version"] - def get_package_info( - self, version: str - ) -> Generator[Tuple[str, Dict[str, Any]], None, None]: + def get_package_info(self, version: str) -> Iterator[Tuple[str, CRANPackageInfo]]: for a_metadata in self.artifacts: url = a_metadata["url"] package_version = a_metadata["version"] if version == package_version: - p_info = { - "url": url, - "filename": path.basename(url), - "raw": a_metadata, - } + p_info = CRANPackageInfo( + url=url, filename=path.basename(url), raw=a_metadata, + ) yield release_name(version), p_info def resolve_revision_from( diff --git a/swh/loader/package/debian/loader.py b/swh/loader/package/debian/loader.py --- a/swh/loader/package/debian/loader.py +++ b/swh/loader/package/debian/loader.py @@ -8,13 +8,14 @@ from os import path import re import subprocess +from typing import Any, Dict, Iterator, List, Mapping, Optional, Sequence, Tuple +import attr from dateutil.parser import parse as parse_date from debian.changelog import Changelog from debian.deb822 import Dsc -from typing import Any, Generator, List, Mapping, Optional, Sequence, Tuple -from swh.loader.package.loader import PackageLoader +from swh.loader.package.loader import BasePackageInfo, PackageLoader from swh.loader.package.utils import download, release_name from swh.model.model import ( Sha1Git, @@ -29,7 +30,11 @@ UPLOADERS_SPLIT = re.compile(r"(?<=\>)\s*,\s*") -class DebianLoader(PackageLoader): +class DebianPackageInfo(BasePackageInfo): + raw = attr.ib(type=Dict[str, Any]) + + +class DebianLoader(PackageLoader[DebianPackageInfo]): """Load debian origins into swh archive. """ @@ -86,12 +91,9 @@ """ return list(self.packages.keys()) - def get_package_info( - self, version: str - ) -> Generator[Tuple[str, Mapping[str, Any]], None, None]: + def get_package_info(self, version: str) -> Iterator[Tuple[str, DebianPackageInfo]]: meta = self.packages[version] - p_info = meta.copy() - p_info["raw"] = meta + p_info = DebianPackageInfo(url=self.url, filename=None, raw=meta,) yield release_name(version), p_info def resolve_revision_from( @@ -100,10 +102,10 @@ return resolve_revision_from(known_package_artifacts, artifact_metadata) def download_package( - self, p_info: Mapping[str, Any], tmpdir: str + self, p_info: DebianPackageInfo, tmpdir: str ) -> List[Tuple[str, Mapping]]: """Contrary to other package loaders (1 package, 1 artifact), - `a_metadata` represents the package's datafiles set to fetch: + `p_info.files` represents the package's datafiles set to fetch: - .orig.tar.gz - .dsc - .diff.gz @@ -111,7 +113,7 @@ This is delegated to the `download_package` function. """ - all_hashes = download_package(p_info, tmpdir) + all_hashes = download_package(p_info.raw, tmpdir) logger.debug("all_hashes: %s", all_hashes) res = [] for hashes in all_hashes.values(): diff --git a/swh/loader/package/deposit/loader.py b/swh/loader/package/deposit/loader.py --- a/swh/loader/package/deposit/loader.py +++ b/swh/loader/package/deposit/loader.py @@ -6,9 +6,10 @@ import json import logging import requests +from typing import Any, Dict, Iterator, List, Mapping, Optional, Sequence, Tuple, Union import types -from typing import Any, Dict, Generator, List, Mapping, Optional, Sequence, Tuple, Union +import attr from swh.model.hashutil import hash_to_hex, hash_to_bytes from swh.model.model import ( @@ -23,14 +24,19 @@ MetadataTargetType, RawExtrinsicMetadata, ) -from swh.loader.package.loader import PackageLoader +from swh.loader.package.loader import PackageLoader, BasePackageInfo from swh.loader.package.utils import download logger = logging.getLogger(__name__) -class DepositLoader(PackageLoader): +class DepositPackageInfo(BasePackageInfo): + filename = attr.ib(type=str) # instead of Optional[str] + raw = attr.ib(type=Dict[str, Any]) + + +class DepositLoader(PackageLoader[DepositPackageInfo]): """Load pypi origin's artifact releases into swh archive. """ @@ -59,20 +65,19 @@ def get_package_info( self, version: str - ) -> Generator[Tuple[str, Mapping[str, Any]], None, None]: - p_info = { - "filename": "archive.zip", - "raw": self.metadata, - } + ) -> Iterator[Tuple[str, DepositPackageInfo]]: + p_info = DepositPackageInfo( + url=self.url, filename="archive.zip", raw=self.metadata, + ) yield "HEAD", p_info def download_package( - self, p_info: Mapping[str, Any], tmpdir: str + self, p_info: DepositPackageInfo, tmpdir: str ) -> List[Tuple[str, Mapping]]: """Override to allow use of the dedicated deposit client """ - return [self.client.archive_get(self.deposit_id, tmpdir, p_info["filename"])] + return [self.client.archive_get(self.deposit_id, tmpdir, p_info.filename)] def build_revision( self, a_metadata: Dict, uncompressed_path: str, directory: Sha1Git @@ -109,7 +114,7 @@ "extrinsic": { "provider": self.client.metadata_url(self.deposit_id), "when": self.visit_date.isoformat(), - "raw": a_metadata, + "raw": a_metadata, # Actually the processed metadata instead of raw }, }, ) diff --git a/swh/loader/package/loader.py b/swh/loader/package/loader.py --- a/swh/loader/package/loader.py +++ b/swh/loader/package/loader.py @@ -8,7 +8,18 @@ import tempfile import os -from typing import Any, Dict, Generator, List, Mapping, Optional, Sequence, Tuple +from typing import ( + Any, + Dict, + Iterator, + Generic, + List, + Mapping, + Optional, + Sequence, + Tuple, + TypeVar, +) import attr import sentry_sdk @@ -38,7 +49,17 @@ logger = logging.getLogger(__name__) -class PackageLoader: +@attr.s +class BasePackageInfo: + url = attr.ib(type=str) + filename = attr.ib(type=Optional[str]) + raw = attr.ib(type=Any) + + +TPackageInfo = TypeVar("TPackageInfo", bound=BasePackageInfo) + + +class PackageLoader(Generic[TPackageInfo]): # Origin visit type (str) set by the loader visit_type = "" @@ -77,9 +98,7 @@ """ return [] - def get_package_info( - self, version: str - ) -> Generator[Tuple[str, Mapping[str, Any]], None, None]: + def get_package_info(self, version: str) -> Iterator[Tuple[str, TPackageInfo]]: """Given a release version of a package, retrieve the associated package information for such version. @@ -170,7 +189,7 @@ return None def download_package( - self, p_info: Mapping[str, Any], tmpdir: str + self, p_info: TPackageInfo, tmpdir: str ) -> List[Tuple[str, Mapping]]: """Download artifacts for a specific package. All downloads happen in in the tmpdir folder. @@ -191,9 +210,7 @@ List of (path, computed hashes) """ - a_uri = p_info["url"] - filename = p_info.get("filename") - return [download(a_uri, dest=tmpdir, filename=filename)] + return [download(p_info.url, dest=tmpdir, filename=p_info.filename)] def uncompress( self, dl_artifacts: List[Tuple[str, Mapping[str, Any]]], dest: str @@ -328,7 +345,7 @@ # `p_` stands for `package_` for branch_name, p_info in self.get_package_info(version): logger.debug("package_info: %s", p_info) - revision_id = self.resolve_revision_from(known_artifacts, p_info["raw"]) + revision_id = self.resolve_revision_from(known_artifacts, p_info.raw) if revision_id is None: try: revision_id = self._load_revision(p_info, origin) @@ -377,7 +394,7 @@ return finalize_visit() - def _load_revision(self, p_info, origin) -> Optional[Sha1Git]: + def _load_revision(self, p_info: TPackageInfo, origin) -> Optional[Sha1Git]: """Does all the loading of a revision itself: * downloads a package and uncompresses it @@ -414,7 +431,7 @@ # FIXME: This should be release. cf. D409 revision = self.build_revision( - p_info["raw"], uncompressed_path, directory=directory.hash + p_info.raw, uncompressed_path, directory=directory.hash ) if not revision: # Some artifacts are missing intrinsic metadata diff --git a/swh/loader/package/nixguix/loader.py b/swh/loader/package/nixguix/loader.py --- a/swh/loader/package/nixguix/loader.py +++ b/swh/loader/package/nixguix/loader.py @@ -6,8 +6,9 @@ import json import logging import requests +from typing import Any, Dict, Iterator, Mapping, Optional, Tuple -from typing import Dict, Optional, Any, Mapping +import attr from swh.model import hashutil from swh.model.model import ( @@ -20,13 +21,17 @@ ) from swh.loader.package.utils import EMPTY_AUTHOR -from swh.loader.package.loader import PackageLoader +from swh.loader.package.loader import BasePackageInfo, PackageLoader logger = logging.getLogger(__name__) -class NixGuixLoader(PackageLoader): +class NixGuixPackageInfo(BasePackageInfo): + raw = attr.ib(type=Dict[str, Any]) + + +class NixGuixLoader(PackageLoader[NixGuixPackageInfo]): """Load sources from a sources.json file. This loader is used to load sources used by functional package manager (eg. Nix and Guix). @@ -59,13 +64,16 @@ # Note: this could be renamed get_artifact_info in the PackageLoader # base class. - def get_package_info(self, url): + def get_package_info(self, url) -> Iterator[Tuple[str, NixGuixPackageInfo]]: # TODO: try all mirrors and not only the first one. A source # can be fetched from several urls, called mirrors. We # currently only use the first one, but if the first one # fails, we should try the second one and so on. integrity = self._integrityByUrl[url] - yield url, {"url": url, "raw": {"url": url, "integrity": integrity}} + p_info = NixGuixPackageInfo( + url=url, filename=None, raw={"url": url, "integrity": integrity}, + ) + yield url, p_info def known_artifacts(self, snapshot: Optional[Snapshot]) -> Dict[Sha1Git, BaseModel]: """Almost same implementation as the default one except it filters out the extra diff --git a/swh/loader/package/npm/loader.py b/swh/loader/package/npm/loader.py --- a/swh/loader/package/npm/loader.py +++ b/swh/loader/package/npm/loader.py @@ -8,7 +8,7 @@ import os from codecs import BOM_UTF8 -from typing import Any, Dict, Generator, List, Mapping, Optional, Sequence, Tuple, Union +from typing import Any, Dict, Iterator, List, Optional, Sequence, Tuple, Union import attr import chardet @@ -22,7 +22,7 @@ Sha1Git, ) -from swh.loader.package.loader import PackageLoader +from swh.loader.package.loader import BasePackageInfo, PackageLoader from swh.loader.package.utils import api_info, release_name @@ -32,7 +32,11 @@ EMPTY_PERSON = Person(fullname=b"", name=None, email=None) -class NpmLoader(PackageLoader): +class NpmPackageInfo(BasePackageInfo): + raw = attr.ib(type=Dict[str, Any]) + + +class NpmLoader(PackageLoader[NpmPackageInfo]): """Load npm origin's artifact releases into swh archive. """ @@ -67,16 +71,10 @@ def get_default_version(self) -> str: return self.info["dist-tags"].get("latest", "") - def get_package_info( - self, version: str - ) -> Generator[Tuple[str, Mapping[str, Any]], None, None]: + def get_package_info(self, version: str) -> Iterator[Tuple[str, NpmPackageInfo]]: meta = self.info["versions"][version] url = meta["dist"]["tarball"] - p_info = { - "url": url, - "filename": os.path.basename(url), - "raw": meta, - } + p_info = NpmPackageInfo(url=url, filename=os.path.basename(url), raw=meta,) yield release_name(version), p_info def resolve_revision_from( diff --git a/swh/loader/package/pypi/loader.py b/swh/loader/package/pypi/loader.py --- a/swh/loader/package/pypi/loader.py +++ b/swh/loader/package/pypi/loader.py @@ -5,9 +5,10 @@ import os import logging - -from typing import Any, Dict, Generator, Mapping, Optional, Sequence, Tuple +from typing import Any, Dict, Iterator, Optional, Sequence, Tuple from urllib.parse import urlparse + +import attr from pkginfo import UnpackedSDist from swh.model.model import ( @@ -18,13 +19,17 @@ RevisionType, ) -from swh.loader.package.loader import PackageLoader +from swh.loader.package.loader import BasePackageInfo, PackageLoader from swh.loader.package.utils import api_info, release_name, EMPTY_AUTHOR logger = logging.getLogger(__name__) -class PyPILoader(PackageLoader): +class PyPIPackageInfo(BasePackageInfo): + raw = attr.ib(type=Dict[str, Any]) + + +class PyPILoader(PackageLoader[PyPIPackageInfo]): """Load pypi origin's artifact releases into swh archive. """ @@ -51,19 +56,13 @@ def get_default_version(self) -> str: return self.info["info"]["version"] - def get_package_info( - self, version: str - ) -> Generator[Tuple[str, Mapping[str, Any]], None, None]: + def get_package_info(self, version: str) -> Iterator[Tuple[str, PyPIPackageInfo]]: res = [] for meta in self.info["releases"][version]: if meta["packagetype"] != "sdist": continue filename = meta["filename"] - p_info = { - "url": meta["url"], - "filename": filename, - "raw": meta, - } + p_info = PyPIPackageInfo(url=meta["url"], filename=filename, raw=meta,) res.append((version, p_info)) if len(res) == 1: @@ -71,7 +70,7 @@ yield release_name(version), p_info else: for version, p_info in res: - yield release_name(version, p_info["filename"]), p_info + yield release_name(version, p_info.filename), p_info def resolve_revision_from( self, known_artifacts: Dict, artifact_metadata: Dict