diff --git a/swh/loader/core/loader.py b/swh/loader/core/loader.py --- a/swh/loader/core/loader.py +++ b/swh/loader/core/loader.py @@ -33,28 +33,65 @@ } -class Loader: - """The base class for a Software Heritage Loader. +class BaseLoader: + """Mixin base class for (D)VCS loaders (e.g. svn, git, mercurial, ...). A loader retrieves origin information (git/mercurial/svn repositories, pypi/npm/... package artifacts), ingests the contents/directories/revisions/releases/snapshot to the storage backend. - For now, this just exposes 2 static methods (from_config, from_configfile) to - centralize and ease the loader instantiation. + The main entry point for the loader is the :func:`load` function. + + 2 class methods (:func:`from_config`, :func:`from_configfile`) centralize and ease + the loader instantiation. + + You can take a look at some example classes: - Args: - storage: the instance of the Storage being used to register the - origin information + - :class:`SvnLoader` + - :class:`GitLoader` + - :class:`PypiLoader` """ def __init__( - self, storage: StorageInterface, max_content_size: Optional[int] = None, + self, + storage: StorageInterface, + logging_class: Optional[str] = None, + save_data_path: Optional[str] = None, + max_content_size: Optional[int] = None, ): + super().__init__() self.storage = storage self.max_content_size = int(max_content_size) if max_content_size else None + if logging_class is None: + logging_class = "%s.%s" % ( + self.__class__.__module__, + self.__class__.__name__, + ) + self.log = logging.getLogger(logging_class) + + _log = logging.getLogger("requests.packages.urllib3.connectionpool") + _log.setLevel(logging.WARN) + + # possibly overridden in self.prepare method + self.visit_date: Optional[datetime.datetime] = None + self.origin: Optional[Origin] = None + + if not hasattr(self, "visit_type"): + self.visit_type: Optional[str] = None + + self.origin_metadata: Dict[str, Any] = {} + self.loaded_snapshot_id: Optional[Sha1Git] = None + + if 
save_data_path: + path = save_data_path + os.stat(path) + if not os.access(path, os.R_OK | os.W_OK): + raise PermissionError("Permission denied: %r" % path) + + self.save_data_path = save_data_path + @classmethod def from_config(cls, storage: Dict[str, Any], **config: Any): """Instantiate a loader from a configuration dict. @@ -92,79 +129,6 @@ config.update({k: v for k, v in kwargs.items() if v is not None}) return cls.from_config(**config) - -class BaseLoader(Loader): - """Mixin base class for (D)VCS loaders (e.g svn, git, mercurial, ...). - - To define such loaders, you must: - - - inherit from this class - - - and implement following methods: - - - :func:`prepare`: First step executed by the loader to prepare some - state needed by the `func`:load method. - - - :func:`get_origin`: Retrieve the origin that is currently being loaded. - - - :func:`fetch_data`: Fetch the data is actually the method to implement - to compute data to inject in swh (through the store_data method) - - - :func:`store_data`: Store data fetched. - - - :func:`visit_status`: Explicit status of the visit ('partial' or - 'full') - - - :func:`load_status`: Explicit status of the loading, for use by the - scheduler (eventful/uneventful/temporary failure/permanent failure). - - - :func:`cleanup`: Last step executed by the loader. - - The entry point for the resulting loader is :func:`load`. 
- - You can take a look at some example classes: - - - :class:`SvnLoader` - - """ - - def __init__( - self, - storage: StorageInterface, - logging_class: Optional[str] = None, - save_data_path: Optional[str] = None, - max_content_size: Optional[int] = None, - ): - super().__init__(storage=storage, max_content_size=max_content_size) - - if logging_class is None: - logging_class = "%s.%s" % ( - self.__class__.__module__, - self.__class__.__name__, - ) - self.log = logging.getLogger(logging_class) - - _log = logging.getLogger("requests.packages.urllib3.connectionpool") - _log.setLevel(logging.WARN) - - # possibly overridden in self.prepare method - self.visit_date: Optional[datetime.datetime] = None - self.origin: Optional[Origin] = None - - if not hasattr(self, "visit_type"): - self.visit_type: Optional[str] = None - - self.origin_metadata: Dict[str, Any] = {} - self.loaded_snapshot_id: Optional[Sha1Git] = None - - if save_data_path: - path = save_data_path - os.stat(path) - if not os.access(path, os.R_OK | os.W_OK): - raise PermissionError("Permission denied: %r" % path) - - self.save_data_path = save_data_path - def save_data(self) -> None: """Save the data associated to the current load""" raise NotImplementedError @@ -202,7 +166,7 @@ """ raise NotImplementedError - def prepare_origin_visit(self, *args, **kwargs) -> None: + def prepare_origin_visit(self) -> None: """First step executed by the loader to prepare origin and visit references. Set/update self.origin, and optionally self.origin_url, self.visit_date. @@ -233,7 +197,7 @@ ) )[0] - def prepare(self, *args, **kwargs) -> None: + def prepare(self) -> None: """Second step executed by the loader to prepare some state needed by the loader. @@ -323,7 +287,7 @@ """ pass - def load(self, *args, **kwargs) -> Dict[str, str]: + def load(self) -> Dict[str, str]: r"""Loading logic for the loader to follow: - 1. Call :meth:`prepare_origin_visit` to prepare the @@ -347,7 +311,7 @@ msg = "Cleaning up dangling data failed! 
Continue loading." self.log.warning(msg) - self.prepare_origin_visit(*args, **kwargs) + self.prepare_origin_visit() self._store_origin_visit() assert ( @@ -361,7 +325,7 @@ ) try: - self.prepare(*args, **kwargs) + self.prepare() while True: more_data_to_fetch = self.fetch_data() @@ -391,7 +355,12 @@ self.log.exception( "Loading failure, updating to `%s` status", status, - extra={"swh_task_args": args, "swh_task_kwargs": kwargs,}, + extra={ + "swh_task_args": [], + "swh_task_kwargs": { + "origin": self.origin.url + }, + }, ) visit_status = OriginVisitStatus( origin=self.origin.url, diff --git a/swh/loader/package/loader.py b/swh/loader/package/loader.py --- a/swh/loader/package/loader.py +++ b/swh/loader/package/loader.py @@ -28,7 +28,7 @@ import sentry_sdk from swh.core.tarball import uncompress -from swh.loader.core.loader import Loader +from swh.loader.core.loader import BaseLoader from swh.loader.exception import NotFound from swh.loader.package.utils import download from swh.model import from_disk @@ -117,9 +117,10 @@ TPackageInfo = TypeVar("TPackageInfo", bound=BasePackageInfo) -class PackageLoader(Loader, Generic[TPackageInfo]): +class PackageLoader(BaseLoader, Generic[TPackageInfo]): # Origin visit type (str) set by the loader visit_type = "" + visit_date: datetime.datetime def __init__( self,