# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import os
from abc import abstractmethod
from tempfile import mkdtemp

from swh.loader.core.loader import BufferedLoader
from swh.loader.core.utils import clean_dangling_folders

# Prefix used in log messages emitted when the loader runs in debug mode.
DEBUG_MODE = '** DEBUG MODE **'


class PackageLoader(BufferedLoader):
    """Package loader class for package manager loader

    A loader is a component of the Software Heritage architecture responsible
    for reading a source code origin and adding new file contents in the
    object storage and repository structure in the storage database.

    The task of a loader for package managers is somewhat similar for all
    of them, notably it includes querying of an API to get metadata and
    retrieval of the package source code and its ingestion into the archive.

    The steps involved in the ingestion of the package source code are
    automated by this class, which eases the process of creating a new
    loader.

    API querying and obtaining the metadata for a package is done separately
    for each new loader by overriding the :func:`fetch_metadata` function.
    It returns all the information about the package in a specific format,
    after which all the processes involving downloading, decompressing,
    creating and loading snapshots are automated by this class.

    Required Overrides:
        loader_name
        class_name
        def fetch_metadata

    NOTE(review): ``@abstractmethod`` only prevents instantiation when the
    class hierarchy uses ``abc.ABCMeta`` -- confirm that ``BufferedLoader``
    provides it, otherwise the override is required by convention only.

    """

    loader_name = None
    """Package manager name"""  # e.g pypi
    class_name = None
    """Loader class name"""  # eg PyPILoader

    def __init__(self):
        """Set up logging and create a per-process temporary directory.

        ``loader_name`` and ``class_name`` must be set by the subclass, as
        they are interpolated into the temporary directory prefix and the
        logging class name.

        """
        # The prefix doubles as the pattern `pre_cleanup` uses to find
        # dangling directories left by previous runs.
        self.TEMPORARY_DIR_PREFIX_PATTERN = 'swh.loader.%s.' % self.loader_name

        super().__init__(logging_class='swh.loader.%s.%s' % (
            self.loader_name, self.class_name))

        self.local_cache = None
        self.dir_path = None

        # `self.config` is not assigned in this class; presumably it is
        # populated by BufferedLoader.__init__ -- TODO confirm.
        temp_directory = self.config['temp_directory']
        os.makedirs(temp_directory, exist_ok=True)

        # The pid suffix identifies which process created the directory.
        self.temp_directory = mkdtemp(
            suffix='-%s' % os.getpid(),
            prefix=self.TEMPORARY_DIR_PREFIX_PATTERN,
            dir=temp_directory)

        self.debug = self.config.get('debug', False)

    @abstractmethod
    def fetch_metadata(self, kwargs):
        """Fetch the metadata and convert it into a standard format

        This method serves two purposes:
            * Make API call to get the package versions and metadata (if
              needed)
            * Convert the information received from lister and API call to a
              standard format

        The standard format is a dict with keys
            `name` (str): Holding name of the package
            `origin_url` (str): Holding the origin_url of the package
            `tarballs` (list): A list of dicts where each dict contains
                information related to a single version of the
                package. The `url` key in the dict is necessary and will
                hold tarball url. Other keys are optional and as per
                availability of metadata.

        Note: Keys `nature` and `response` are reserved keywords and cannot be
              used in the dicts present in list under key `tarballs`

        Args:
            kwargs (dict): Dict of arbitrary keyword arguments passed by
                the lister.

        Returns:
            dict: Containing information as directed by the guidelines
                mentioned above

        Example:
            {
                name:'8sync',
                origin_url:'https://ftp.gnu.org/gnu/8sync/',
                tarballs:[{url: 'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz',
                           time_modified: 1562878592 },
                          {url: 'https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz',
                           time_modified: 1599887203 },
                          ...
                          ]
            }

        """
        pass

    def prepare_origin_visit(self, *args, **kwargs):
        """Prepare package visit.

        Resets the load/visit statuses, fetches the package metadata through
        :func:`fetch_metadata` and derives the origin from it.

        Args:
            *args: Accepted but not used by this method.
            **kwargs: Arbitrary keyword arguments passed by the lister; they
                are forwarded to :func:`fetch_metadata` as a single dict.

        """
        # reset statuses
        self._load_status = 'uneventful'
        self._visit_status = 'full'
        self.done = False
        # fetch the package metadata from the registry
        self.package_details = self.fetch_metadata(kwargs)
        self.set_origin()
        self.visit_date = None  # loader core will populate it

    def set_origin(self):
        """Assign value to self.origin.

        The origin url is read from ``self.package_details['origin_url']``
        and the origin type is the loader name.

        """
        self.origin = {
            'url': self.package_details['origin_url'],
            'type': self.loader_name,
        }

    def pre_cleanup(self):
        """To prevent disk explosion if some other workers exploded
        in mid-air (OOM killed), we try and clean up dangling files.

        In debug mode nothing is removed; a warning is logged instead.

        """
        if self.debug:
            # `Logger.warn` is a deprecated alias; use `warning` with lazy
            # %-style arguments (the emitted message is unchanged).
            self.log.warning('%s Will not pre-clean up temp dir %s',
                             DEBUG_MODE, self.temp_directory)
            return
        clean_dangling_folders(self.config['temp_directory'],
                               pattern_check=self.TEMPORARY_DIR_PREFIX_PATTERN,
                               log=self.log)