Page MenuHomeSoftware Heritage

D1694.diff
No OneTemporary

D1694.diff

diff --git a/swh/loader/package/loader.py b/swh/loader/package/loader.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/package/loader.py
@@ -0,0 +1,149 @@
+
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import os
+
+from abc import abstractmethod
+from swh.loader.core.loader import BufferedLoader
+from swh.loader.core.utils import clean_dangling_folders
+from tempfile import mkdtemp
+
# Tag placed at the start of the warning that pre_cleanup() logs when the
# loader runs in debug mode and skips cleaning its temporary directories.
DEBUG_MODE = '** DEBUG MODE **'
+
+
class PackageLoader(BufferedLoader):
    """Base class for package manager loaders.

    A loader is a component of the Software Heritage architecture responsible
    for reading a source code origin and adding new file contents to the
    object storage and the repository structure to the storage database.

    The task of a loader is broadly similar for all package managers: it
    queries an API to get metadata, retrieves the package source code and
    ingests it into the archive.

    The steps involved in ingesting the package source code are meant to be
    automated by this class, which eases the creation of a new loader.

    Querying the API and obtaining the metadata for a package is done
    separately by each concrete loader by overriding :func:`fetch_metadata`,
    which returns all the information about the package in a standard
    format. The remaining steps (downloading, decompressing, creating and
    loading snapshots) are meant to be handled by this class.

    Required Overrides:
        loader_name
        class_name
        def fetch_metadata

    """

    loader_name = None
    """Package manager name"""  # e.g. pypi
    class_name = None
    """Loader class name"""  # e.g. PyPILoader

    def __init__(self):
        """Set up logging, instance state and a private temporary directory.

        Reads ``temp_directory`` (required) and ``debug`` (optional, defaults
        to ``False``) from ``self.config``.

        """
        # Prefix shared by every temporary directory created by this loader;
        # pre_cleanup() hands the same pattern to clean_dangling_folders().
        self.TEMPORARY_DIR_PREFIX_PATTERN = 'swh.loader.%s.' % self.loader_name

        super().__init__(
            logging_class='swh.loader.%s.%s' % (self.loader_name,
                                                self.class_name))

        self.local_cache = None
        self.dir_path = None

        # NOTE(review): self.config is presumably populated by
        # BufferedLoader.__init__ -- confirm against the base class.
        temp_directory = self.config['temp_directory']
        os.makedirs(temp_directory, exist_ok=True)

        # The pid suffix makes it possible to tell which process created a
        # given directory.
        self.temp_directory = mkdtemp(
            suffix='-%s' % os.getpid(),
            prefix=self.TEMPORARY_DIR_PREFIX_PATTERN,
            dir=temp_directory)

        self.debug = self.config.get('debug', False)

    # NOTE(review): @abstractmethod is only enforced at instantiation time
    # when the class hierarchy uses abc.ABCMeta -- confirm BufferedLoader
    # does.
    @abstractmethod
    def fetch_metadata(self, kwargs):
        """Fetch the metadata and convert it into a standard format.

        This method serves two purposes:

        * Make the API call to get the package versions and metadata (if
          needed)
        * Convert the information received from the lister and the API call
          to a standard format

        The standard format is a dict with keys:

        `name` (str): Name of the package
        `origin_url` (str): Origin url of the package
        `tarballs` (list): A list of dicts where each dict contains
            information related to a single version of the package. The
            `url` key in each dict is mandatory and holds the tarball url.
            Other keys are optional and depend on the metadata available.

        Note: Keys `nature` and `response` are reserved and cannot be used
        in the dicts listed under key `tarballs`.

        Args:
            kwargs (dict): Dict of arbitrary keyword arguments passed by
                the lister.

        Returns:
            dict: Information about the package, following the format
            described above.

        Example::

            {
                'name': '8sync',
                'origin_url': 'https://ftp.gnu.org/gnu/8sync/',
                'tarballs': [
                    {'url': 'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz',
                     'time_modified': 1562878592},
                    {'url': 'https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz',
                     'time_modified': 1599887203},
                    ...
                ]
            }

        """

    def prepare_origin_visit(self, *args, **kwargs):
        """Prepare package visit.

        Resets the load and visit statuses, fetches the package metadata
        through :func:`fetch_metadata` and sets ``self.origin`` from it.

        Args:
            *args: Ignored.
            **kwargs: Arbitrary keyword arguments passed by the lister; they
                are forwarded to :func:`fetch_metadata` as a single dict.

        """
        # reset statuses
        self._load_status = 'uneventful'
        self._visit_status = 'full'
        self.done = False
        # fetch the package metadata from the registry
        self.package_details = self.fetch_metadata(kwargs)
        self.set_origin()
        self.visit_date = None  # loader core will populate it

    def set_origin(self):
        """Assign value to self.origin.

        The origin url is read from ``self.package_details['origin_url']``
        and the origin type is the loader name.

        """
        self.origin = {
            'url': self.package_details['origin_url'],
            'type': self.loader_name,
        }

    def pre_cleanup(self):
        """To prevent disk explosion if some other workers exploded
        in mid-air (OOM killed), we try and clean up dangling files.

        In debug mode nothing is removed and a warning is logged instead.

        """
        if self.debug:
            # Logger.warn() is a deprecated alias of warning(); the arguments
            # are passed lazily and yield the same message text as before.
            self.log.warning('%s Will not pre-clean up temp dir %s',
                             DEBUG_MODE, self.temp_directory)
            return
        clean_dangling_folders(self.config['temp_directory'],
                               pattern_check=self.TEMPORARY_DIR_PREFIX_PATTERN,
                               log=self.log)

File Metadata

Mime Type
text/plain
Expires
Thu, Jan 23, 2:46 AM (20 h, 12 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3217541

Event Timeline