Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7147894
D1694.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
5 KB
Subscribers
None
D1694.diff
View Options
diff --git a/swh/loader/package/loader.py b/swh/loader/package/loader.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/package/loader.py
@@ -0,0 +1,149 @@
+
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import os
+
+from abc import abstractmethod
+from swh.loader.core.loader import BufferedLoader
+from swh.loader.core.utils import clean_dangling_folders
+from tempfile import mkdtemp
+
+DEBUG_MODE = '** DEBUG MODE **'
+
+
+class PackageLoader(BufferedLoader):
+ """Package loader class for package manager loader
+
+ A loader is a component of the Software Heritage architecture responsible
+ for reading a source code origin and adding new file contents in the object
+ storage and repository structure in the storage database.
+
+ The task of a loader for package managers is broadly similar for all
+ of them: notably, it includes querying an API to get metadata,
+ retrieving the package source code, and ingesting it into the archive.
+
+ The steps involved in ingesting the package source code are automated
+ by this class, which eases the process of creating a new loader.
+
+ API querying and obtaining the metadata for a package are done separately
+ for each new loader by overriding the :func:`fetch_metadata`
+ function. It returns all the information about the package in a
+ specific format, after which the whole process of downloading,
+ decompressing, creating and loading snapshots is automated by this class.
+
+ Required Overrides:
+ loader_name
+ class_name
+ def fetch_metadata
+
+ """
+
+ loader_name = None
+ """Package manager name""" # e.g pypi
+ class_name = None
+ """Loader class name""" # eg PyPILoader
+
+ def __init__(self):
+ self.TEMPORARY_DIR_PREFIX_PATTERN = 'swh.loader.%s.' % self.loader_name
+
+ super().__init__(logging_class='swh.loader.%s.%s' % (self.loader_name,
+ self.class_name))
+
+ self.local_cache = None
+ self.dir_path = None
+
+ temp_directory = self.config['temp_directory']
+ os.makedirs(temp_directory, exist_ok=True)
+
+ self.temp_directory = mkdtemp(
+ suffix='-%s' % os.getpid(),
+ prefix=self.TEMPORARY_DIR_PREFIX_PATTERN,
+ dir=temp_directory)
+
+ self.debug = self.config.get('debug', False)
+
+ @abstractmethod
+ def fetch_metadata(self, kwargs):
+ """Fetch the metadata and convert it into a standard format
+
+ This method serves two purposes:
+ * Make an API call to get the package versions and metadata (if needed)
+ * Convert the information received from the lister and API call to a
+ standard format
+
+ The standard format is a dict with keys
+ `name` (str): Holding name of the package
+ `origin_url` (str): Holding the origin_url of the package
+ `tarballs` (list): A list of dicts where each dict contains
+ information related to a single version of the
+ package. The `url` key in the dict is necessary and will
+ hold tarball url. Other keys are optional and as per
+ availability of metadata.
+
+ Note: Keys `nature` and `response` are reserved keywords and cannot be
+ used in the dicts present in list under key `tarballs`
+
+ Args:
+ kwargs (dict): Dict of arbitrary keyword arguments passed by
+ the lister.
+
+ Returns:
+ dict: Containing information as directed by the guidelines
+ mentioned above
+
+ Example:
+ {
+ name:'8sync',
+ origin_url:'https://ftp.gnu.org/gnu/8sync/',
+ tarballs:[{url: 'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz',
+ time_modified: 1562878592 },
+ {url: 'https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz',
+ time_modified: 1599887203 },
+ ...
+ ]
+ }
+
+ """
+ pass
+
+ def prepare_origin_visit(self, *args, **kwargs):
+ """Prepare package visit.
+
+ Args:
+ **kwargs: Arbitrary keyword arguments passed by the lister.
+
+ """
+ # reset statuses
+ self._load_status = 'uneventful'
+ self._visit_status = 'full'
+ self.done = False
+ # fetch the package metadata from the registry
+ self.package_details = self.fetch_metadata(kwargs)
+ self.set_origin()
+ self.visit_date = None # loader core will populate it
+
+ def set_origin(self):
+ """Assign value to self.origin.
+
+ """
+ self.origin = {
+ 'url': self.package_details['origin_url'],
+ 'type': self.loader_name,
+ }
+
+ def pre_cleanup(self):
+ """To prevent disk explosion if some other workers exploded
+ in mid-air (OOM killed), we try and clean up dangling files.
+
+ """
+ if self.debug:
+ self.log.warn('%s Will not pre-clean up temp dir %s' % (
+ DEBUG_MODE, self.temp_directory
+ ))
+ return
+ clean_dangling_folders(self.config['temp_directory'],
+ pattern_check=self.TEMPORARY_DIR_PREFIX_PATTERN,
+ log=self.log)
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Thu, Jan 23, 2:46 AM (17 h, 42 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3217541
Attached To
D1694: swh.loader.package: Implement a method to prepare package visit
Event Timeline
Log In to Comment