Changeset View
Changeset View
Standalone View
Standalone View
swh/loader/package/loader.py
- This file was added.
# Copyright (C) 2019 The Software Heritage developers | |||||
# See the AUTHORS file at the top-level directory of this distribution | |||||
# License: GNU General Public License version 3, or any later version | |||||
# See top-level LICENSE file for more information | |||||
import os | |||||
from abc import abstractmethod | |||||
from swh.loader.core.loader import BufferedLoader | |||||
from swh.loader.core.utils import clean_dangling_folders | |||||
from tempfile import mkdtemp | |||||
# Marker prepended to the log message emitted when the loader runs in debug
# mode and therefore skips the pre-cleanup of its temporary directory.
DEBUG_MODE = '** DEBUG MODE **'
class PackageLoader(BufferedLoader):
    """Package loader class for package manager loaders.

    A loader is a component of the Software Heritage architecture responsible
    for reading a source code origin and adding new file contents to the
    object storage and the repository structure to the storage database.

    The task of a loader is broadly similar for all package managers:
    notably, it includes querying an API to get metadata, retrieving the
    package source code and ingesting it into the archive.

    The steps involved in ingesting the package source code are automated
    by this class, which eases the creation of a new loader.

    Querying the API and obtaining the metadata for a package is done
    separately for each new loader by overriding the :func:`fetch_metadata`
    method. It returns all the information about the package in a specific
    format, after which the whole process of downloading, decompressing,
    creating and loading snapshots is automated by this class.

    Required Overrides:
        loader_name
        class_name
        def fetch_metadata

    """

    loader_name = None
    """Package manager name"""  # e.g. pypi

    class_name = None
    """Loader class name"""  # e.g. PyPILoader
def __init__(self):
    """Set up the logger name, loader state and a private scratch directory.

    The temporary-directory prefix is derived from ``loader_name`` before
    the parent initializer is invoked; the parent receives a logging class
    built from ``loader_name`` and ``class_name``. A working directory
    whose name ends with the current process id is then created under the
    configured ``temp_directory`` (which is created first when missing).
    """
    self.TEMPORARY_DIR_PREFIX_PATTERN = 'swh.loader.%s.' % self.loader_name
    logger_name = 'swh.loader.%s.%s' % (self.loader_name, self.class_name)
    super().__init__(logging_class=logger_name)

    self.local_cache = None
    self.dir_path = None

    # NOTE(review): self.config is presumably populated by the parent
    # initializer -- confirm against BufferedLoader.
    base_dir = self.config['temp_directory']
    os.makedirs(base_dir, exist_ok=True)
    pid_suffix = '-%s' % os.getpid()
    self.temp_directory = mkdtemp(
        suffix=pid_suffix,
        prefix=self.TEMPORARY_DIR_PREFIX_PATTERN,
        dir=base_dir,
    )

    self.debug = self.config.get('debug', False)
@abstractmethod
def fetch_metadata(self, kwargs):
    """Fetch the package metadata and convert it into a standard format.

    Implementations serve two purposes:

    * make the API call(s) needed to get the package versions and
      metadata (if needed);
    * convert the information received from the lister and from the API
      into the standard format described below.

    The standard format is a dict with the keys:

        `name` (str): name of the package
        `origin_url` (str): origin url of the package
        `tarballs` (list): list of dicts, one per version of the
            package. The `url` key is mandatory in each dict and holds
            the tarball url; other keys are optional and depend on the
            metadata available.

    Note: the keys `nature` and `response` are reserved and cannot be
    used in the dicts listed under `tarballs`.

    Args:
        kwargs (dict): Dict of arbitrary keyword arguments passed by
            the lister.

    Returns:
        dict: Package information laid out as described above.

    Example:
        {
            'name': '8sync',
            'origin_url': 'https://ftp.gnu.org/gnu/8sync/',
            'tarballs': [
                {'url': 'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz',
                 'time_modified': 1562878592},
                {'url': 'https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz',
                 'time_modified': 1599887203},
                ...
            ]
        }

    """
def prepare_origin_visit(self, *args, **kwargs):
    """Reset the loader state and resolve the origin of a new visit.

    Args:
        *args: Accepted but not used by this method.
        **kwargs: Arbitrary keyword arguments passed by the lister; they
            are handed over to :meth:`fetch_metadata` as a single dict.

    """
    # Every visit starts from a clean slate.
    self._load_status, self._visit_status = 'uneventful', 'full'
    self.done = False

    # Retrieve the package description from the registry, then derive the
    # origin from it.
    details = self.fetch_metadata(kwargs)
    self.package_details = details
    self.set_origin()

    self.visit_date = None  # loader core will populate it
def set_origin(self):
    """Build ``self.origin`` from the fetched package details.

    The origin is a dict whose ``url`` is the ``origin_url`` entry of
    ``self.package_details`` and whose ``type`` is ``self.loader_name``.

    """
    url = self.package_details['origin_url']
    kind = self.loader_name
    self.origin = {'url': url, 'type': kind}
def pre_cleanup(self):
    """Clean up dangling temporary folders before loading.

    To prevent disk explosion if some other workers exploded in mid-air
    (OOM killed), folders under the configured ``temp_directory`` are
    handed to ``clean_dangling_folders`` together with this loader's
    temporary-directory prefix.

    In debug mode nothing is cleaned: a warning is logged and the method
    returns immediately.

    """
    if self.debug:
        # ``Logger.warn`` is a deprecated alias that was removed in
        # Python 3.13; ``warning`` with lazy %-arguments renders the very
        # same message.
        self.log.warning('%s Will not pre-clean up temp dir %s',
                         DEBUG_MODE, self.temp_directory)
        return

    clean_dangling_folders(self.config['temp_directory'],
                           pattern_check=self.TEMPORARY_DIR_PREFIX_PATTERN,
                           log=self.log)