diff --git a/swh/loader/package/download.py b/swh/loader/package/download.py new file mode 100644 --- /dev/null +++ b/swh/loader/package/download.py @@ -0,0 +1,158 @@ +# Copyright (C) 2019 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import requests + +try: + from _version import __version__ +except ImportError: + __version__ = 'devel' + + +# This file contains methods to check and remove archived package versions + +class compareField: + """Uses a field present in the metadata to check whether the package was + previously archived. + + This class is to be used to identify and avoid the reprocessing of + previously archived package versions using a field provided by the + API as the metadata of the package version. + + The following operations are performed: + - Retrieve known versions and store them in a dict keyed by the same + field that is mentioned in compare_field + - Check if the tarballs are present in known versions. + - Instantiate a generator to process a specific package released + version + + """ + compare_field = None + """Field used to identify if the package version is previously archived""" + # eg for pypi loader compare_field = 'sha' + + def __init__(self): + self.session = requests.session() + self.params = { + 'headers': { + 'User-Agent': 'Software Heritage Loader (%s)' % ( + __version__ + ) + } + } + + def get_key(self): + """Returns the key to be used to identify known revisions. + + """ + return self.compare_field + + def get_known_versions(self, last_snapshot): + """ + Retrieve the known release versions for the package + (i.e. those already ingested into the archive). + + Args: + last_snapshot (dict): Last snapshot for the visit. + + Returns: + dict: Dict whose key is the value of the field chosen for + checking archived artifacts and values are revision ids.
+ + """ + if not last_snapshot or 'branches' not in last_snapshot: + return {} + + # retrieve only revisions (e.g. the aliases we do not want here) + revs = [rev['target'] + for rev in last_snapshot['branches'].values() + if rev and rev['target_type'] == 'revision'] + known_revisions = self.storage.revision_get(revs) + ret = {} + for revision in known_revisions: + if not revision: # revision_get can return None + continue + artifact = self.artifact_from_revision(revision) + ret[artifact[self.compare_field]] = revision['id'] # FIXME: artifact can be None if the revision metadata lacks 'package' — confirm + return ret + + def artifact_from_revision(self, revision): + """Find artifacts from the revision. + + """ + # Can be overridden if change in standard revision pattern + if 'package' in revision['metadata']: + return revision['metadata']['package'] + + def filter_package_versions(self, tarballs, known_versions): + """ + Return the available tarballs that were not previously archived. + + Args: + tarballs (list): a list of dicts containing information about the + respective tarball that is provided by lister. + known_versions (dict): may be provided by the loader, it enables + to filter out versions already ingested in the archive. + + Returns: + A list of dicts containing information about the respective + tarballs that were not previously archived. + + """ + # keep only the releases whose compare_field value is not yet archived + versions = [] + + for release in tarballs: + if release[self.compare_field] in known_versions: + continue + versions.append(release) + + return versions + + def prepare_package_versions(self, tarballs, known_versions={}): + """ + Instantiate a generator that will process a specific package release + version at each iteration step. The following operations will be + performed: + + 1. Create a temporary directory to download and extract the + release tarball. + 2. Download the tarball. + 3. Uncompress the tarball. + 4. Parse the file associated to the package version to extract + metadata (optional). + 5. Delete unnecessary files (optional).
+ + Args: + tarballs (list): a list of dicts containing information about the + respective tarball that is provided by lister. + known_versions (dict): may be provided by the loader, it enables + to filter out versions already ingested in the archive. + + Yields: + Tuple[dict, str]: tuples containing the following + members: + + * a dict holding package tarball information and metadata + * a string holding the path of the uncompressed package to + load into the archive + + """ + new_versions = self.filter_package_versions(tarballs, known_versions) + for package_source_data in new_versions: + # filter out version with missing tarball, + # package visit will be marked as partial at the end of + # the loading process + + tarball_url = package_source_data['url'] + tarball_request = self._request(tarball_url, + throw_error=False) + if tarball_request.status_code == 404: + self.log.debug('Tarball url %s returns a 404 error.', + tarball_url) + continue + + yield self._prepare_package_version(package_source_data, + tarball_request) diff --git a/swh/loader/package/loader.py b/swh/loader/package/loader.py --- a/swh/loader/package/loader.py +++ b/swh/loader/package/loader.py @@ -9,6 +9,7 @@ from abc import abstractmethod from swh.loader.core.loader import BufferedLoader from swh.loader.core.utils import clean_dangling_folders +from swh.storage.algos.snapshot import snapshot_get_all_branches from tempfile import mkdtemp DEBUG_MODE = '** DEBUG MODE **' @@ -109,6 +110,20 @@ """ pass + def pre_cleanup(self): + """To prevent disk explosion if some other workers exploded + in mid-air (OOM killed), we try and clean up dangling files. + + """ + if self.debug: + self.log.warn('%s Will not pre-clean up temp dir %s' % ( + DEBUG_MODE, self.temp_directory + )) + return + clean_dangling_folders(self.config['temp_directory'], + pattern_check=self.TEMPORARY_DIR_PREFIX_PATTERN, + log=self.log) + def prepare_origin_visit(self, *args, **kwargs): """Prepare package visit. 
@@ -134,16 +149,33 @@ 'type': self.loader_name, } - def pre_cleanup(self): - """To prevent disk explosion if some other workers exploded - in mid-air (OOM killed), we try and clean up dangling files. + def prepare(self, *args, **kwargs): + """Prepare effective loading of source tarballs for a package manager + package. + + Args: + **kwargs: Arbitrary keyword arguments passed by the lister. """ - if self.debug: - self.log.warn('%s Will not pre-clean up temp dir %s' % ( - DEBUG_MODE, self.temp_directory - )) - return - clean_dangling_folders(self.config['temp_directory'], - pattern_check=self.TEMPORARY_DIR_PREFIX_PATTERN, - log=self.log) + self.package_contents = [] + self.package_directories = [] + self.package_revisions = [] + self.package_source_data = [] + self.package_temp_dir = os.path.join(self.temp_directory, + self.package_details['name']) + + last_snapshot = self.last_snapshot() + self.known_versions = self.get_known_versions(last_snapshot) + + self.new_versions = \ + self.prepare_package_versions(self.package_details['tarballs'], + self.known_versions) + + def last_snapshot(self): + """Retrieve the last snapshot of the package if any. + + """ + visit = self.storage.origin_visit_get_latest( + self.origin['url'], require_snapshot=True) + if visit: + return snapshot_get_all_branches(self.storage, visit['snapshot'])