D1810.diff

diff --git a/swh/loader/package/download.py b/swh/loader/package/download.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/package/download.py
@@ -0,0 +1,158 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import requests
+
+try:
+    from _version import __version__
+except ImportError:
+    __version__ = 'devel'
+
+
+# This file contains methods to detect and filter out already archived
+# package versions
+
+class compareField:
+    """Uses a field present in the metadata to check whether a package
+    version has previously been archived.
+
+    This class is meant to identify and avoid the reprocessing of
+    previously archived package versions, using a field provided by
+    the API as part of the package version metadata.
+
+    The following operations are performed:
+    - Retrieve known versions and store them in a dict keyed by the
+      same field that is mentioned in compare_field
+    - Check whether the tarballs are present in the known versions
+    - Instantiate a generator to process a specific package release
+      version
+
+    """
+    compare_field = None
+    """Field used to identify whether a package version was previously
+    archived"""
+    # e.g. for the pypi loader, compare_field = 'sha'
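+    # For illustration only, a hypothetical subclass (not part of this
+    # diff) would simply override that class attribute:
+    #
+    #     class PyPiCompareField(compareField):
+    #         compare_field = 'sha'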
+
+    def __init__(self):
+        # HTTP session and default request parameters reused by the
+        # loader
+        self.session = requests.Session()
+        self.params = {
+            'headers': {
+                'User-Agent': 'Software Heritage Loader (%s)' % (
+                    __version__
+                )
+            }
+        }
+
+    def get_key(self):
+        """Returns the key to be used to identify known revisions.
+
+        """
+        return self.compare_field
+
+    def get_known_versions(self, last_snapshot):
+        """
+        Retrieve the known release versions for the package
+        (i.e. those already ingested into the archive).
+
+        Args:
+            last_snapshot (dict): Last snapshot for the visit.
+
+        Returns:
+            dict: Dict whose keys are the values of the field chosen for
+            checking archived artifacts and whose values are revision ids.
+
+        """
+        if not last_snapshot or 'branches' not in last_snapshot:
+            return {}
+
+        # retrieve only revisions (i.e. skip the aliases we do not want
+        # here)
+        revs = [rev['target']
+                for rev in last_snapshot['branches'].values()
+                if rev and rev['target_type'] == 'revision']
+        # self.storage is expected to be provided by the loader class
+        # this mixin is combined with
+        known_revisions = self.storage.revision_get(revs)
+        ret = {}
+        for revision in known_revisions:
+            if not revision:  # revision_get can return None
+                continue
+            artifact = self.artifact_from_revision(revision)
+            if not artifact:  # skip revisions without package metadata
+                continue
+            ret[artifact[self.compare_field]] = revision['id']
+        return ret
+
+    def artifact_from_revision(self, revision):
+        """Find the artifact metadata in the revision.
+
+        """
+        # Can be overridden if the standard revision pattern changes
+        if 'package' in revision['metadata']:
+            return revision['metadata']['package']
+        return None
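+    # Illustrative (made-up) shape of the revision metadata consumed
+    # above: revision['metadata'] == {'package': {'sha': 'abc', ...}}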
+
+    def filter_package_versions(self, tarballs, known_versions):
+        """
+        Return the available tarballs that have not previously been
+        archived.
+
+        Args:
+            tarballs (list): a list of dicts containing information about
+                the respective tarballs, as provided by the lister.
+            known_versions (dict): may be provided by the loader; it
+                allows filtering out versions already ingested in the
+                archive.
+
+        Returns:
+            A list of dicts containing information about the tarballs
+            that have not previously been archived.
+
+        """
+        versions = []
+
+        for release in tarballs:
+            if release[self.compare_field] in known_versions:
+                continue
+            versions.append(release)
+
+        return versions
+
+    def prepare_package_versions(self, tarballs, known_versions=None):
+        """
+        Instantiate a generator that will process a specific package
+        release version at each iteration step. The following operations
+        will be performed:
+
+        1. Create a temporary directory to download and extract the
+           release tarball.
+        2. Download the tarball.
+        3. Uncompress the tarball.
+        4. Parse the file associated to the package version to extract
+           metadata (optional).
+        5. Delete unnecessary files (optional).
+
+        Args:
+            tarballs (list): a list of dicts containing information about
+                the respective tarballs, as provided by the lister.
+            known_versions (dict): may be provided by the loader; it
+                allows filtering out versions already ingested in the
+                archive.
+
+        Yields:
+            Tuple[dict, str]: tuples containing the following
+            members:
+
+                * a dict holding package tarball information and metadata
+                * a string holding the path of the uncompressed package
+                  to load into the archive
+
+        """
+        if known_versions is None:  # avoid a mutable default argument
+            known_versions = {}
+        new_versions = self.filter_package_versions(tarballs, known_versions)
+        for package_source_data in new_versions:
+            # filter out versions with a missing tarball; the package
+            # visit will be marked as partial at the end of the loading
+            # process
+            tarball_url = package_source_data['url']
+            tarball_request = self._request(tarball_url,
+                                            throw_error=False)
+            if tarball_request.status_code == 404:
+                self.log.debug('Tarball url %s returns a 404 error.',
+                               tarball_url)
+                continue
+
+            # steps 1-5 above are implemented by _prepare_package_version,
+            # which is not part of this diff
+            yield self._prepare_package_version(package_source_data,
+                                                tarball_request)
diff --git a/swh/loader/package/loader.py b/swh/loader/package/loader.py
--- a/swh/loader/package/loader.py
+++ b/swh/loader/package/loader.py
@@ -9,6 +9,7 @@
 from abc import abstractmethod
 from swh.loader.core.loader import BufferedLoader
 from swh.loader.core.utils import clean_dangling_folders
+from swh.storage.algos.snapshot import snapshot_get_all_branches
 from tempfile import mkdtemp
 
 DEBUG_MODE = '** DEBUG MODE **'
@@ -109,6 +110,20 @@
"""
pass
+    def pre_cleanup(self):
+        """To prevent disk explosion if some other workers exploded
+        in mid-air (OOM killed), we try and clean up dangling files.
+
+        """
+        if self.debug:
+            self.log.warn('%s Will not pre-clean up temp dir %s' % (
+                DEBUG_MODE, self.temp_directory
+            ))
+            return
+        clean_dangling_folders(self.config['temp_directory'],
+                               pattern_check=self.TEMPORARY_DIR_PREFIX_PATTERN,
+                               log=self.log)
+
     def prepare_origin_visit(self, *args, **kwargs):
         """Prepare package visit.
@@ -134,16 +149,33 @@
             'type': self.loader_name,
         }
-    def pre_cleanup(self):
-        """To prevent disk explosion if some other workers exploded
-        in mid-air (OOM killed), we try and clean up dangling files.
+    def prepare(self, *args, **kwargs):
+        """Prepare effective loading of source tarballs for a package
+        manager package.
+
+        Args:
+            **kwargs: Arbitrary keyword arguments passed by the lister.
 
         """
-        if self.debug:
-            self.log.warn('%s Will not pre-clean up temp dir %s' % (
-                DEBUG_MODE, self.temp_directory
-            ))
-            return
-        clean_dangling_folders(self.config['temp_directory'],
-                               pattern_check=self.TEMPORARY_DIR_PREFIX_PATTERN,
-                               log=self.log)
+        self.package_contents = []
+        self.package_directories = []
+        self.package_revisions = []
+        self.package_source_data = []
+        self.package_temp_dir = os.path.join(self.temp_directory,
+                                             self.package_details['name'])
+
+        last_snapshot = self.last_snapshot()
+        self.known_versions = self.get_known_versions(last_snapshot)
+
+        self.new_versions = \
+            self.prepare_package_versions(self.package_details['tarballs'],
+                                          self.known_versions)
+
+    def last_snapshot(self):
+        """Retrieve the last snapshot of the package if any.
+
+        """
+        visit = self.storage.origin_visit_get_latest(
+            self.origin['url'], require_snapshot=True)
+        if visit:
+            return snapshot_get_all_branches(self.storage, visit['snapshot'])

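A minimal, self-contained sketch of the filtering behaviour introduced by this diff. The subclass name, the 'sha' field value and all data below are made up for illustration; only compareField and filter_package_versions come from the code above:

    from swh.loader.package.download import compareField

    class DemoCompareField(compareField):
        compare_field = 'sha'  # hypothetical value, mirroring the pypi example

    tarballs = [
        {'sha': 'abc', 'url': 'https://example.org/pkg-1.0.tar.gz'},
        {'sha': 'def', 'url': 'https://example.org/pkg-2.0.tar.gz'},
    ]
    # pretend the 'abc' version was ingested during a previous visit
    known_versions = {'abc': b'<revision id>'}

    loader = DemoCompareField()
    print(loader.filter_package_versions(tarballs, known_versions))
    # expected: [{'sha': 'def', 'url': 'https://example.org/pkg-2.0.tar.gz'}]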