Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7147895
D1810.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
8 KB
Subscribers
None
D1810.diff
View Options
diff --git a/swh/loader/package/download.py b/swh/loader/package/download.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/package/download.py
@@ -0,0 +1,158 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import requests
+
+try:
+ from _version import __version__
+except ImportError:
+ __version__ = 'devel'
+
+
+# This file contains methods to check and remove archived package version
+
+class compareField:
+ """Uses a field present in the metadata to check if the package is
+ previously archived.
+
+ This class is to be used to identify and avoid the reprocessing of
+ previously archived package version using a field provided by the
+ API as the metadata of the package version.
+
+ The following operations are performed:
+ - Retrieve known versions and store them in a dict keyed by the same
+ field that is mentioned in compare_field
+ - Check if the tarballs are present in known versions.
+ - Instantiate a generator to process a specific package released
+ version
+
+ """
+ compare_field = None
+ """Field used to identify if the package version is previously archived"""
+ # eg for pypi loader compare_field = 'sha'
+
+ def __init__(self):
+ self.session = requests.session()
+ self.params = {
+ 'headers': {
+ 'User-Agent': 'Software Heritage Loader (%s)' % (
+ __version__
+ )
+ }
+ }
+
+ def get_key(self):
+ """Returns the key to be used to identify known revisions.
+
+ """
+ return self.compare_field
+
+ def get_known_versions(self, last_snapshot):
+ """
+ Retrieve the known release versions for the package
+ (i.e. those already ingested into the archive).
+
+ Args:
+ last_snapshot (dict): Last snapshot for the visit.
+
+ Returns:
+ dict: Dict whose keys are the values of the field chosen for
+ checking archived artifacts and whose values are revision ids.
+
+ """
+ if not last_snapshot or 'branches' not in last_snapshot:
+ return {}
+
+ # retrieve only revisions (e.g the alias we do not want here)
+ revs = [rev['target']
+ for rev in last_snapshot['branches'].values()
+ if rev and rev['target_type'] == 'revision']
+ known_revisions = self.storage.revision_get(revs)
+ ret = {}
+ for revision in known_revisions:
+ if not revision: # revision_get can return None
+ continue
+ artifact = self.artifact_from_revision(revision)
+ ret[artifact[self.compare_field]] = revision['id'] # Check me
+ return ret
+
+ def artifact_from_revision(self, revision):
+ """Find artifacts from the revision.
+
+ """
+ # Can be overridden if change in standard revision pattern
+ if 'package' in revision['metadata']:
+ return revision['metadata']['package']
+
+ def filter_package_versions(self, tarballs, known_versions):
+ """
+ Return the available tarballs that are not previously archived.
+
+ Args:
+ tarballs (list): a list of dicts containing information about the
+ respective tarball that is provided by lister.
+ known_versions (dict): may be provided by the loader, it enables
+ to filter out versions already ingested in the archive.
+
+ Returns:
+ A list of dicts containing information about the respective
+ tarballs that are not previously archived.
+
+ """
+ # Done
+ versions = []
+
+ for release in tarballs:
+ if release[self.compare_field] in known_versions:
+ continue
+ versions.append(release)
+
+ return versions
+
+ def prepare_package_versions(self, tarballs, known_versions={}):
+ """
+ Instantiate a generator that will process a specific package release
+ version at each iteration step. The following operations will be
+ performed:
+
+ 1. Create a temporary directory to download and extract the
+ release tarball.
+ 2. Download the tarball.
+ 3. Uncompress the tarball.
+ 4. Parse the file associated to the package version to extract
+ metadata (optional).
+ 5. Delete unnecessary files (optional).
+
+ Args:
+ tarballs (list): a list of dicts containing information about the
+ respective tarball that is provided by lister.
+ known_versions (dict): may be provided by the loader, it enables
+ to filter out versions already ingested in the archive.
+
+ Yields:
+ Tuple[dict, str]: tuples containing the following
+ members:
+
+ * a dict holding package tarball information and metadata
+ * a string holding the path of the uncompressed package to
+ load into the archive
+
+ """
+ new_versions = self.filter_package_versions(tarballs, known_versions)
+ for package_source_data in new_versions:
+ # filter out version with missing tarball,
+ # package visit will be marked as partial at the end of
+ # the loading process
+
+ tarball_url = package_source_data['url']
+ tarball_request = self._request(tarball_url,
+ throw_error=False)
+ if tarball_request.status_code == 404:
+ self.log.debug('Tarball url %s returns a 404 error.',
+ tarball_url)
+ continue
+
+ yield self._prepare_package_version(package_source_data,
+ tarball_request)
diff --git a/swh/loader/package/loader.py b/swh/loader/package/loader.py
--- a/swh/loader/package/loader.py
+++ b/swh/loader/package/loader.py
@@ -9,6 +9,7 @@
from abc import abstractmethod
from swh.loader.core.loader import BufferedLoader
from swh.loader.core.utils import clean_dangling_folders
+from swh.storage.algos.snapshot import snapshot_get_all_branches
from tempfile import mkdtemp
DEBUG_MODE = '** DEBUG MODE **'
@@ -109,6 +110,20 @@
"""
pass
+ def pre_cleanup(self):
+ """To prevent disk explosion if some other workers exploded
+ in mid-air (OOM killed), we try and clean up dangling files.
+
+ """
+ if self.debug:
+ self.log.warn('%s Will not pre-clean up temp dir %s' % (
+ DEBUG_MODE, self.temp_directory
+ ))
+ return
+ clean_dangling_folders(self.config['temp_directory'],
+ pattern_check=self.TEMPORARY_DIR_PREFIX_PATTERN,
+ log=self.log)
+
def prepare_origin_visit(self, *args, **kwargs):
"""Prepare package visit.
@@ -134,16 +149,33 @@
'type': self.loader_name,
}
- def pre_cleanup(self):
- """To prevent disk explosion if some other workers exploded
- in mid-air (OOM killed), we try and clean up dangling files.
+ def prepare(self, *args, **kwargs):
+ """Prepare effective loading of source tarballs for a package manager
+ package.
+
+ Args:
+ **kwargs: Arbitrary keyword arguments passed by the lister.
"""
- if self.debug:
- self.log.warn('%s Will not pre-clean up temp dir %s' % (
- DEBUG_MODE, self.temp_directory
- ))
- return
- clean_dangling_folders(self.config['temp_directory'],
- pattern_check=self.TEMPORARY_DIR_PREFIX_PATTERN,
- log=self.log)
+ self.package_contents = []
+ self.package_directories = []
+ self.package_revisions = []
+ self.package_source_data = []
+ self.package_temp_dir = os.path.join(self.temp_directory,
+ self.package_details['name'])
+
+ last_snapshot = self.last_snapshot()
+ self.known_versions = self.get_known_versions(last_snapshot)
+
+ self.new_versions = \
+ self.prepare_package_versions(self.package_details['tarballs'],
+ self.known_versions)
+
+ def last_snapshot(self):
+ """Retrieve the last snapshot of the package if any.
+
+ """
+ visit = self.storage.origin_visit_get_latest(
+ self.origin['url'], require_snapshot=True)
+ if visit:
+ return snapshot_get_all_branches(self.storage, visit['snapshot'])
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Thu, Jan 23, 2:46 AM (20 h, 23 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3219289
Attached To
D1810: swh.loader.package: Implement method to remove known version.
Event Timeline
Log In to Comment