Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7147896
D1811.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
7 KB
Subscribers
None
D1811.diff
View Options
diff --git a/swh/loader/package/download.py b/swh/loader/package/download.py
--- a/swh/loader/package/download.py
+++ b/swh/loader/package/download.py
@@ -49,6 +49,25 @@
"""
return self.compare_field
+ def _request(self, url, throw_error=True):
+ """Request the remote tarball url.
+
+ Args:
+ url (str): Url (file or http*).
+
+ Raises:
+ ValueError in case of failing to query.
+
+        Returns:
+            Response object obtained from querying the url.
+
+ """
+ response = self.session.get(url, **self.params, stream=True)
+ if response.status_code != 200 and throw_error:
+ raise ValueError("Fail to query '%s'. Reason: %s" % (
+ url, response.status_code))
+ return response
+
def get_known_versions(self, last_snapshot):
"""
Retrieve the known release versions for the package
@@ -101,7 +120,6 @@
tarballs that are not previously archived.
"""
- # Done
versions = []
for release in tarballs:
diff --git a/swh/loader/package/loader.py b/swh/loader/package/loader.py
--- a/swh/loader/package/loader.py
+++ b/swh/loader/package/loader.py
@@ -7,10 +7,14 @@
import os
from abc import abstractmethod
-from swh.loader.core.loader import BufferedLoader
+from tempfile import mkdtemp
+
+from swh.core import tarball
from swh.loader.core.utils import clean_dangling_folders
+from swh.loader.core.loader import BufferedLoader
+from swh.model.hashutil import MultiHash, HASH_BLOCK_SIZE
from swh.storage.algos.snapshot import snapshot_get_all_branches
-from tempfile import mkdtemp
+
DEBUG_MODE = '** DEBUG MODE **'
@@ -40,6 +44,10 @@
class_name
def convert_to_standard_format
+ Optional Overrides:
+ def cleanup_artifact
+ def extract_metadata
+
"""
loader_name = None
@@ -110,6 +118,45 @@
"""
pass
+ def cleanup_artifact(self, uncompressed_path):
+ """Clean up unnecessary files from the downloaded tarball
+ also some special operation if needed.
+
+ Implementation of this method depends on the file structure of the
+ tarball. It is used to clean up files from the uncompressed tarball
+        that are not to be archived (e.g. binaries).
+
+ Args:
+ uncompressed_path (str): Path of uncompressed tarball
+
+ Returns:
+ uncompressed_path (str): Path of uncompressed tarball after
+ removing unnecessary files
+
+ """
+ return uncompressed_path
+
+ def extract_metadata(self, package_path, package_source_data):
+ """Fetch the metadata from the downloaded file.
+
+ Override this method to perform metadata extraction for each version
+        of a package from the uncompressed package by parsing over the file
+        containing metadata (package.json, PKG-INFO, ...).
+        Add the extracted metadata to the present `package_source_data` which
+        contains information related to the focused package version.
+
+ Args:
+ package_path (str): Uncompressed package version path
+ package_source_data (dict): Information about the focused package
+ version.
+
+ Returns:
+ dict: Updated information about the focused package
+ version.
+
+ """
+ return package_source_data
+
def pre_cleanup(self):
"""To prevent disk explosion if some other workers exploded
in mid-air (OOM killed), we try and clean up dangling files.
@@ -179,3 +226,111 @@
self.origin['url'], require_snapshot=True)
if visit:
return snapshot_get_all_branches(self.storage, visit['snapshot'])
+
+ def _prepare_package_version(self, package_source_data, tarball_request):
+ """Process the package release version.
+
+ The following operations are performed:
+
+ 1. Download the tarball
+ 2. Uncompress the tarball
+ 3. Delete unnecessary files (optional)
+ 4. Parse the file associated to the package version to extract
+ metadata (optional)
+
+        Args:
+            package_source_data (dict): containing information
+                about the focused package version.
+            tarball_request (Response): server response for the tarball
+                url of the focused package version.
+
+ Return:
+ Tuple[dict, str]: tuples containing the following
+ members:
+
+ * a dict holding package tarball information and metadata
+ * a string holding the path of the uncompressed package to
+ load into the archive
+
+ """
+ url = package_source_data['url']
+ tarball_path, hashes = self.download_generate_hash(tarball_request,
+ url)
+ uncompressed_path = os.path.join(self.package_temp_dir, 'uncompressed',
+ os.path.basename(url)) # SEE ME
+ package_source_data['nature'] = self.uncompress_tarball(
+ tarball_path, uncompressed_path)
+
+ # remove tarball
+ os.remove(tarball_path)
+
+ if self.tarball_invalid:
+ return None, None
+
+ package_path = self.cleanup_artifact(uncompressed_path)
+ package_source_data = self.extract_metadata(package_path,
+ package_source_data)
+ self.package_source_data.append(package_source_data)
+ return package_source_data, package_path
+
+ def download_generate_hash(self, response, url):
+ """Store file in temp directory and computes hash of its filepath.
+
+ Args:
+ response (Response): Server response of the url
+ url (str): Url of the tarball
+
+ Returns:
+ Tuple of local (filepath, hashes of filepath)
+
+ """
+ length = int(response.headers['content-length'])
+ os.makedirs(self.package_temp_dir, exist_ok=True)
+ # SEE ME
+ filepath = os.path.join(self.package_temp_dir, os.path.basename(url))
+ h = self.write_file(filepath, length, response)
+ self.check_file(filepath, length)
+
+ hashes = {
+ 'length': length,
+ **h.hexdigest()
+ }
+ return filepath, hashes
+
+ def write_file(self, filepath, length, response):
+ """Convert the server response to a file.
+
+ """
+ h = MultiHash(length=length)
+ with open(filepath, 'wb') as f:
+ for chunk in response.iter_content(chunk_size=HASH_BLOCK_SIZE):
+ h.update(chunk)
+ f.write(chunk)
+ return h
+
+ def check_file(self, filepath, length):
+ """Check for the validity of the tarball downloaded.
+
+ """
+ actual_length = os.path.getsize(filepath)
+ if length != actual_length:
+ raise ValueError('Error when checking size: %s != %s' % (
+ length, actual_length))
+
+ def uncompress_tarball(self, filepath, path):
+ """Uncompress a tarball.
+
+ Args:
+ filepath (str): Path of tarball to uncompress
+ path (str): The destination folder where to uncompress the tarball
+ Returns:
+ The nature of the tarball, zip or tar.
+
+ """
+ # filepath = tempdir + url
+ try:
+ self.tarball_invalid = False
+ return tarball.uncompress(filepath, path)
+ except Exception:
+ self.tarball_invalid = True
+ return None
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Thu, Jan 23, 2:46 AM (20 h, 27 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3219343
Attached To
D1811: swh.loader.package: Add method to download, uncompress tarball and create hash. Introduce hookpoints to cleanup artifact and extract metadata
Event Timeline
Log In to Comment