D1811.diff

diff --git a/swh/loader/package/download.py b/swh/loader/package/download.py
--- a/swh/loader/package/download.py
+++ b/swh/loader/package/download.py
@@ -49,6 +49,25 @@
"""
return self.compare_field
+    def _request(self, url, throw_error=True):
+        """Request the remote tarball url.
+
+        Args:
+            url (str): Url (file or http*).
+            throw_error (bool): Raise an error if the query fails.
+
+        Raises:
+            ValueError: In case the query fails (and `throw_error` is
+                True).
+
+        Returns:
+            The server response to the query.
+
+        """
+        response = self.session.get(url, **self.params, stream=True)
+        if response.status_code != 200 and throw_error:
+            raise ValueError("Failed to query '%s'. Reason: %s" % (
+                url, response.status_code))
+        return response
+
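+    # A minimal usage sketch: the `loader` instance and the URL below are
+    # illustrative assumptions, not names introduced by this diff.
+    #
+    # response = loader._request('https://example.org/pkg-1.0.tar.gz')
+    # for chunk in response.iter_content(chunk_size=HASH_BLOCK_SIZE):
+    #     ...  # stream the tarball chunk by chunk
+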
    def get_known_versions(self, last_snapshot):
        """
        Retrieve the known release versions for the package
@@ -101,7 +120,6 @@
        tarballs that are not previously archived.
        """
-        # Done
        versions = []
        for release in tarballs:
diff --git a/swh/loader/package/loader.py b/swh/loader/package/loader.py
--- a/swh/loader/package/loader.py
+++ b/swh/loader/package/loader.py
@@ -7,10 +7,14 @@
import os
from abc import abstractmethod
-from swh.loader.core.loader import BufferedLoader
+from tempfile import mkdtemp
+
+from swh.core import tarball
from swh.loader.core.utils import clean_dangling_folders
+from swh.loader.core.loader import BufferedLoader
+from swh.model.hashutil import MultiHash, HASH_BLOCK_SIZE
from swh.storage.algos.snapshot import snapshot_get_all_branches
-from tempfile import mkdtemp
+
DEBUG_MODE = '** DEBUG MODE **'
@@ -40,6 +44,10 @@
        class_name
        def convert_to_standard_format
+    Optional Overrides:
+        def cleanup_artifact
+        def extract_metadata
+
    """
    loader_name = None
@@ -110,6 +118,45 @@
"""
pass
+    def cleanup_artifact(self, uncompressed_path):
+        """Clean up unnecessary files from the downloaded tarball, and
+        perform any special operations if needed.
+
+        Implementation of this method depends on the file structure of the
+        tarball. It is used to clean up files from the uncompressed tarball
+        that are not to be archived (e.g. binaries).
+
+        Args:
+            uncompressed_path (str): Path of the uncompressed tarball
+
+        Returns:
+            uncompressed_path (str): Path of the uncompressed tarball after
+                removing unnecessary files
+
+        """
+        return uncompressed_path
+
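+    # An override sketch: a subclass could drop binaries before loading.
+    # The 'bin' directory name and the shutil usage are illustrative
+    # assumptions:
+    #
+    # def cleanup_artifact(self, uncompressed_path):
+    #     bin_dir = os.path.join(uncompressed_path, 'bin')
+    #     if os.path.isdir(bin_dir):
+    #         shutil.rmtree(bin_dir)  # drop pre-built binaries
+    #     return uncompressed_path
+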
+    def extract_metadata(self, package_path, package_source_data):
+        """Fetch the metadata from the downloaded file.
+
+        Override this method to perform metadata extraction for each version
+        of a package from the uncompressed package, by parsing the file
+        containing the metadata (package.json, PKG-INFO, ...).
+        Add the extracted metadata to the provided `package_source_data`,
+        which contains information related to the focused package version.
+
+        Args:
+            package_path (str): Uncompressed package version path
+            package_source_data (dict): Information about the focused
+                package version.
+
+        Returns:
+            dict: Updated information about the focused package version.
+
+        """
+        return package_source_data
+
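+    # An override sketch assuming a PyPI-style PKG-INFO file; the file
+    # name and the email-header parsing are illustrative assumptions:
+    #
+    # def extract_metadata(self, package_path, package_source_data):
+    #     from email.parser import Parser
+    #     with open(os.path.join(package_path, 'PKG-INFO')) as f:
+    #         headers = Parser().parse(f)
+    #     package_source_data['name'] = headers.get('Name')
+    #     package_source_data['version'] = headers.get('Version')
+    #     return package_source_data
+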
    def pre_cleanup(self):
        """To prevent disk explosion if some other workers exploded
        in mid-air (OOM killed), we try and clean up dangling files.
@@ -179,3 +226,111 @@
            self.origin['url'], require_snapshot=True)
        if visit:
            return snapshot_get_all_branches(self.storage, visit['snapshot'])
+
+    def _prepare_package_version(self, package_source_data, tarball_request):
+        """Process the package release version.
+
+        The following operations are performed:
+
+        1. Download the tarball
+        2. Uncompress the tarball
+        3. Delete unnecessary files (optional)
+        4. Parse the file associated to the package version to extract
+           metadata (optional)
+
+        Args:
+            package_source_data (dict): Information about the focused
+                package version.
+            tarball_request (Response): Streamed server response for the
+                tarball url.
+
+        Returns:
+            Tuple[dict, str]: tuple containing the following members:
+
+            * a dict holding package tarball information and metadata
+            * a string holding the path of the uncompressed package to
+              load into the archive
+
+        """
+        url = package_source_data['url']
+        tarball_path, hashes = self.download_generate_hash(tarball_request,
+                                                           url)
+        uncompressed_path = os.path.join(self.package_temp_dir, 'uncompressed',
+                                         os.path.basename(url))  # SEE ME
+        package_source_data['nature'] = self.uncompress_tarball(
+            tarball_path, uncompressed_path)
+
+        # remove tarball
+        os.remove(tarball_path)
+
+        if self.tarball_invalid:
+            return None, None
+
+        package_path = self.cleanup_artifact(uncompressed_path)
+        package_source_data = self.extract_metadata(package_path,
+                                                    package_source_data)
+        self.package_source_data.append(package_source_data)
+        return package_source_data, package_path
+
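+    # Call-flow sketch tying the pieces of this diff together:
+    #
+    # response = self._request(url)                      # streamed download
+    # data, path = self._prepare_package_version(data, response)
+    #   # -> download_generate_hash()   step 1: fetch + hash
+    #   # -> uncompress_tarball()       step 2: uncompress
+    #   # -> cleanup_artifact()         step 3: optional cleanup
+    #   # -> extract_metadata()         step 4: optional metadata extraction
+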
+    def download_generate_hash(self, response, url):
+        """Store the file in a temporary directory and compute the hashes
+        of its content.
+
+        Args:
+            response (Response): Server response for the url
+            url (str): Url of the tarball
+
+        Returns:
+            Tuple of local (filepath, hashes of filepath)
+
+        """
+        length = int(response.headers['content-length'])
+        os.makedirs(self.package_temp_dir, exist_ok=True)
+        # SEE ME
+        filepath = os.path.join(self.package_temp_dir, os.path.basename(url))
+        h = self.write_file(filepath, length, response)
+        self.check_file(filepath, length)
+
+        hashes = {
+            'length': length,
+            **h.hexdigest()
+        }
+        return filepath, hashes
+
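+    # The returned `hashes` dict combines the content length with the
+    # MultiHash hexdigests; with swh.model's default algorithms it looks
+    # roughly like this (digest values are placeholders):
+    #
+    # {'length': 42678, 'sha1': '...', 'sha1_git': '...',
+    #  'sha256': '...', 'blake2s256': '...'}
+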
+    def write_file(self, filepath, length, response):
+        """Write the server response to a file, computing its hashes
+        on the fly.
+
+        """
+        h = MultiHash(length=length)
+        with open(filepath, 'wb') as f:
+            for chunk in response.iter_content(chunk_size=HASH_BLOCK_SIZE):
+                h.update(chunk)
+                f.write(chunk)
+        return h
+
+    def check_file(self, filepath, length):
+        """Check the validity of the downloaded tarball by comparing its
+        expected and actual sizes.
+
+        """
+        actual_length = os.path.getsize(filepath)
+        if length != actual_length:
+            raise ValueError('Error when checking size: %s != %s' % (
+                length, actual_length))
+
+    def uncompress_tarball(self, filepath, path):
+        """Uncompress a tarball.
+
+        Args:
+            filepath (str): Path of the tarball to uncompress
+            path (str): The destination folder where to uncompress the
+                tarball
+
+        Returns:
+            The nature of the tarball, zip or tar.
+
+        """
+        # filepath = tempdir + url
+        try:
+            self.tarball_invalid = False
+            return tarball.uncompress(filepath, path)
+        except Exception:
+            self.tarball_invalid = True
+            return None
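+
+    # Usage sketch, per this diff: the returned nature is None when the
+    # archive is invalid, in which case self.tarball_invalid is set and
+    # _prepare_package_version returns (None, None).
+    #
+    # nature = self.uncompress_tarball(tarball_path, uncompressed_path)
+    # if nature is None:
+    #     ...  # skip this version; the tarball could not be uncompressed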
