diff --git a/swh/loader/package/download.py b/swh/loader/package/download.py
--- a/swh/loader/package/download.py
+++ b/swh/loader/package/download.py
@@ -49,6 +49,25 @@
         """
         return self.compare_field
 
+    def _request(self, url, throw_error=True):
+        """Request the remote tarball url.
+
+        Args:
+            url (str): Url (file or http*).
+
+        Raises:
+            ValueError: if the query fails and `throw_error` is set.
+
+        Returns:
+            Response: the server response.
+
+        """
+        response = self.session.get(url, **self.params, stream=True)
+        if response.status_code != 200 and throw_error:
+            raise ValueError("Failed to query '%s'. Reason: %s" % (
+                url, response.status_code))
+        return response
+
     def get_known_versions(self, last_snapshot):
         """
         Retrieve the known release versions for the package
@@ -101,7 +120,6 @@
            tarballs that are not previously archived.
 
         """
-        # Done
         versions = []
 
         for release in tarballs:
diff --git a/swh/loader/package/loader.py b/swh/loader/package/loader.py
--- a/swh/loader/package/loader.py
+++ b/swh/loader/package/loader.py
@@ -7,10 +7,14 @@
 import os
 
 from abc import abstractmethod
-from swh.loader.core.loader import BufferedLoader
+from tempfile import mkdtemp
+
+from swh.core import tarball
 from swh.loader.core.utils import clean_dangling_folders
+from swh.loader.core.loader import BufferedLoader
+from swh.model.hashutil import MultiHash, HASH_BLOCK_SIZE
 from swh.storage.algos.snapshot import snapshot_get_all_branches
-from tempfile import mkdtemp
+
 
 
 DEBUG_MODE = '** DEBUG MODE **'
@@ -40,6 +44,10 @@
         class_name
         def convert_to_standard_format
 
+    Optional Overrides:
+        def cleanup_artifact
+        def extract_metadata
+
     """
 
     loader_name = None
@@ -110,6 +118,45 @@
         """
         pass
 
+    def cleanup_artifact(self, uncompressed_path):
+        """Clean up unnecessary files from the downloaded tarball, and
+        perform any special operations if needed.
+
+        Implementation of this method depends on the file structure of the
+        tarball. It is used to clean up files from the uncompressed tarball
+        that are not to be archived (e.g. binaries).
+
+        Args:
+            uncompressed_path (str): Path of the uncompressed tarball
+
+        Returns:
+            uncompressed_path (str): Path of the uncompressed tarball after
+            removing unnecessary files
+
+        """
+        return uncompressed_path
+
+    def extract_metadata(self, package_path, package_source_data):
+        """Fetch the metadata from the downloaded file.
+
+        Override this method to perform metadata extraction for each version
+        of a package from the uncompressed package, by parsing the file
+        containing the metadata (package.json, PKG-INFO, ...).
+        Add the extracted metadata to the given `package_source_data`, which
+        contains information related to the focused package version.
+
+        Args:
+            package_path (str): Uncompressed package version path
+            package_source_data (dict): Information about the focused package
+                version.
+
+        Returns:
+            dict: Updated information about the focused package
+            version.
+
+        """
+        return package_source_data
+
     def pre_cleanup(self):
         """To prevent disk explosion if some other workers exploded
         in mid-air (OOM killed), we try and clean up dangling files.
@@ -179,3 +226,111 @@
             self.origin['url'], require_snapshot=True)
         if visit:
             return snapshot_get_all_branches(self.storage, visit['snapshot'])
+
+    def _prepare_package_version(self, package_source_data, tarball_request):
+        """Process the package release version.
+
+        The following operations are performed:
+
+        1. Download the tarball
+        2. Uncompress the tarball
+        3. Delete unnecessary files (optional)
+        4. Parse the file associated with the package version to extract
+           metadata (optional)
+
+        Args:
+            package_source_data (dict): Information about the focused
+                package version.
+            tarball_request (Response): Server response for the tarball
+                url, as returned by `_request`.
+
+        Returns:
+            Tuple[dict, str]: a tuple containing the following
+            members:
+
+                * a dict holding package tarball information and metadata
+                * a string holding the path of the uncompressed package to
+                  load into the archive
+
+        """
+        url = package_source_data['url']
+        tarball_path, hashes = self.download_generate_hash(tarball_request,
+                                                           url)
+        uncompressed_path = os.path.join(self.package_temp_dir, 'uncompressed',
+                                         os.path.basename(url))  # SEE ME
+        package_source_data['nature'] = self.uncompress_tarball(
+            tarball_path, uncompressed_path)
+
+        # remove tarball
+        os.remove(tarball_path)
+
+        if self.tarball_invalid:
+            return None, None
+
+        package_path = self.cleanup_artifact(uncompressed_path)
+        package_source_data = self.extract_metadata(package_path,
+                                                    package_source_data)
+        self.package_source_data.append(package_source_data)
+        return package_source_data, package_path
+
+    def download_generate_hash(self, response, url):
+        """Store the file in a temp directory and compute its content hashes.
+
+        Args:
+            response (Response): Server response for the url
+            url (str): Url of the tarball
+
+        Returns:
+            Tuple of local (filepath, hashes of the file content)
+
+        """
+        length = int(response.headers['content-length'])
+        os.makedirs(self.package_temp_dir, exist_ok=True)
+        # SEE ME
+        filepath = os.path.join(self.package_temp_dir, os.path.basename(url))
+        h = self.write_file(filepath, length, response)
+        self.check_file(filepath, length)
+
+        hashes = {
+            'length': length,
+            **h.hexdigest()
+        }
+        return filepath, hashes
+
+    def write_file(self, filepath, length, response):
+        """Write the server response content to a file, hashing it on the fly.
+
+        """
+        h = MultiHash(length=length)
+        with open(filepath, 'wb') as f:
+            for chunk in response.iter_content(chunk_size=HASH_BLOCK_SIZE):
+                h.update(chunk)
+                f.write(chunk)
+        return h
+
+    def check_file(self, filepath, length):
+        """Check that the downloaded tarball has the expected size.
+
+        """
+        actual_length = os.path.getsize(filepath)
+        if length != actual_length:
+            raise ValueError('Error when checking size: %s != %s' % (
+                length, actual_length))
+
+    def uncompress_tarball(self, filepath, path):
+        """Uncompress a tarball.
+
+        Args:
+            filepath (str): Path of tarball to uncompress
+            path (str): The destination folder where to uncompress the tarball
+        Returns:
+            The nature of the tarball, zip or tar.
+
+        """
+        # filepath = tempdir + url
+        try:
+            self.tarball_invalid = False
+            return tarball.uncompress(filepath, path)
+        except Exception:
+            self.tarball_invalid = True
+            return None
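Usage note (not part of the diff): to make the two optional hooks concrete, here is a minimal sketch of what a subclass could look like. The `NpmLoader` name, the `bin/` and `package.json` layout, and the `swh.loader.package.loader.PackageLoader` import path are illustrative assumptions; the abstract base class is not named in the hunks above.

import json
import os
import shutil

# Assumed import path and class name for the abstract base in loader.py.
from swh.loader.package.loader import PackageLoader


class NpmLoader(PackageLoader):  # hypothetical concrete loader
    """Illustrative loader for npm-style tarballs (package.json metadata)."""
    loader_name = 'npm'

    def cleanup_artifact(self, uncompressed_path):
        # Remove pre-built binaries, which are not to be archived.
        bin_dir = os.path.join(uncompressed_path, 'bin')
        if os.path.isdir(bin_dir):
            shutil.rmtree(bin_dir)
        return uncompressed_path

    def extract_metadata(self, package_path, package_source_data):
        # Parse the file carrying the version metadata (package.json here)
        # and attach it to the version's information dict.
        metadata_file = os.path.join(package_path, 'package.json')
        if os.path.exists(metadata_file):
            with open(metadata_file) as f:
                package_source_data['metadata'] = json.load(f)
        return package_source_data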
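Likewise, a sketch of the per-version flow these helpers enable, assuming a concrete loader instance exposes both `_request` (from download.py) and `_prepare_package_version` (from loader.py), e.g. through inheritance; `loader` and the url below are placeholders:

# `loader` is an instance of a concrete loader; the url is a placeholder.
package_source_data = {'url': 'https://example.org/p/p-1.0.0.tar.gz'}

# Stream the tarball; with throw_error=False a failing query is not fatal.
response = loader._request(package_source_data['url'], throw_error=False)
if response.status_code == 200:
    # Download, check size, uncompress, clean up, and extract metadata.
    data, package_path = loader._prepare_package_version(
        package_source_data, response)
    if package_path is not None:  # (None, None) means the tarball was invalid
        print(data['nature'], package_path)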