D1694.id5780.diff
diff --git a/swh/loader/base/__init__.py b/swh/loader/base/__init__.py
new file mode 100644
diff --git a/swh/loader/base/abstractattribute.py b/swh/loader/base/abstractattribute.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/base/abstractattribute.py
@@ -0,0 +1,26 @@
+# Copyright (C) 2019 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+class AbstractAttribute:
+ """AbstractAttributes in a base class must be overridden by the subclass.
+
+    It is similar to the :func:`abc.abstractmethod` decorator, but for
+    attributes/properties rather than methods, and without the boilerplate
+    of an empty method definition. Like abc.abstractmethod, the class containing
+ AbstractAttributes must inherit from :class:`abc.ABC` or use the
+ :class:`abc.ABCMeta` metaclass.
+
+ Usage example::
+
+ import abc
+ class ClassContainingAnAbstractAttribute(abc.ABC):
+ foo = AbstractAttribute('descriptive docstring for foo')
+
+ """
+ __isabstractmethod__ = True
+
+ def __init__(self, docstring=None):
+ if docstring is not None:
+ self.__doc__ = 'AbstractAttribute: ' + docstring
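
A minimal usage sketch for AbstractAttribute (the class names below are
illustrative, not part of this diff): since each instance sets
__isabstractmethod__, abc.ABCMeta counts it among the abstract members, so the
base class cannot be instantiated until a subclass overrides the attribute::

    import abc

    from swh.loader.base.abstractattribute import AbstractAttribute

    class PackageLoaderBase(abc.ABC):
        # must be overridden by concrete subclasses
        loader_name = AbstractAttribute('name of the package manager')

    class PyPIExample(PackageLoaderBase):
        loader_name = 'pypi'

    # PackageLoaderBase() would raise TypeError (abstract attribute not
    # overridden); PyPIExample() works because loader_name is overridden.
    PyPIExample()
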
diff --git a/swh/loader/base/build.py b/swh/loader/base/build.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/base/build.py
@@ -0,0 +1,110 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+import copy
+import logging
+import os
+
+import arrow
+
+
+logger = logging.getLogger(__name__)
+
+# NOTE: this module is still a work in progress.
+
+# Static setup
+EPOCH = 0
+UTC_OFFSET = 0
+SWH_PERSON = {
+ 'name': 'Software Heritage',
+ 'fullname': 'Software Heritage',
+ 'email': 'robot@softwareheritage.org'
+}
+REVISION_MESSAGE = 'swh-loader-tar: synthetic revision message'
+REVISION_TYPE = 'tar'
+
+
+def _time_from_last_modified(last_modified):
+ """Compute the modification time from the tarpath.
+
+ Args:
+ last_modified (str): Last modification time
+
+ Returns:
+ dict representing a timestamp with keys {seconds, microseconds}
+
+ """
+ last_modified = arrow.get(last_modified)
+ mtime = last_modified.float_timestamp
+ normalized_time = list(map(int, str(mtime).split('.')))
+ return {
+ 'seconds': normalized_time[0],
+ 'microseconds': normalized_time[1]
+ }
+
+
+def compute_revision(tarpath, last_modified):
+ """Compute a revision.
+
+ Args:
+ tarpath (str): absolute path to the tarball
+ last_modified (str): Time of last modification read from the
+ source remote (most probably by the lister)
+
+ Returns:
+ Revision as dict:
+        - date (dict): the modification timestamp as returned by
+          _time_from_last_modified
+        - committer_date: the modification timestamp as returned by
+          _time_from_last_modified
+ - author: cf. SWH_PERSON
+ - committer: cf. SWH_PERSON
+ - type: cf. REVISION_TYPE
+ - message: cf. REVISION_MESSAGE
+
+ """
+ ts = _time_from_last_modified(last_modified)
+
+ return {
+ 'date': {
+ 'timestamp': ts,
+ 'offset': UTC_OFFSET,
+ },
+ 'committer_date': {
+ 'timestamp': ts,
+ 'offset': UTC_OFFSET,
+ },
+ 'author': SWH_PERSON,
+ 'committer': SWH_PERSON,
+ 'type': REVISION_TYPE,
+ 'message': REVISION_MESSAGE,
+ 'synthetic': True,
+ }
+
+
+def set_original_artifact(*, revision, filepath, nature, hashes):
+ """Set the original artifact data on the given revision for
+ the tarball currently being loaded."""
+
+ revision = copy.deepcopy(revision)
+ if 'metadata' not in revision or not revision['metadata']:
+ revision['metadata'] = {}
+ if 'original_artifact' in revision['metadata']:
+ oa = revision['metadata']['original_artifact']
+ if oa:
+ logger.warning(
+ 'Revision already contains original_artifact metadata, '
+ 'replacing: %r',
+ oa,
+ )
+
+ revision['metadata']['original_artifact'] = [{
+ 'name': os.path.basename(filepath),
+ 'archive_type': nature,
+ **hashes,
+ }]
+
+ return revision
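
A sketch of what compute_revision produces; the tarball path and the
last-modified value below are illustrative, not taken from this diff::

    from swh.loader.base.build import compute_revision

    revision = compute_revision(
        tarpath='/tmp/example-0.1.0.tar.gz',     # illustrative path
        last_modified='2019-01-01T00:00:00Z',    # illustrative timestamp
    )
    # revision['date']['timestamp'] == {'seconds': 1546300800,
    #                                   'microseconds': 0}
    # revision['author'] == revision['committer'] == SWH_PERSON
    # revision['type'] == 'tar' and revision['synthetic'] is True
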
diff --git a/swh/loader/base/dowload.py b/swh/loader/base/dowload.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/base/dowload.py
@@ -0,0 +1,308 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import requests
+from .abstractattribute import AbstractAttribute
+
+try:
+ from _version import __version__
+except ImportError:
+ __version__ = 'devel'
+
+
+# This file contains classes to filter out package versions already archived
+# and to download the new package versions.
+
+
+class If_Modified_Since:
+ """Uses if_modified_then header to check for archived packages
+
+
+ """
+ def __init__(self):
+ self.session = requests.session()
+ self.params = {
+ 'headers': {
+ 'User-Agent': 'Software Heritage Tar Loader (%s)' % (
+ __version__
+ )
+ }
+ }
+
+ def known_artifacts(self, last_snapshot):
+ """
+        Retrieve the artifacts of this package already ingested into
+        the archive.
+
+        Args:
+            last_snapshot (dict): Last snapshot for the visit
+
+        Returns:
+            dict: mapping each known artifact url to a list
+            ``[revision id, time of last visit]``
+
+ """
+ if not last_snapshot or 'branches' not in last_snapshot:
+ return {}
+
+        # retrieve only the revision targets (i.e. skip alias branches)
+ revs = [rev['target']
+ for rev in last_snapshot['branches'].values()
+ if rev and rev['target_type'] == 'revision']
+ known_revisions = self.storage.revision_get(revs)
+ ret = {}
+ for revision in known_revisions:
+ if not revision: # revision_get can return None
+ continue
+ if 'original_artifact' in revision['metadata']: # Fix me
+ artifact = revision['metadata']['original_artifact']
+ ret[artifact['url']] = [revision['id'],
+ artifact['time_last_visit']]
+ return ret
+
+ def filter_package_versions(self, tarballs, known_versions):
+ """
+ Return the available versions for the focused package.
+
+ Args:
+ tarballs (list): may be provided by the loader, it enables
+ to filter out versions already ingested in the archive.
+
+ Returns:
+ dict: A dict whose keys are Tuple[version, tarball_sha1] and
+ values dicts with the following entries:
+
+ * **name**: the package name
+ * **version**: the package version
+ * **filename**: the package source tarball filename
+ * **sha1**: the package source tarball sha1 checksum
+ * **date**: the package release date
+ * **url**: the package source tarball download url
+ """
+ # Done version is artifact
+ versions = []
+
+ for release in tarballs:
+ if release['url'] in known_versions:
+
+ tarball_url = release['url']
+ tarball_request = self._request(
+ tarball_url,
+ time_last_visit=known_versions[tarball_url][1],
+ throw_error=False)
+
+ if tarball_request.status_code == 304:
+ continue
+
+ elif tarball_request.status_code == 404:
+ self.log.debug('Tarball url %s returns a 404 error.',
+ tarball_url)
+ continue
+ release['response'] = tarball_request
+ versions.append(release)
+
+ return versions
+
+ def _request(self, url, time_last_visit, throw_error=True):
+ """Request the remote tarball url.
+
+        Args:
+            url (str): Url (file or http*)
+            time_last_visit (str): value for the If-Modified-Since header
+            throw_error (bool): raise on non-200 responses if set
+
+        Raises:
+            ValueError when the query fails and throw_error is set
+
+        Returns:
+            server response
+
+        """
+        self.params['headers']['If-Modified-Since'] = time_last_visit
+        response = self.session.get(url, **self.params, stream=True)
+        if response.status_code != 200 and throw_error:
+            raise ValueError("Failed to query '%s'. Reason: %s" % (
+                url, response.status_code))
+ return response
+
+    def prepare_package_versions(self, tarballs, known_versions=None):
+        """
+        Instantiate a generator that processes one package release per
+        iteration step. For each release it will:
+
+        1. Download the release tarball
+        2. Check the downloaded tarball's integrity
+        3. Uncompress the tarball
+        4. Clean up the uncompressed files (``cleanup_artifact``)
+        5. Extract the package metadata (``extract_metadata``)
+
+        Args:
+            tarballs (list): tarball metadata provided by the loader
+            known_versions (dict): may be provided by the loader; enables
+                filtering out versions already ingested in the archive.
+
+        Yields:
+            Tuple[dict, str]: a dict holding the package source metadata
+            and a string holding the path of the uncompressed package to
+            load into the archive.
+
+        """
+        new_versions = self.filter_package_versions(tarballs, known_versions)
+ for package_source_data in new_versions:
+ tarball_request = package_source_data['response']
+
+ # To make things simple while creating revisions
+ del package_source_data['response']
+ yield self._prepare_package_version(package_source_data,
+ tarball_request)
+
+
+class compare_field:
+ """Uses a field present in the metadata to check for archived packages.
+
+ """
+
+    compare_field = AbstractAttribute("Field used to identify if the package"
+                                      " version is previously archived")
+    # e.g. for the PyPI loader: compare_field = 'sha'
+
+ def __init__(self):
+ self.session = requests.session()
+ self.params = {
+ 'headers': {
+ 'User-Agent': 'Software Heritage Tar Loader (%s)' % (
+ __version__
+ )
+ }
+ }
+
+ def _request(self, url, throw_error=True):
+ """Request the remote tarball url.
+
+        Args:
+            url (str): Url (file or http*)
+            throw_error (bool): raise on non-200 responses if set
+
+        Raises:
+            ValueError when the query fails and throw_error is set
+
+        Returns:
+            server response
+
+        """
+        response = self.session.get(url, **self.params, stream=True)
+        if response.status_code != 200 and throw_error:
+            raise ValueError("Failed to query '%s'. Reason: %s" % (
+ url, response.status_code))
+ return response
+
+ def known_artifacts(self, last_snapshot):
+ """
+        Retrieve the artifacts of this package already ingested into
+        the archive.
+
+        Args:
+            last_snapshot (dict): Last snapshot for the visit
+
+        Returns:
+            dict: mapping the ``compare_field`` value of each known
+            artifact to its revision id.
+
+ """
+ if not last_snapshot or 'branches' not in last_snapshot:
+ return {}
+
+        # retrieve only the revision targets (i.e. skip alias branches)
+ revs = [rev['target']
+ for rev in last_snapshot['branches'].values()
+ if rev and rev['target_type'] == 'revision']
+ known_revisions = self.storage.revision_get(revs)
+ ret = {}
+ for revision in known_revisions:
+ if not revision: # revision_get can return None
+ continue
+ if 'original_artifact' in revision['metadata']: # Fix me
+ artifact = revision['metadata']['original_artifact']
+ ret[artifact[self.compare_field]] = revision['id']
+ return ret
+
+ def filter_package_versions(self, tarballs, known_versions):
+ """
+ Return the available versions for the focused package.
+
+ Args:
+ tarballs (list): may be provided by the loader, it enables
+ to filter out versions already ingested in the archive.
+
+ Returns:
+ dict: A dict whose keys are Tuple[version, tarball_sha1] and
+ values dicts with the following entries:
+
+ * **name**: the package name
+ * **version**: the package version
+ * **filename**: the package source tarball filename
+ * **sha1**: the package source tarball sha1 checksum
+ * **date**: the package release date
+ * **url**: the package source tarball download url
+ """
+ # Done version is artifact
+ versions = []
+
+ for release in tarballs:
+ if release[self.compare_field] in known_versions:
+ continue
+ versions.append(release)
+
+ return versions
+
+ def prepare_package_versions(self, tarballs, known_versions=None):
+ """
+ Instantiate a generator that will process a specific package released
+ version at each iteration step. The following operations will be
+ performed:
+
+ 1. Create a temporary directory to download and extract the
+ release tarball
+ 2. Download the tarball
+ 3. Check downloaded tarball integrity
+ 4. Uncompress the tarball
+ 5. Parse ``package.json`` file associated to the package version
+ 6. Extract author from the parsed ``package.json`` file
+
+ Args:
+ known_versions (dict): may be provided by the loader, it enables
+ to filter out versions already ingested in the archive.
+
+ Yields:
+ Tuple[dict, dict, dict, str]: tuples containing the following
+ members:
+
+ * a dict holding the parsed ``package.json`` file
+ * a dict holding package author information
+ * a dict holding package tarball information
+ * a string holding the path of the uncompressed package to
+ load into the archive
+
+ """
+ new_versions = self.filter_package_versions(tarballs, known_versions)
+ for package_source_data in new_versions:
+            # filter out versions with a missing tarball; the package
+            # visit will be marked as partial at the end of the loading
+            # process
+
+ tarball_url = package_source_data['url']
+ tarball_request = self._request(tarball_url,
+ throw_error=False)
+ if tarball_request.status_code == 404:
+ self.log.debug('Tarball url %s returns a 404 error.',
+ tarball_url)
+ continue
+
+ yield self._prepare_package_version(package_source_data,
+ tarball_request)
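
If_Modified_Since builds on HTTP conditional requests; a standalone sketch of
that pattern with requests, with an illustrative url and date::

    import requests

    session = requests.session()
    headers = {
        'User-Agent': 'Software Heritage Tar Loader (devel)',
        # time of the last visit recorded for this artifact (illustrative)
        'If-Modified-Since': 'Tue, 01 Jan 2019 00:00:00 GMT',
    }

    # illustrative url; a real loader takes it from the lister output
    response = session.get('https://example.org/pkg-1.0.tar.gz',
                           headers=headers, stream=True)

    if response.status_code == 304:
        pass    # unchanged since the last visit: nothing to reload
    elif response.status_code == 404:
        pass    # tarball gone upstream: the visit will be marked partial
    elif response.ok:
        pass    # new or modified artifact: download and ingest it
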
diff --git a/swh/loader/base/loader.py b/swh/loader/base/loader.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/base/loader.py
@@ -0,0 +1,415 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import os
+import shutil
+
+from abc import abstractmethod
+from tempfile import mkdtemp
+
+from swh.core import tarball
+from .abstractattribute import AbstractAttribute
+from swh.loader.core.utils import clean_dangling_folders
+from swh.loader.core.loader import BufferedLoader
+from swh.model.hashutil import MultiHash, HASH_BLOCK_SIZE
+from swh.storage.algos.snapshot import snapshot_get_all_branches
+from swh.model.from_disk import Directory
+
+
+from .build import compute_revision, set_original_artifact
+
+from swh.model.identifiers import (
+ identifier_to_bytes, revision_identifier
+)
+
+DEBUG_MODE = '** DEBUG MODE **'
+
+
+class BaseLoader(BufferedLoader):
+ """
+
+ Required Overrides:
+ loader_name
+ class_name
+ def convert_to_standard_forma
+
+ Optional Overrides:
+ def cleanup_artifact
+ def extract_metadata
+
+ """
+
+ loader_name = AbstractAttribute("Name of the package manager") # e.g pypi
+ class_name = AbstractAttribute("Name of the loader class") # eg PyPILoader
+
+ def __init__(self):
+ super().__init__(logging_class='swh.loader.%s.%s' % (self.loader_name,
+ self.class_name))
+ self.TEMPORARY_DIR_PREFIX_PATTERN = 'swh.loader.%s.' % self.loader_name
+ self.CONFIG_BASE_FILENAME = 'loader/%s' % self.loader_name
+
+ self.ADDITIONAL_CONFIG = {
+ 'temp_directory': ('str',
+ '/tmp/swh.loader.%s/' % self.loader_name),
+ 'cache': ('bool', False),
+ 'cache_dir': ('str', ''),
+ 'debug': ('bool', False), # NOT FOR PRODUCTION
+ }
+
+ self.local_cache = None
+ self.dir_path = None
+
+ temp_directory = self.config['temp_directory']
+ os.makedirs(temp_directory, exist_ok=True)
+
+ self.temp_directory = mkdtemp(
+ suffix='-%s' % os.getpid(),
+ prefix=self.TEMPORARY_DIR_PREFIX_PATTERN,
+ dir=temp_directory)
+
+ self.debug = self.config.get('debug', False)
+
+ @abstractmethod
+    def convert_to_standard_format(self, **kwargs):
+        """Convert the loader input into the standard format expected by
+        this base loader, i.e. a dict of the form::
+
+            {
+                'name': <package name>,
+                'origin_url': <origin url>,
+                'tarballs': [
+                    {'url': <tarball url>},  # 'url' is mandatory
+                    ...
+                ]
+            }
+
+        In the If-Modified-Since case, no 'nature' or 'response' entries
+        are expected here.
+
+        """
+        pass
+
+    def cleanup_artifact(self, uncompressed_path):
+        """Clean up unnecessary files from the downloaded tarball and
+        perform any loader-specific post-processing if needed.
+
+        Returns the path of the package directory to load.
+        """
+        return uncompressed_path
+
+    def extract_metadata(self, package_path, package_source_data):
+        """Fetch the metadata from the downloaded files.
+
+        Returns the (possibly enriched) package source data.
+        """
+        return package_source_data
+
+ # You probably don't need to override anything below this line.
+
+ def prepare_origin_visit(self, **kwargs):
+ """
+ Prepare npm package visit.
+
+ Args:
+ package_name (str): the name of the npm package
+ package_url (str): the url of the package description
+ package_metadata_url (str): the url for the package JSON metadata
+
+ """
+ # reset statuses
+ self._load_status = 'uneventful'
+ self._visit_status = 'full'
+ self.done = False
+        # build the standard package description from the loader input
+ self.package_details = self.convert_to_standard_format(**kwargs)
+ self.origin = {
+ 'url': self.package_details['origin_url'],
+ 'type': self.loader_name,
+ }
+ self.visit_date = None # loader core will populate it
+
+ def prepare(self, **kwargs):
+ """
+ Prepare effective loading of source tarballs for a package manager
+ package.
+
+ Args:
+ package_name (str): the name of the npm package
+ package_url (str): the url of the package description
+ package_metadata_url (str): the url for the package JSON metadata
+ """
+
+ self.contents = []
+ self.directories = []
+ self.revisions = []
+ self.package_temp_dir = os.path.join(self.temp_directory,
+ self.package_details['name'])
+
+ last_snapshot = self.last_snapshot()
+ self.known_artifacts = self.known_artifacts(last_snapshot)
+
+ self.new_artifacts = \
+ self.prepare_package_versions(self.package_details['tarballs'],
+ self.known_artifacts)
+
+ def _prepare_package_version(self, package_source_data, tarball_request):
+ """
+ Return
+ a dict of all the info
+ """
+ url = package_source_data['url']
+ tarball_path, hashes = self.generate_hash(tarball_request, url)
+ uncompressed_path = os.path.join(self.package_temp_dir, 'uncompressed',
+ url) # SEE ME
+ package_source_data['nature'] = self.uncompress_tarball(
+ tarball_path, uncompressed_path)
+
+ # remove tarball
+ os.remove(tarball_path)
+
+ if self.tarball_invalid:
+ return None, None
+
+ package_path = self.cleanup_artifact(uncompressed_path) # also some
+ # special operation if needed
+ package_source_data = self.extract_metadata(package_path,
+ package_source_data)
+ return package_source_data, package_path
+
+ def fetch_data(self):
+ """Called once per release artifact version (can be many for one
+ release).
+
+ This will for each call:
+        - retrieve a release artifact (associated to a release version)
+        - uncompress it and compute the necessary information
+        - compute the swh objects
+
+ Returns:
+ True as long as data to fetch exist
+
+ """
+ data = None
+ if self.done:
+ return False
+
+ try:
+ data = next(self.new_artifacts)
+ self._load_status = 'eventful'
+ except StopIteration:
+ self.done = True
+ return False
+
+ package_source_data, dir_path = data
+ # package release tarball was corrupted
+ if self.tarball_invalid:
+ return not self.done
+
+ dir_path = dir_path.encode('utf-8')
+ directory = Directory.from_disk(path=dir_path, data=True)
+ objects = directory.collect()
+
+ if 'content' not in objects:
+ objects['content'] = {}
+ if 'directory' not in objects:
+ objects['directory'] = {}
+
+ self.contents = objects['content'].values()
+ self.directories = objects['directory'].values()
+
+ '''
+ useless
+ date = normalize_timestamp(
+ int(arrow.get(artifact['date']).timestamp))
+
+ name = release['name'].encode('utf-8')
+ message = release['message'].encode('utf-8')
+ if message:
+ message = b'%s: %s' % (name, message)
+ else:
+ message = name
+ '''
+ filepath = [] # FIX ME
+ nature = []
+ hashes = []
+ revision = self.build_revision(filepath, nature, hashes) # FIX ME
+
+ '''
+ revision = {
+ 'synthetic': True, # ok
+ 'metadata': {
+ 'original_artifact': artifact,
+ 'project': project_info,
+ },
+ 'author': author, # ok
+ 'date': date, # ok
+ 'committer': author, # ok
+ 'committer_date': date, # ok
+ 'message': message, # ok
+ 'directory': directory.hash,
+ 'parents': [], # why is this needed
+ 'type': 'tar', # ok
+ }
+ '''
+
+ revision['id'] = identifier_to_bytes(
+ revision_identifier(revision))
+ self.revisions.append(revision)
+
+        # Change me to be compatible with If-Modified-Since
+ package_key = package_source_data[self.compare_field] # check for this
+ self.known_artifacts[package_key] = revision['id'] # SEE ME
+
+ self.log.debug('Removing unpacked package files at %s', dir_path)
+ shutil.rmtree(dir_path)
+
+ return not self.done
+
+ def build_revision(self, filepath, nature, hashes):
+ """Build the revision with identifier
+
+ We use the `last_modified` date provided by the caller to
+ build the revision.
+
+ """
+ return set_original_artifact(
+ revision=compute_revision(filepath, self.last_modified),
+ filepath=filepath,
+ nature=nature,
+ hashes=hashes,
+ )
+
+ def last_snapshot(self):
+ """Retrieve the last snapshot of the package if any.
+
+ """
+ # Done
+ visit = self.storage.origin_visit_get_latest(
+ self.origin['url'], require_snapshot=True)
+ if visit:
+ return snapshot_get_all_branches(self.storage, visit['snapshot'])
+
+ def store_data(self):
+ """Store fetched data in the database.
+
+ """
+ # Done
+ self.maybe_load_contents(self.contents)
+ self.maybe_load_directories(self.directories)
+ self.maybe_load_revisions(self.revisions)
+
+ if self.done:
+ self.generate_and_load_snapshot()
+ self.flush()
+
+ def generate_and_load_snapshot(self):
+ """
+ Make me
+ """
+ pass
+
+ def generate_hash(self, response, url):
+ """Store file in temp directory and computes hash of its filepath
+
+ Args:
+ response (Response): Server response of the url
+ url (str): Url of the tarball
+
+ Returns:
+ Tuple of local (filepath, hashes of filepath)
+
+ """
+ # Done Update docstring
+ length = int(response.headers['content-length'])
+
+ # SEE ME
+ filepath = os.path.join(self.package_temp_dir, os.path.basename(url))
+
+ h = MultiHash(length=length)
+ with open(filepath, 'wb') as f:
+ for chunk in response.iter_content(chunk_size=HASH_BLOCK_SIZE):
+ h.update(chunk)
+ f.write(chunk)
+
+ actual_length = os.path.getsize(filepath)
+ if length != actual_length:
+ raise ValueError('Error when checking size: %s != %s' % (
+ length, actual_length))
+
+ hashes = {
+ 'length': length,
+ **h.hexdigest()
+ }
+ return filepath, hashes
+
+ def uncompress_tarball(self, filepath, path):
+ """Uncompress a tarball
+
+ Args:
+ filepath (str): Path of tarball to uncompress
+ path (str): The destination folder where to uncompress the tarball
+ Returns:
+ The nature of the tarball, zip or tar.
+
+ """
+ # Done
+ # filepath = tempdir + url
+ try:
+ self.tarball_invalid = False
+ return tarball.uncompress(filepath, path)
+ except Exception:
+ self.tarball_invalid = True
+ return None
+
+ def pre_cleanup(self):
+ """To prevent disk explosion if some other workers exploded
+ in mid-air (OOM killed), we try and clean up dangling files.
+
+ """
+ # Done
+ if self.debug:
+ self.log.warn('%s Will not pre-clean up temp dir %s' % (
+ DEBUG_MODE, self.temp_directory
+ ))
+ return
+ clean_dangling_folders(self.config['temp_directory'],
+ pattern_check=self.TEMPORARY_DIR_PREFIX_PATTERN,
+ log=self.log)
+
+ def flush(self):
+ """Flush any potential dangling data not sent to swh-storage.
+
+        Bypass the maybe_load_* methods, which wait for a threshold to be
+        reached before sending: at this point we are done loading and want
+        to store everything that is still pending.
+
+ """
+ # Done
+ contents = self.contents.pop()
+ directories = self.directories.pop()
+ revisions = self.revisions.pop()
+ releases = self.releases.pop()
+
+ # and send those to storage if asked
+ if self.config['send_contents']:
+ self.send_batch_contents(contents)
+        if self.config['send_directories']:
+ self.send_batch_directories(directories)
+ if self.config['send_revisions']:
+ self.send_batch_revisions(revisions)
+ if self.config['send_releases']:
+ self.send_batch_releases(releases)
+ if self.config['send_snapshot'] and self.snapshot:
+ self.send_snapshot(self.snapshot)
+
+ def cleanup(self):
+ """Clean up temporary disk use after downloading and extracting
+ package tarballs.
+
+ """
+ # Done
+ if self.debug:
+ self.log.warn('%s Will not clean up temp dir %s' % (
+ DEBUG_MODE, self.temp_directory
+ ))
+ return
+ if os.path.exists(self.temp_directory):
+ self.log.debug('Clean up %s' % self.temp_directory)
+ shutil.rmtree(self.temp_directory)
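
generate_and_load_snapshot above is still a placeholder; a hedged sketch of
one possible shape, assuming the compare_field workflow (self.known_artifacts
maps artifact keys to revision ids) and an illustrative releases/<key> branch
naming, with the snapshot left on self.snapshot for flush() to send::

    from swh.model.identifiers import (
        identifier_to_bytes, snapshot_identifier
    )

    def generate_and_load_snapshot(self):
        branches = {}
        for key, revision_id in self.known_artifacts.items():
            # branch naming scheme is illustrative, not settled in this diff
            branch_name = ('releases/%s' % key).encode('utf-8')
            branches[branch_name] = {
                'target': revision_id,
                'target_type': 'revision',
            }
        snapshot = {'branches': branches}
        snapshot['id'] = identifier_to_bytes(snapshot_identifier(snapshot))
        # flush() sends self.snapshot when config['send_snapshot'] is set
        self.snapshot = snapshot
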
diff --git a/swh/loader/base/tests/__init__.py b/swh/loader/base/tests/__init__.py
new file mode 100644
diff --git a/swh/loader/base/tests/test_download.py b/swh/loader/base/tests/test_download.py
new file mode 100644
diff --git a/swh/loader/base/tests/test_loader.py b/swh/loader/base/tests/test_loader.py
new file mode 100644
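
The test modules are added empty; for reference, a sketch of the concrete
loader shape the BaseLoader docstring calls for (all names below are
hypothetical, and the mixin wiring is not finalized in this revision)::

    from swh.loader.base.dowload import compare_field
    from swh.loader.base.loader import BaseLoader

    class ExampleLoader(compare_field, BaseLoader):
        """Hypothetical concrete loader."""

        loader_name = 'example'          # name of the package manager
        class_name = 'ExampleLoader'     # name of the loader class
        compare_field = 'sha256'         # field identifying known artifacts

        def convert_to_standard_format(self, name, origin_url, tarballs):
            # tarballs: list of dicts, each with at least a 'url' key
            return {
                'name': name,
                'origin_url': origin_url,
                'tarballs': tarballs,
            }
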
