Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7124202
D1694.id5688.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
11 KB
Subscribers
None
D1694.id5688.diff
View Options
diff --git a/swh/loader/base/__init__.py b/swh/loader/base/__init__.py
new file mode 100644
diff --git a/swh/loader/base/abstractattribute.py b/swh/loader/base/abstractattribute.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/base/abstractattribute.py
@@ -0,0 +1,26 @@
+# Copyright (C) 2017 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+class AbstractAttribute:
+    """AbstractAttributes in a base class must be overridden by the subclass.
+
+    It's like the :func:`abc.abstractmethod` decorator, but for things that
+    are explicitly attributes/properties, not methods, without the need for
+    empty method def boilerplate. Like abc.abstractmethod, the class containing
+    AbstractAttributes must inherit from :class:`abc.ABC` or use the
+    :class:`abc.ABCMeta` metaclass.
+
+    Usage example::
+
+        import abc
+        class ClassContainingAnAbstractAttribute(abc.ABC):
+            foo = AbstractAttribute('descriptive docstring for foo')
+
+    """
+    # This flag is what abc.ABCMeta inspects: any class attribute with
+    # __isabstractmethod__ = True blocks instantiation of subclasses that
+    # have not overridden it, exactly like an @abstractmethod.
+    __isabstractmethod__ = True
+
+    def __init__(self, docstring=None):
+        # Prefix the docstring so help()/introspection output makes clear
+        # this is a placeholder that must be overridden, not a real value.
+        if docstring is not None:
+            self.__doc__ = 'AbstractAttribute: ' + docstring
diff --git a/swh/loader/base/download.py b/swh/loader/base/download.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/base/download.py
@@ -0,0 +1,130 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import requests
+from .abstractattribute import AbstractAttribute
+
+try:
+ from _version import __version__
+except ImportError:
+ __version__ = 'devel'
+
+
+# This file contains methods to check and remove archived package version
+# and download the new package version.
+
+
+class if_modified_then:
+    """Detect already-archived package versions via HTTP conditional
+    requests: a 304 response means the remote artifact is unchanged.
+
+    NOTE(review): presumably this relies on the ``If-Modified-Since``
+    header, but no such header is ever set in ``self.params`` yet, so the
+    server can never answer 304 -- TODO confirm/complete. Also, class
+    names are conventionally PascalCase (PEP 8).
+    """
+    def __init__(self):
+        # One shared session so connections are reused across downloads.
+        self.session = requests.session()
+        self.params = {
+            'headers': {
+                'User-Agent': 'Software Heritage Tar Loader (%s)' % (
+                    __version__
+                )
+            }
+        }
+
+    def download(self, url):
+        """Download the remote tarball.
+
+        Args:
+            url (str): Url (file or http*)
+
+        Raises:
+            ValueError in case of failing to query
+
+        Returns:
+            Tuple of local (filepath, hashes of filepath), or None if the
+            server answered 304 Not Modified (nothing new to fetch)
+
+        """
+        response = self.session.get(url, **self.params, stream=True)
+        # Anything other than 200 (new content) or 304 (unchanged) is an
+        # error for our purposes.
+        if response.status_code != 200 and response.status_code != 304:
+            raise ValueError("Fail to query '%s'. Reason: %s" % (
+                url, response.status_code))
+        if response.status_code != 304:
+            # generate_hash is provided by the loader class mixing this in
+            # (see loader.py); it stores the file and computes its hashes.
+            return self.generate_hash(response, url)
+        return None
+
+    def filter_release_artifact():
+        # NOTE(review): stub; missing the `self` parameter -- calling it on
+        # an instance would raise TypeError once implemented.
+        pass
+
+
+class compare_field:
+    """Detect already-archived package versions by comparing a field of the
+    upstream metadata (e.g. a hash) against what was previously loaded.
+
+    NOTE(review): class names are conventionally PascalCase (PEP 8).
+    """
+
+    # NOTE(review): implicit string concatenation below is missing a space,
+    # producing "...if the packageversion is previously archived".
+    compare_field = AbstractAttribute("Field used to identify if the package"
+                                      "version is previously archived")
+    # eg for pypi loader compare_field = 'sha'
+
+    def __init__(self):
+        # One shared session so connections are reused across downloads.
+        self.session = requests.session()
+        self.params = {
+            'headers': {
+                'User-Agent': 'Software Heritage Tar Loader (%s)' % (
+                    __version__
+                )
+            }
+        }
+
+    def download_new_releases():
+        # NOTE(review): stub; missing the `self` parameter.
+        pass
+
+    def download(self, url):
+        """Download the remote tarball.
+
+        Args:
+            url (str): Url (file or http*)
+
+        Raises:
+            ValueError in case of failing to query
+
+        Returns:
+            Tuple of local (filepath, hashes of filepath)
+
+        """
+        response = self.session.get(url, **self.params, stream=True)
+        if response.status_code != 200:
+            raise ValueError("Fail to query '%s'. Reason: %s" % (
+                url, response.status_code))
+        return self.generate_hash(response, url) # method present in loader.py
+
+    def filter_release_artifacts():
+        # NOTE(review): stub; missing the `self` parameter.
+        pass
+
+
+# NOTE(review): dead code -- generate_hash is implemented for real in
+# swh/loader/base/loader.py. Keeping a copy here inside a module-level
+# string literal will drift out of date; it should be removed before
+# landing.
+'''
+This is implemented in loader but called here
+def generate_hash(self, response):
+    """Store file in temp directory and generate its hash
+
+    """
+    length = int(response.headers['content-length'])
+
+    filepath = os.path.join(self.temp_directory, os.path.basename(url))
+
+    h = MultiHash(length=length)
+    with open(filepath, 'wb') as f:
+        for chunk in response.iter_content(chunk_size=HASH_BLOCK_SIZE):
+            h.update(chunk)
+            f.write(chunk)
+
+    actual_length = os.path.getsize(filepath)
+    if length != actual_length:
+        raise ValueError('Error when checking size: %s != %s' % (
+            length, actual_length))
+
+    hashes = {
+        'length': length,
+        **h.hexdigest()
+    }
+    return filepath, hashes
+'''
diff --git a/swh/loader/base/loader.py b/swh/loader/base/loader.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/base/loader.py
@@ -0,0 +1,207 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import os
+import shutil
+
+from abc import abstractmethod
+
+from .abstractattribute import AbstractAttribute
+from swh.loader.core.utils import clean_dangling_folders
+from swh.loader.core.loader import BufferedLoader
+from swh.model.hashutil import MultiHash, HASH_BLOCK_SIZE
+from swh.storage.algos.snapshot import snapshot_get_all_branches
+
+DEBUG_MODE = '** DEBUG MODE **'
+
+
+class BaseLoader(BufferedLoader):
+    """Base class shared by package-manager loaders (e.g. PyPI): handles
+    configuration, hashing of downloaded tarballs, last-snapshot lookup,
+    sending loaded objects to storage, and temp-directory cleanup.
+
+    Required Overrides:
+        loader_name
+        class_name
+        def convert_to_standard_format
+
+    Optional Overrides:
+        def cleanup_artifact
+        def extract_metadata
+
+    """
+
+    loader_name = AbstractAttribute("Name of the package manager") # e.g pypi
+    class_name = AbstractAttribute("Name of the loader class") # eg PyPILoader
+
+    def __init__(self):
+        super().__init__(logging_class='swh.loader.%s.%s' % (self.loader_name,
+                                                             self.class_name))
+        self.TEMPORARY_DIR_PREFIX_PATTERN = 'swh.loader.%s.' % self.loader_name
+        self.CONFIG_BASE_FILENAME = 'loader/%s' % self.loader_name
+
+        # NOTE(review): the base loader presumably merges ADDITIONAL_CONFIG
+        # while parsing configuration inside super().__init__(); assigning
+        # it only afterwards may mean these defaults are never applied --
+        # TODO confirm against BufferedLoader.
+        self.ADDITIONAL_CONFIG = {
+            'temp_directory': ('str',
+                               '/tmp/swh.loader.%s/' % self.loader_name),
+            'cache': ('bool', False),
+            'cache_dir': ('str', ''),
+            'debug': ('bool', False), # NOT FOR PRODUCTION
+        }
+        # NOTE(review): this trailing `pass` is redundant and can be dropped.
+        pass
+
+    @abstractmethod
+    def convert_to_standard_format():
+        """Convert package-manager-specific metadata into the common format
+        the loader machinery expects. Must be overridden.
+
+        NOTE(review): missing the `self` parameter -- calling it on an
+        instance would raise TypeError once implemented.
+        """
+        pass
+
+    def cleanup_artifact():
+        """Optional hook to filter or clean up fetched artifacts.
+
+        NOTE(review): missing the `self` parameter.
+        """
+        pass
+
+    def extract_metadata(self):
+        """Fetch the metadata from the downloaded file
+
+        (Optional hook; not implemented in this revision.)
+        """
+        pass
+
+    # You probably don't need to override anything below this line.
+
+    def prepare_origin_visit(self):
+        """Stub: not implemented in this revision.
+
+        """
+        pass
+
+    def prepare(self):
+        """Stub: not implemented in this revision.
+
+        """
+        pass
+
+    def known_artifact(self):
+        """Stub: not implemented in this revision.
+
+        """
+        pass
+
+    def fetch_data(self):
+        """Stub: not implemented in this revision.
+
+        """
+        pass
+
+    def last_snapshot(self):
+        """Retrieve the last snapshot of the package if any.
+
+        Returns None implicitly when the origin has no previous visit with
+        a snapshot.
+        """
+        # Done
+        visit = self.storage.origin_visit_get_latest(
+            self.origin['url'], require_snapshot=True)
+        if visit:
+            return snapshot_get_all_branches(self.storage, visit['snapshot'])
+
+    def store_data(self):
+        """Store fetched data in the database.
+
+        """
+        # Done
+        self.maybe_load_contents(self.package_contents)
+        self.maybe_load_directories(self.package_directories)
+        self.maybe_load_revisions(self.package_revisions)
+
+        # Only snapshot once everything has been loaded; flush() then pushes
+        # anything still buffered below the send thresholds.
+        if self.done:
+            self._generate_and_load_snapshot()
+            self.flush()
+
+    def generate_hash(self, response, url):
+        """Store file in temp directory and computes hash of its filepath
+
+        Args:
+            response (Response): Server response of the url
+            url (str): Url of the tarball
+
+        Raises:
+            ValueError if the downloaded size disagrees with the server's
+            content-length header (truncated/corrupt download)
+
+        Returns:
+            Tuple of local (filepath, hashes of filepath)
+
+        """
+        # Done
+        length = int(response.headers['content-length'])
+
+        # NOTE(review): files with the same basename from different urls
+        # would overwrite each other in temp_directory -- TODO confirm this
+        # cannot happen within one visit.
+        filepath = os.path.join(self.temp_directory, os.path.basename(url))
+
+        # Hash while streaming to disk so the file is read only once.
+        h = MultiHash(length=length)
+        with open(filepath, 'wb') as f:
+            for chunk in response.iter_content(chunk_size=HASH_BLOCK_SIZE):
+                h.update(chunk)
+                f.write(chunk)
+
+        actual_length = os.path.getsize(filepath)
+        if length != actual_length:
+            raise ValueError('Error when checking size: %s != %s' % (
+                length, actual_length))
+
+        hashes = {
+            'length': length,
+            **h.hexdigest()
+        }
+        return filepath, hashes
+
+    def pre_cleanup(self):
+        """To prevent disk explosion if some other workers exploded
+        in mid-air (OOM killed), we try and clean up dangling files.
+
+        """
+        # Done
+        if self.debug:
+            # NOTE(review): logger.warn is a deprecated alias; prefer
+            # self.log.warning.
+            self.log.warn('%s Will not pre-clean up temp dir %s' % (
+                DEBUG_MODE, self.temp_directory
+            ))
+            return
+        clean_dangling_folders(self.config['temp_directory'],
+                               pattern_check=self.TEMPORARY_DIR_PREFIX_PATTERN,
+                               log=self.log)
+
+    def flush(self):
+        """Flush any potential dangling data not sent to swh-storage.
+
+        Bypass the maybe_load_* methods which awaits threshold reached
+        signal. We actually want to store those as we are done
+        loading.
+
+        """
+        # Done
+        contents = self.contents.pop()
+        directories = self.directories.pop()
+        revisions = self.revisions.pop()
+        releases = self.releases.pop()
+
+        # and send those to storage if asked
+        if self.config['send_contents']:
+            self.send_batch_contents(contents)
+        # NOTE(review): bug -- this second check repeats 'send_contents';
+        # it should test 'send_directories' before sending directories.
+        if self.config['send_contents']:
+            self.send_batch_directories(directories)
+        if self.config['send_revisions']:
+            self.send_batch_revisions(revisions)
+        if self.config['send_releases']:
+            self.send_batch_releases(releases)
+        if self.config['send_snapshot'] and self.snapshot:
+            self.send_snapshot(self.snapshot)
+
+    def cleanup(self):
+        """Clean up temporary disk use after downloading and extracting
+        package tarballs.
+
+        """
+        # Done
+        if self.debug:
+            # NOTE(review): logger.warn is a deprecated alias; prefer
+            # self.log.warning.
+            self.log.warn('%s Will not clean up temp dir %s' % (
+                DEBUG_MODE, self.temp_directory
+            ))
+            return
+        if os.path.exists(self.temp_directory):
+            self.log.debug('Clean up %s' % self.temp_directory)
+            shutil.rmtree(self.temp_directory)
diff --git a/swh/loader/base/tests/__init__.py b/swh/loader/base/tests/__init__.py
new file mode 100644
diff --git a/swh/loader/base/tests/test_download.py b/swh/loader/base/tests/test_download.py
new file mode 100644
diff --git a/swh/loader/base/tests/test_loader.py b/swh/loader/base/tests/test_loader.py
new file mode 100644
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Dec 21 2024, 12:10 AM (11 w, 4 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3225265
Attached To
D1694: swh.loader.package: Implement a method to prepare package visit
Event Timeline
Log In to Comment