Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7124173
D1694.id5687.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
11 KB
Subscribers
None
D1694.id5687.diff
View Options
diff --git a/swh/loader/base/__init__.py b/swh/loader/base/__init__.py
new file mode 100644
diff --git a/swh/loader/base/abstractattribute.py b/swh/loader/base/abstractattribute.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/base/abstractattribute.py
@@ -0,0 +1,26 @@
+# Copyright (C) 2017 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+class AbstractAttribute:
+ """AbstractAttributes in a base class must be overridden by the subclass.
+
+ It's like the :func:`abc.abstractmethod` decorator, but for things that
+ are explicitly attributes/properties, not methods, without the need for
+ empty method def boilerplate. Like abc.abstractmethod, the class containing
+ AbstractAttributes must inherit from :class:`abc.ABC` or use the
+ :class:`abc.ABCMeta` metaclass.
+
+ Usage example::
+
+ import abc
+ class ClassContainingAnAbstractAttribute(abc.ABC):
+ foo = AbstractAttribute('descriptive docstring for foo')
+
+ """
+ __isabstractmethod__ = True
+
+ def __init__(self, docstring=None):
+ if docstring is not None:
+ self.__doc__ = 'AbstractAttribute: ' + docstring
diff --git a/swh/loader/base/dowload.py b/swh/loader/base/dowload.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/base/dowload.py
@@ -0,0 +1,166 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import requests
+from .abstractattribute import AbstractAttribute
+
+try:
+ from _version import __version__
+except ImportError:
+ __version__ = 'devel'
+
+
+# This will check and download the artifact, then send it to the hash function.
+
+'''
+class ArchiveFetcher:
+ """Http/Local client in charge of downloading archives from a
+ remote/local server.
+
+ Args:
+ temp_directory (str): Path to the temporary disk location used
+ for downloading the release artifacts
+
+ """
+ def __init__(self):
+ self.session = requests.session()
+ self.params = {
+ 'headers': {
+ 'User-Agent': 'Software Heritage Tar Loader (%s)' % (
+ __version__
+ )
+ }
+ }
+
+ def download(self, url):
+ """Download the remote tarball url locally.
+
+ Args:
+ url (str): Url (file or http*)
+
+ Raises:
+ ValueError in case of failing to query
+
+ Returns:
+ Tuple of local (filepath, hashes of filepath)
+
+ """
+ response = self.session.get(url, **self.params, stream=True)
+ if response.status_code != 200:
+ raise ValueError("Fail to query '%s'. Reason: %s" % (
+ url, response.status_code))
+'''
+
+
+class if_modified_then:
+ """
+
+
+ """
+ def __init__(self):
+ self.session = requests.session()
+ self.params = {
+ 'headers': {
+ 'User-Agent': 'Software Heritage Tar Loader (%s)' % (
+ __version__
+ )
+ }
+ }
+
+ def make_request(self, url):
+ """Request the remote tarball url.
+
+ Args:
+ url (str): Url (file or http*)
+
+ Raises:
+ ValueError in case of failing to query
+
+ Returns:
+ Tuple of local (filepath, hashes of filepath)
+
+ """
+ response = self.session.get(url, **self.params, stream=True)
+ if response.status_code != 200:
+ raise ValueError("Fail to query '%s'. Reason: %s" % (
+ url, response.status_code))
+ return self.generate_hash(response, url)
+
+ def filter_release_artifact():
+ pass
+
+
+class compare_field:
+ """
+
+ """
+
+ compare_field = AbstractAttribute("Field used to identify if the package"
+ "version is previously archived")
+ # eg for pypi loader compare_field = 'sha'
+
+ def __init__(self):
+ self.session = requests.session()
+ self.params = {
+ 'headers': {
+ 'User-Agent': 'Software Heritage Tar Loader (%s)' % (
+ __version__
+ )
+ }
+ }
+
+ def download_new_releases():
+ pass
+
+ def make_request(self, url):
+ """Request the remote tarball url locally.
+
+ Args:
+ url (str): Url (file or http*)
+
+ Raises:
+ ValueError in case of failing to query
+
+ Returns:
+ Tuple of local (filepath, hashes of filepath)
+
+ """
+ response = self.session.get(url, **self.params, stream=True)
+ if response.status_code != 200 and response.status_code != 304:
+ raise ValueError("Fail to query '%s'. Reason: %s" % (
+ url, response.status_code))
+ return self.generate_hash(response, url)
+
+ def filter_release_artifacts():
+ pass
+
+
+'''
+NOTE: this is implemented in the loader but called from here.
+def generate_hash(self, response):
+ """Store file in temp directory and generate its hash
+
+ """
+ length = int(response.headers['content-length'])
+
+ filepath = os.path.join(self.temp_directory, os.path.basename(url))
+
+ h = MultiHash(length=length)
+ with open(filepath, 'wb') as f:
+ for chunk in response.iter_content(chunk_size=HASH_BLOCK_SIZE):
+ h.update(chunk)
+ f.write(chunk)
+
+ actual_length = os.path.getsize(filepath)
+ if length != actual_length:
+ raise ValueError('Error when checking size: %s != %s' % (
+ length, actual_length))
+
+ hashes = {
+ 'length': length,
+ **h.hexdigest()
+ }
+ return filepath, hashes
+'''
diff --git a/swh/loader/base/loader.py b/swh/loader/base/loader.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/base/loader.py
@@ -0,0 +1,186 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import os
+import shutil
+
+from abc import abstractmethod
+
+from .abstractattribute import AbstractAttribute
+from swh.loader.core.utils import clean_dangling_folders
+from swh.loader.core.loader import BufferedLoader
+from swh.model.hashutil import MultiHash, HASH_BLOCK_SIZE
+from swh.storage.algos.snapshot import snapshot_get_all_branches
+
+DEBUG_MODE = '** DEBUG MODE **'
+
+
+class BaseLoader(BufferedLoader):
+
+ loader_name = AbstractAttribute("Name of the package manager") # e.g pypi
+ class_name = AbstractAttribute("Name of the loader class")
+
+ def __init__(self):
+ super().__init__(logging_class='swh.loader.%s.%s' % (self.loader_name,
+ self.class_name))
+ self.TEMPORARY_DIR_PREFIX_PATTERN = 'swh.loader.%s.' % self.loader_name
+ self.CONFIG_BASE_FILENAME = 'loader/%s' % self.loader_name
+
+ self.ADDITIONAL_CONFIG = {
+ 'temp_directory': ('str',
+ '/tmp/swh.loader.%s/' % self.loader_name),
+ 'cache': ('bool', False),
+ 'cache_dir': ('str', ''),
+ 'debug': ('bool', False), # NOT FOR PRODUCTION
+ }
+ pass
+
+ @abstractmethod
+ def convert_to_standard_format():
+ """
+
+ """
+ pass
+
+ def cleanup_artifact():
+ """
+
+ """
+ pass
+
+ def extract_metadata(self):
+ """
+
+ """
+ pass
+
+ def prepare_origin_visit(self):
+ """
+
+ """
+ pass
+
+ def prepare(self):
+ """
+
+ """
+ pass
+
+ def known_artifact(self):
+ """
+
+ """
+ pass
+
+ def fetch_data(self):
+ """
+
+ """
+ pass
+
+ def last_snapshot(self):
+ """Retrieve the last snapshot of the package if any.
+
+ """
+ # Done
+ visit = self.storage.origin_visit_get_latest(
+ self.origin['url'], require_snapshot=True)
+ if visit:
+ return snapshot_get_all_branches(self.storage, visit['snapshot'])
+
+ def store_data(self):
+ """Store fetched data in the database.
+
+ """
+ # Done
+ self.maybe_load_contents(self.package_contents)
+ self.maybe_load_directories(self.package_directories)
+ self.maybe_load_revisions(self.package_revisions)
+
+ if self.done:
+ self._generate_and_load_snapshot()
+ self.flush()
+
+ def generate_hash(self, response, url):
+ """Store file in temp directory and generate its hash
+
+ """
+        # Done; inline comments still to be added.
+ length = int(response.headers['content-length'])
+
+ filepath = os.path.join(self.temp_directory, os.path.basename(url))
+
+ h = MultiHash(length=length)
+ with open(filepath, 'wb') as f:
+ for chunk in response.iter_content(chunk_size=HASH_BLOCK_SIZE):
+ h.update(chunk)
+ f.write(chunk)
+
+ actual_length = os.path.getsize(filepath)
+ if length != actual_length:
+ raise ValueError('Error when checking size: %s != %s' % (
+ length, actual_length))
+
+ hashes = {
+ 'length': length,
+ **h.hexdigest()
+ }
+ return filepath, hashes
+
+ def pre_cleanup(self):
+ """To prevent disk explosion if some other workers exploded
+ in mid-air (OOM killed), we try and clean up dangling files.
+
+ """
+ # Done
+ if self.debug:
+ self.log.warn('%s Will not pre-clean up temp dir %s' % (
+ DEBUG_MODE, self.temp_directory
+ ))
+ return
+ clean_dangling_folders(self.config['temp_directory'],
+ pattern_check=self.TEMPORARY_DIR_PREFIX_PATTERN,
+ log=self.log)
+
+ def flush(self):
+ """Flush any potential dangling data not sent to swh-storage.
+
+ Bypass the maybe_load_* methods which awaits threshold reached
+ signal. We actually want to store those as we are done
+ loading.
+
+ """
+ # Done
+ contents = self.contents.pop()
+ directories = self.directories.pop()
+ revisions = self.revisions.pop()
+ releases = self.releases.pop()
+
+ # and send those to storage if asked
+ if self.config['send_contents']:
+ self.send_batch_contents(contents)
+ if self.config['send_contents']:
+ self.send_batch_directories(directories)
+ if self.config['send_revisions']:
+ self.send_batch_revisions(revisions)
+ if self.config['send_releases']:
+ self.send_batch_releases(releases)
+ if self.config['send_snapshot'] and self.snapshot:
+ self.send_snapshot(self.snapshot)
+
+ def cleanup(self):
+ """Clean up temporary disk use after downloading and extracting
+        the source package tarballs.
+
+ """
+ # Done
+ if self.debug:
+ self.log.warn('%s Will not clean up temp dir %s' % (
+ DEBUG_MODE, self.temp_directory
+ ))
+ return
+ if os.path.exists(self.temp_directory):
+ self.log.debug('Clean up %s' % self.temp_directory)
+ shutil.rmtree(self.temp_directory)
diff --git a/swh/loader/base/tests/__init__.py b/swh/loader/base/tests/__init__.py
new file mode 100644
diff --git a/swh/loader/base/tests/test_download.py b/swh/loader/base/tests/test_download.py
new file mode 100644
diff --git a/swh/loader/base/tests/test_loader.py b/swh/loader/base/tests/test_loader.py
new file mode 100644
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Dec 20 2024, 9:43 PM (11 w, 4 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3218202
Attached To
D1694: swh.loader.package: Implement a method to prepare package visit
Event Timeline
Log In to Comment