D1694.id5687.diff

diff --git a/swh/loader/base/__init__.py b/swh/loader/base/__init__.py
new file mode 100644
diff --git a/swh/loader/base/abstractattribute.py b/swh/loader/base/abstractattribute.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/base/abstractattribute.py
@@ -0,0 +1,26 @@
+# Copyright (C) 2017 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+class AbstractAttribute:
+    """AbstractAttributes in a base class must be overridden by the subclass.
+
+    It's like the :func:`abc.abstractmethod` decorator, but for things that
+    are explicitly attributes/properties, not methods, without the need for
+    empty method def boilerplate. Like abc.abstractmethod, the class containing
+    AbstractAttributes must inherit from :class:`abc.ABC` or use the
+    :class:`abc.ABCMeta` metaclass.
+
+    Usage example::
+
+        import abc
+        class ClassContainingAnAbstractAttribute(abc.ABC):
+            foo = AbstractAttribute('descriptive docstring for foo')
+
+    """
+    __isabstractmethod__ = True
+
+    def __init__(self, docstring=None):
+        if docstring is not None:
+            self.__doc__ = 'AbstractAttribute: ' + docstring
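Since AbstractAttribute sets __isabstractmethod__, abc.ABCMeta counts it among a class's abstract methods, so instantiation fails until a subclass overrides the attribute. A minimal sketch of that enforcement (class names here are illustrative):

    import abc

    class Incomplete(abc.ABC):
        foo = AbstractAttribute('foo must be set by subclasses')

    class Complete(Incomplete):
        foo = 42

    # Incomplete()  # would raise TypeError: can't instantiate abstract class
    Complete()      # fine: foo has been overridden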
diff --git a/swh/loader/base/download.py b/swh/loader/base/download.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/base/download.py
@@ -0,0 +1,166 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import abc
+
+import requests
+
+from .abstractattribute import AbstractAttribute
+
+try:
+    from _version import __version__
+except ImportError:
+    __version__ = 'devel'
+
+
+# This will check for and download the artifact, then send it to the hash
+# function
+
+'''
+class ArchiveFetcher:
+    """Http/Local client in charge of downloading archives from a
+    remote/local server.
+
+    Args:
+        temp_directory (str): Path to the temporary disk location used
+            for downloading the release artifacts
+
+    """
+    def __init__(self):
+        self.session = requests.session()
+        self.params = {
+            'headers': {
+                'User-Agent': 'Software Heritage Tar Loader (%s)' % (
+                    __version__
+                )
+            }
+        }
+
+    def download(self, url):
+        """Download the remote tarball url locally.
+
+        Args:
+            url (str): Url (file or http*)
+
+        Raises:
+            ValueError in case of failing to query
+
+        Returns:
+            Tuple of local (filepath, hashes of filepath)
+
+        """
+        response = self.session.get(url, **self.params, stream=True)
+        if response.status_code != 200:
+            raise ValueError("Fail to query '%s'. Reason: %s" % (
+                url, response.status_code))
+'''
+
+
+class if_modified_then:
+    """Download strategy for package managers where already archived
+    versions can be detected through the request itself, e.g. when the
+    artifact was not modified since the last visit.
+
+    """
+    def __init__(self):
+        self.session = requests.session()
+        self.params = {
+            'headers': {
+                'User-Agent': 'Software Heritage Tar Loader (%s)' % (
+                    __version__
+                )
+            }
+        }
+
+    def make_request(self, url):
+        """Request the remote tarball url.
+
+        Args:
+            url (str): Url (file or http*)
+
+        Raises:
+            ValueError in case of failing to query
+
+        Returns:
+            Tuple of local (filepath, hashes of filepath)
+
+        """
+        response = self.session.get(url, **self.params, stream=True)
+        if response.status_code != 200:
+            raise ValueError("Fail to query '%s'. Reason: %s" % (
+                url, response.status_code))
+        return self.generate_hash(response, url)
+
+    def filter_release_artifact(self):
+        pass
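If this class is meant to detect unchanged artifacts through HTTP conditional requests, the server can answer 304 (Not Modified) instead of re-sending the tarball. A minimal sketch with requests, assuming an ETag kept from a previous visit (the helper and its bookkeeping are hypothetical, not part of this diff):

    import requests

    def fetch_if_modified(session, url, last_etag=None):
        # Conditional GET: the server answers 304 when nothing changed.
        headers = {'If-None-Match': last_etag} if last_etag else {}
        response = session.get(url, headers=headers, stream=True)
        if response.status_code == 304:
            return None  # artifact unchanged since the last visit
        if response.status_code != 200:
            raise ValueError("Fail to query '%s'. Reason: %s" % (
                url, response.status_code))
        return response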
+
+
+class compare_field(abc.ABC):
+    """Download strategy that detects already archived versions by
+    comparing a per-artifact field against previously stored values.
+
+    """
+
+    compare_field = AbstractAttribute("Field used to identify if the package "
+                                      "version is previously archived")
+    # e.g. for the PyPI loader, compare_field = 'sha'
+
+    def __init__(self):
+        self.session = requests.session()
+        self.params = {
+            'headers': {
+                'User-Agent': 'Software Heritage Tar Loader (%s)' % (
+                    __version__
+                )
+            }
+        }
+
+    def download_new_releases(self):
+        pass
+
+    def make_request(self, url):
+        """Request the remote tarball url.
+
+        Args:
+            url (str): Url (file or http*)
+
+        Raises:
+            ValueError in case of failing to query
+
+        Returns:
+            Tuple of local (filepath, hashes of filepath)
+
+        """
+        response = self.session.get(url, **self.params, stream=True)
+        if response.status_code not in (200, 304):
+            raise ValueError("Fail to query '%s'. Reason: %s" % (
+                url, response.status_code))
+        return self.generate_hash(response, url)
+
+    def filter_release_artifacts(self):
+        pass
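The filtering step stubbed out above reduces to a set lookup on the compare field. A hypothetical helper, assuming artifacts are dicts carrying that field (e.g. 'sha' for PyPI):

    def filter_new_artifacts(artifacts, known_artifacts, compare_field='sha'):
        # Keep only the artifacts whose compare_field value was never
        # archived before.
        known = {a[compare_field] for a in known_artifacts}
        return [a for a in artifacts if a[compare_field] not in known]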
+
+
+'''
+This is implemented in the loader but called from here:
+
+def generate_hash(self, response, url):
+    """Store file in temp directory and generate its hash
+
+    """
+    length = int(response.headers['content-length'])
+
+    filepath = os.path.join(self.temp_directory, os.path.basename(url))
+
+    h = MultiHash(length=length)
+    with open(filepath, 'wb') as f:
+        for chunk in response.iter_content(chunk_size=HASH_BLOCK_SIZE):
+            h.update(chunk)
+            f.write(chunk)
+
+    actual_length = os.path.getsize(filepath)
+    if length != actual_length:
+        raise ValueError('Error when checking size: %s != %s' % (
+            length, actual_length))
+
+    hashes = {
+        'length': length,
+        **h.hexdigest()
+    }
+    return filepath, hashes
+'''
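For reference, MultiHash as used above computes all of Software Heritage's default checksums in a single pass; a minimal sketch of the resulting hashes dict (the sample data is illustrative):

    from swh.model.hashutil import MultiHash

    data = b'sample artifact content'
    h = MultiHash(length=len(data))
    h.update(data)
    hashes = {'length': len(data), **h.hexdigest()}
    # hashes maps 'length' plus the algorithm names ('sha1', 'sha256',
    # 'sha1_git', 'blake2s256') to their hex digests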
diff --git a/swh/loader/base/loader.py b/swh/loader/base/loader.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/base/loader.py
@@ -0,0 +1,186 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import os
+import shutil
+
+from abc import abstractmethod
+
+from .abstractattribute import AbstractAttribute
+from swh.loader.core.utils import clean_dangling_folders
+from swh.loader.core.loader import BufferedLoader
+from swh.model.hashutil import MultiHash, HASH_BLOCK_SIZE
+from swh.storage.algos.snapshot import snapshot_get_all_branches
+
+DEBUG_MODE = '** DEBUG MODE **'
+
+
+class BaseLoader(BufferedLoader):
+    """Base class factoring out the logic common to package-manager
+    loaders: configuration, downloading, hashing and storage.
+
+    """
+
+    loader_name = AbstractAttribute("Name of the package manager")  # e.g. pypi
+    class_name = AbstractAttribute("Name of the loader class")
+
+    def __init__(self):
+        super().__init__(logging_class='swh.loader.%s.%s' % (self.loader_name,
+                                                             self.class_name))
+        self.TEMPORARY_DIR_PREFIX_PATTERN = 'swh.loader.%s.' % self.loader_name
+        self.CONFIG_BASE_FILENAME = 'loader/%s' % self.loader_name
+
+        self.ADDITIONAL_CONFIG = {
+            'temp_directory': ('str',
+                               '/tmp/swh.loader.%s/' % self.loader_name),
+            'cache': ('bool', False),
+            'cache_dir': ('str', ''),
+            'debug': ('bool', False),  # NOT FOR PRODUCTION
+        }
+
+    @abstractmethod
+    def convert_to_standard_format(self):
+        """Convert the package-manager-specific metadata into the common
+        format expected by the loader.
+
+        """
+        pass
+
+    def cleanup_artifact(self):
+        """Clean up the downloaded artifact once it has been processed.
+
+        """
+        pass
+
+    def extract_metadata(self):
+        """Extract metadata from the downloaded artifact.
+
+        """
+        pass
+
+    def prepare_origin_visit(self):
+        """Prepare the origin and visit references (first step of a
+        loader run).
+
+        """
+        pass
+
+    def prepare(self):
+        """Prepare any remaining state needed before fetching data
+        (second step of a loader run).
+
+        """
+        pass
+
+    def known_artifact(self):
+        """Check whether an artifact has already been archived.
+
+        """
+        pass
+
+    def fetch_data(self):
+        """Fetch the artifacts to load from the origin.
+
+        """
+        pass
+
+    def last_snapshot(self):
+        """Retrieve the last snapshot of the package if any.
+
+        """
+        visit = self.storage.origin_visit_get_latest(
+            self.origin['url'], require_snapshot=True)
+        if visit:
+            return snapshot_get_all_branches(self.storage, visit['snapshot'])
+
+    def store_data(self):
+        """Store fetched data in the database.
+
+        """
+        self.maybe_load_contents(self.package_contents)
+        self.maybe_load_directories(self.package_directories)
+        self.maybe_load_revisions(self.package_revisions)
+
+        if self.done:
+            self._generate_and_load_snapshot()
+            self.flush()
+
+    def generate_hash(self, response, url):
+        """Store file in temp directory and generate its hash
+
+        """
+        length = int(response.headers['content-length'])
+
+        filepath = os.path.join(self.temp_directory, os.path.basename(url))
+
+        h = MultiHash(length=length)
+        with open(filepath, 'wb') as f:
+            for chunk in response.iter_content(chunk_size=HASH_BLOCK_SIZE):
+                h.update(chunk)
+                f.write(chunk)
+
+        actual_length = os.path.getsize(filepath)
+        if length != actual_length:
+            raise ValueError('Error when checking size: %s != %s' % (
+                length, actual_length))
+
+        hashes = {
+            'length': length,
+            **h.hexdigest()
+        }
+        return filepath, hashes
+
+    def pre_cleanup(self):
+        """To prevent disk explosion if some other workers exploded
+        in mid-air (OOM killed), we try and clean up dangling files.
+
+        """
+        if self.debug:
+            self.log.warning('%s Will not pre-clean up temp dir %s' % (
+                DEBUG_MODE, self.temp_directory
+            ))
+            return
+        clean_dangling_folders(self.config['temp_directory'],
+                               pattern_check=self.TEMPORARY_DIR_PREFIX_PATTERN,
+                               log=self.log)
+
+    def flush(self):
+        """Flush any potential dangling data not sent to swh-storage.
+
+        Bypass the maybe_load_* methods which await the threshold-reached
+        signal. We actually want to store those as we are done
+        loading.
+
+        """
+        contents = self.contents.pop()
+        directories = self.directories.pop()
+        revisions = self.revisions.pop()
+        releases = self.releases.pop()
+
+        # and send those to storage if asked
+        if self.config['send_contents']:
+            self.send_batch_contents(contents)
+        if self.config['send_directories']:
+            self.send_batch_directories(directories)
+        if self.config['send_revisions']:
+            self.send_batch_revisions(revisions)
+        if self.config['send_releases']:
+            self.send_batch_releases(releases)
+        if self.config['send_snapshot'] and self.snapshot:
+            self.send_snapshot(self.snapshot)
+
+    def cleanup(self):
+        """Clean up temporary disk use after downloading and extracting
+        source package artifacts.
+
+        """
+        if self.debug:
+            self.log.warning('%s Will not clean up temp dir %s' % (
+                DEBUG_MODE, self.temp_directory
+            ))
+            return
+        if os.path.exists(self.temp_directory):
+            self.log.debug('Clean up %s' % self.temp_directory)
+            shutil.rmtree(self.temp_directory)
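A concrete package loader is expected to subclass BaseLoader, override both abstract attributes and implement the abstract method; a hypothetical sketch (this loader and its names are illustrative, not an actual swh loader):

    from swh.loader.base.loader import BaseLoader

    class ExampleLoader(BaseLoader):
        loader_name = 'example'      # satisfies the AbstractAttribute
        class_name = 'ExampleLoader'

        def convert_to_standard_format(self):
            # Map the package manager's metadata into the loader's
            # common format.
            return {}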
diff --git a/swh/loader/base/tests/__init__.py b/swh/loader/base/tests/__init__.py
new file mode 100644
diff --git a/swh/loader/base/tests/test_download.py b/swh/loader/base/tests/test_download.py
new file mode 100644
diff --git a/swh/loader/base/tests/test_loader.py b/swh/loader/base/tests/test_loader.py
new file mode 100644
