diff --git a/swh/loader/base/__init__.py b/swh/loader/base/__init__.py
new file mode 100644
diff --git a/swh/loader/base/abstractattribute.py b/swh/loader/base/abstractattribute.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/base/abstractattribute.py
@@ -0,0 +1,26 @@
+# Copyright (C) 2017 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+class AbstractAttribute:
+    """An AbstractAttribute in a base class must be overridden by the
+    subclass.
+
+    It's like the :func:`abc.abstractmethod` decorator, but for things that
+    are explicitly attributes/properties, not methods, without the need for
+    empty method def boilerplate. Like abc.abstractmethod, the class
+    containing AbstractAttributes must inherit from :class:`abc.ABC` or use
+    the :class:`abc.ABCMeta` metaclass.
+
+    Usage example::
+
+        import abc
+        class ClassContainingAnAbstractAttribute(abc.ABC):
+            foo = AbstractAttribute('descriptive docstring for foo')
+
+    """
+    __isabstractmethod__ = True
+
+    def __init__(self, docstring=None):
+        if docstring is not None:
+            self.__doc__ = 'AbstractAttribute: ' + docstring
diff --git a/swh/loader/base/download.py b/swh/loader/base/download.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/base/download.py
@@ -0,0 +1,130 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import requests
+
+from .abstractattribute import AbstractAttribute
+
+try:
+    from ._version import __version__
+except ImportError:
+    __version__ = 'devel'
+
+
+# This module provides classes to check whether a package version is
+# already archived (so it can be filtered out) and to download new
+# package versions.
+
+
+class if_modified_then:
+    """Detect already-archived packages through the If-Modified-Since
+    HTTP header: a "304 Not Modified" response means the package has not
+    changed since the last visit and need not be downloaded again. (The
+    If-Modified-Since request header itself is expected to be set by the
+    caller, e.g. through ``self.params``.)
+
+    """
+    def __init__(self):
+        self.session = requests.Session()
+        self.params = {
+            'headers': {
+                'User-Agent': 'Software Heritage Tar Loader (%s)' % (
+                    __version__
+                )
+            }
+        }
+
+    def download(self, url):
+        """Download the remote tarball.
+
+        Args:
+            url (str): Url (file or http*)
+
+        Raises:
+            ValueError: in case of failure to query
+
+        Returns:
+            Tuple of local (filepath, hashes of filepath), or None if the
+            server answered "304 Not Modified"
+
+        """
+        response = self.session.get(url, **self.params, stream=True)
+        if response.status_code not in (200, 304):
+            raise ValueError("Failed to query '%s'. Reason: %s" % (
+                url, response.status_code))
+        if response.status_code != 304:
+            # generate_hash() is implemented in loader.py (BaseLoader)
+            return self.generate_hash(response, url)
+        return None
+
+    def filter_release_artifacts(self):
+        pass
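+
+# Minimal usage sketch (illustrative only: ConcreteLoader and fetch() are
+# hypothetical names, and generate_hash() is assumed to come from
+# BaseLoader in loader.py when the two classes are mixed together):
+#
+#   class ConcreteLoader(if_modified_then, BaseLoader):
+#       def fetch(self, url):
+#           result = self.download(url)
+#           if result is None:
+#               return  # 304: already archived, nothing to do
+#           filepath, hashes = result
+#           ...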
+
+
+class compare_field:
+    """Detect already-archived packages using a field present in the
+    package metadata.
+
+    """
+
+    compare_field = AbstractAttribute("Field used to identify whether a "
+                                      "package version was previously "
+                                      "archived")
+    # e.g. for the pypi loader: compare_field = 'sha'
+
+    def __init__(self):
+        self.session = requests.Session()
+        self.params = {
+            'headers': {
+                'User-Agent': 'Software Heritage Tar Loader (%s)' % (
+                    __version__
+                )
+            }
+        }
+
+    def download_new_releases(self):
+        pass
+
+    def download(self, url):
+        """Download the remote tarball.
+
+        Args:
+            url (str): Url (file or http*)
+
+        Raises:
+            ValueError: in case of failure to query
+
+        Returns:
+            Tuple of local (filepath, hashes of filepath)
+
+        """
+        response = self.session.get(url, **self.params, stream=True)
+        if response.status_code != 200:
+            raise ValueError("Failed to query '%s'. Reason: %s" % (
+                url, response.status_code))
+        return self.generate_hash(response, url)  # method present in loader.py
+
+    def filter_release_artifacts(self):
+        pass
+
+
+# NB: generate_hash() is implemented in loader.py (BaseLoader) but called
+# from the download() methods above.
diff --git a/swh/loader/base/loader.py b/swh/loader/base/loader.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/base/loader.py
@@ -0,0 +1,207 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import os
+import shutil
+
+from abc import abstractmethod
+
+from .abstractattribute import AbstractAttribute
+from swh.loader.core.utils import clean_dangling_folders
+from swh.loader.core.loader import BufferedLoader
+from swh.model.hashutil import MultiHash, HASH_BLOCK_SIZE
+from swh.storage.algos.snapshot import snapshot_get_all_branches
+
+DEBUG_MODE = '** DEBUG MODE **'
+
+
+class BaseLoader(BufferedLoader):
+    """Base class for the package loaders.
+
+    Required overrides:
+        loader_name
+        class_name
+        def convert_to_standard_format
+
+    Optional overrides:
+        def cleanup_artifact
+        def extract_metadata
+
+    """
+
+    loader_name = AbstractAttribute("Name of the package manager")  # e.g. pypi
+    class_name = AbstractAttribute("Name of the loader class")  # e.g. PyPILoader
+
+    def __init__(self):
+        # These must be set before calling the parent constructor, which
+        # reads them while parsing the configuration.
+        self.TEMPORARY_DIR_PREFIX_PATTERN = 'swh.loader.%s.' % self.loader_name
+        self.CONFIG_BASE_FILENAME = 'loader/%s' % self.loader_name
+
+        self.ADDITIONAL_CONFIG = {
+            'temp_directory': ('str',
+                               '/tmp/swh.loader.%s/' % self.loader_name),
+            'cache': ('bool', False),
+            'cache_dir': ('str', ''),
+            'debug': ('bool', False),  # NOT FOR PRODUCTION
+        }
+
+        super().__init__(logging_class='swh.loader.%s.%s' % (self.loader_name,
+                                                             self.class_name))
+
+    @abstractmethod
+    def convert_to_standard_format(self):
+        """Convert the metadata fetched from the package manager into the
+        standard format expected by the loader.
+
+        """
+        pass
+
+    def cleanup_artifact(self):
+        """Clean up the downloaded artifact (optional override).
+
+        """
+        pass
+
+    def extract_metadata(self):
+        """Fetch the metadata from the downloaded file.
+
+        """
+        pass
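+
+    # Sketch of a concrete subclass (illustrative; the names and the
+    # 'sha' field below are hypothetical, not part of this patch):
+    #
+    #   class PyPILoader(BaseLoader):
+    #       loader_name = 'pypi'
+    #       class_name = 'PyPILoader'
+    #
+    #       def convert_to_standard_format(self):
+    #           ...  # normalize the metadata fetched from the pypi API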
+
+    # You probably don't need to override anything below this line.
+
+    def prepare_origin_visit(self):
+        """Prepare the origin and the origin visit.
+
+        """
+        pass
+
+    def prepare(self):
+        """Prepare the loader before fetching data.
+
+        """
+        pass
+
+    def known_artifact(self):
+        """Check whether an artifact was already archived.
+
+        """
+        pass
+
+    def fetch_data(self):
+        """Fetch the data to load.
+
+        """
+        pass
+
+    def last_snapshot(self):
+        """Retrieve the last snapshot of the package if any.
+
+        """
+        # Done
+        visit = self.storage.origin_visit_get_latest(
+            self.origin['url'], require_snapshot=True)
+        if visit:
+            return snapshot_get_all_branches(self.storage, visit['snapshot'])
+
+    def store_data(self):
+        """Store fetched data in the database.
+
+        """
+        # Done
+        self.maybe_load_contents(self.package_contents)
+        self.maybe_load_directories(self.package_directories)
+        self.maybe_load_revisions(self.package_revisions)
+
+        if self.done:
+            self._generate_and_load_snapshot()
+            self.flush()
+
+    def generate_hash(self, response, url):
+        """Store the downloaded file in a temp directory and compute its
+        hashes.
+
+        Args:
+            response (Response): Server response for the url
+            url (str): Url of the tarball
+
+        Returns:
+            Tuple of local (filepath, hashes of filepath)
+
+        """
+        # Done
+        length = int(response.headers['content-length'])
+
+        filepath = os.path.join(self.temp_directory, os.path.basename(url))
+
+        h = MultiHash(length=length)
+        with open(filepath, 'wb') as f:
+            for chunk in response.iter_content(chunk_size=HASH_BLOCK_SIZE):
+                h.update(chunk)
+                f.write(chunk)
+
+        actual_length = os.path.getsize(filepath)
+        if length != actual_length:
+            raise ValueError('Error when checking size: %s != %s' % (
+                length, actual_length))
+
+        hashes = {
+            'length': length,
+            **h.hexdigest()
+        }
+        return filepath, hashes
+
+    def pre_cleanup(self):
+        """To prevent disk explosion if some other workers exploded
+        in mid-air (OOM killed), we try and clean up dangling files.
+
+        """
+        # Done
+        if self.debug:
+            self.log.warning('%s Will not pre-clean up temp dir %s' % (
+                DEBUG_MODE, self.temp_directory
+            ))
+            return
+        clean_dangling_folders(self.config['temp_directory'],
+                               pattern_check=self.TEMPORARY_DIR_PREFIX_PATTERN,
+                               log=self.log)
+
+    def flush(self):
+        """Flush any potential dangling data not sent to swh-storage.
+
+        Bypass the maybe_load_* methods, which wait for a
+        threshold-reached signal; at this point we are done loading and
+        want everything stored.
+
+        """
+        # Done
+        contents = self.contents.pop()
+        directories = self.directories.pop()
+        revisions = self.revisions.pop()
+        releases = self.releases.pop()
+
+        # and send those to storage if asked
+        if self.config['send_contents']:
+            self.send_batch_contents(contents)
+        if self.config['send_directories']:
+            self.send_batch_directories(directories)
+        if self.config['send_revisions']:
+            self.send_batch_revisions(revisions)
+        if self.config['send_releases']:
+            self.send_batch_releases(releases)
+        if self.config['send_snapshot'] and self.snapshot:
+            self.send_snapshot(self.snapshot)
+
+    def cleanup(self):
+        """Clean up temporary disk use after downloading and extracting
+        package tarballs.
+
+        """
+        # Done
+        if self.debug:
+            self.log.warning('%s Will not clean up temp dir %s' % (
+                DEBUG_MODE, self.temp_directory
+            ))
+            return
+        if os.path.exists(self.temp_directory):
+            self.log.debug('Clean up %s' % self.temp_directory)
+            shutil.rmtree(self.temp_directory)
diff --git a/swh/loader/base/tests/__init__.py b/swh/loader/base/tests/__init__.py
new file mode 100644
diff --git a/swh/loader/base/tests/test_download.py b/swh/loader/base/tests/test_download.py
new file mode 100644
diff --git a/swh/loader/base/tests/test_loader.py b/swh/loader/base/tests/test_loader.py
new file mode 100644
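
Aside: a self-contained sketch of how AbstractAttribute cooperates with
abc (the PackageSource/PyPISource names are illustrative, not part of
this patch):

    import abc

    from swh.loader.base.abstractattribute import AbstractAttribute


    class PackageSource(abc.ABC):
        # ABCMeta records any class attribute whose __isabstractmethod__
        # is true, so this behaves like an abstract method declaration.
        compare_field = AbstractAttribute('metadata field used to detect '
                                          'already-archived versions')


    class PyPISource(PackageSource):
        compare_field = 'sha'


    PyPISource()      # fine: the attribute is overridden
    # PackageSource() would raise TypeError: can't instantiate an
    # abstract class with the abstract attribute compare_field unset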