class AbstractAttribute:
    """Marker for a class attribute that subclasses have to override.

    This plays the role of :func:`abc.abstractmethod` for plain
    attributes/properties, sparing the empty-method boilerplate. As with
    abc.abstractmethod, the class declaring AbstractAttributes must inherit
    from :class:`abc.ABC` or use the :class:`abc.ABCMeta` metaclass.

    Usage example::

        import abc
        class ClassContainingAnAbstractAttribute(abc.ABC):
            foo = AbstractAttribute('descriptive docstring for foo')

    """

    # Flag inspected by abc.ABCMeta when collecting abstract members.
    __isabstractmethod__ = True

    def __init__(self, docstring=None):
        # Without a description, the instance keeps the class docstring.
        if docstring is None:
            return
        self.__doc__ = ''.join(('AbstractAttribute: ', docstring))
class construct_revision:
    """Construct revision from the metadata of the package version.

    Builds the revision dict for a package version using the metadata
    provided. Every ``find_*`` method, ``directory`` and ``modify_revision``
    is a hookpoint that can be overridden as per the need of the package
    manager.

    """

    UTC_OFFSET = 0
    # Author/committer used when the package metadata does not name one.
    SWH_PERSON = {
        'name': 'Software Heritage',
        'fullname': 'Software Heritage',
        'email': 'robot@softwareheritage.org'
    }
    REVISION_MESSAGE = 'swh-loader-base: synthetic revision message'

    def modify_revision(self, revision):
        """Make modification on the revision created.

        If the revision of a package manager needs to be in a specific
        format, this method can be overridden to perform that operation
        instead of overriding `compute_revision()`.

        Args:
            revision (dict): Created revision

        Returns:
            dict: Modified revision (the default returns it untouched)

        """
        return revision

    def _time_from_last_modified(self, last_modified):
        """Convert a last-modification time into seconds/microseconds.

        Args:
            last_modified (str): Last modification time, in any form
                accepted by ``arrow.get``

        Returns:
            dict representing a timestamp with keys {seconds, microseconds}

        """
        last_modified = arrow.get(last_modified)
        mtime = last_modified.float_timestamp
        # The microseconds are read from the parsed date itself: splitting
        # str(mtime) on '.' turned a fraction such as ".5" into 5
        # microseconds instead of 500000.
        return {
            'seconds': int(mtime // 1),
            'microseconds': last_modified.microsecond,
        }

    def compute_revision(self, directory, package_source_data):
        """Compute a revision.

        Args:
            directory: object describing the loaded directory; it is only
                handed to the `directory()` hook, which by default reads
                its ``hash`` attribute
            package_source_data (dict): Information about the package
                release version

        Returns:
            dict: Revision, after it went through `modify_revision()`

        """
        # Computed once and used for both fields, so that author/committer
        # and date/committer_date can never disagree (the date fallback
        # reads the clock).
        date = self.find_date(package_source_data)
        author = self.find_author(package_source_data)

        revision = {
            'metadata': self.find_metadata(package_source_data),
            'date': date,
            'committer_date': date,
            'author': author,
            'committer': author,
            'type': self.find_type(package_source_data),
            'message': self.find_message(package_source_data),
            'directory': self.directory(directory),
            'synthetic': self.find_synthetic(),
        }

        return self.modify_revision(revision)

    def find_synthetic(self):
        """Return the value of the revision `synthetic` flag."""
        return True

    def find_type(self, package_source_data):
        """Return the revision type, read from the reserved `nature` key."""
        return package_source_data['nature']

    def find_message(self, package_source_data):
        """Return the revision message (a fixed text by default)."""
        return self.REVISION_MESSAGE

    def directory(self, directory):
        """Return the identifier stored under the revision `directory`."""
        return directory.hash

    def find_author(self, package_source_data):
        """Return the author from the metadata, else `SWH_PERSON`."""
        if 'author' in package_source_data:
            return package_source_data['author']
        return self.SWH_PERSON

    def find_metadata(self, package_source_data):
        """Return the revision metadata, wrapping the package information."""
        return {
            'package': package_source_data
        }

    def find_date(self, package_source_data):
        """Return the normalized date of the package version.

        Args:
            package_source_data (dict): Information about the package
                release version; its optional `date` entry is parsed with
                dateutil

        Returns:
            The value of `normalize_timestamp` for the `date` entry when
            there is one, for the current time otherwise.

        """
        raw_date = package_source_data.get('date')
        if raw_date is not None:
            parsed = date_parser.parse(raw_date)
            return normalize_timestamp(int(parsed.timestamp()))
        # No date in the metadata: fall back on "now". The parsing used to
        # run before the presence check, so this branch could never be
        # reached and a missing date raised KeyError.
        # NOTE(review): the fallback goes through normalize_timestamp so
        # both branches return the same shape -- confirm this is the wanted
        # format.
        return normalize_timestamp(int(time.time()))
class If_Modified_Since:
    """Uses the `If-Modified-Since` header to check if the package is
    previously archived.

    This class is to be used to identify and avoid the reprocessing of
    previously archived package versions when no reliable field is provided
    in the metadata that can do the job.

    The following operations are performed:
        - Retrieve known versions and store them in a dict with the tarball
          `url` as key (can be changed by overriding the `get_key` method)
        - Check if the tarballs are present in known versions.
            * If a match is found, send a request with the
              `If-Modified-Since` header to confirm the match
            * If no match is found, send a simple request
        - Store the response and the time for further processing
        - Instantiate a generator to process a specific package released
          version

    The class reads `self.storage` and `self.log` and calls
    `self._prepare_package_version`, none of which it defines itself.

    """
    def __init__(self):
        self.session = requests.session()
        # Keyword arguments shared by every request of the session.
        self.params = {
            'headers': {
                'User-Agent': 'Software Heritage Tar Loader (%s)' % (
                    __version__
                )
            }
        }

    def get_artifact(self, revision):
        """Fetch the artifact from a revision.

        Args:
            revision (dict): Previous revision

        Returns:
            dict: metadata present in the revision

        """
        return revision['metadata']['package']

    def get_key(self):
        """Returns the key to be used to identify known revisions.

        """
        return 'url'

    def known_versions(self, last_snapshot):
        """
        Retrieve the known release versions for the package
        (i.e. those already ingested into the archive).

        Args
            last_snapshot (dict): Last snapshot for the visit

        Returns:
            dict: Dict whose keys are the artifact values found under
            `get_key()` (the url by default) and whose values are
            `[revision id, time of last visit]` lists.

        """
        if not last_snapshot or 'branches' not in last_snapshot:
            return {}

        # retrieve only revisions (e.g the alias we do not want here)
        revs = [rev['target']
                for rev in last_snapshot['branches'].values()
                if rev and rev['target_type'] == 'revision']
        known_revisions = self.storage.revision_get(revs)
        ret = {}
        key = self.get_key()
        for revision in known_revisions:
            if not revision:  # revision_get can return None
                continue

            artifact = self.get_artifact(revision)
            ret[artifact[key]] = [revision['id'],
                                  artifact['time_last_visit']]
        return ret

    def filter_package_versions(self, tarballs, known_versions):
        """Return the available tarballs that are not previously archived.

        Args:
            tarballs (list): a list of dicts containing information about the
                respective tarball that is provided by lister.
            known_versions (dict): may be provided by the loader, it enables
                to filter out versions already ingested in the archive.
                `None` is handled like an empty dict.

        Returns:
            A list of dicts containing information about the respective
            tarballs that are not previously archived. Each dict gains a
            `response` entry (the open streamed response) and a
            `time_last_visit` entry.

        """
        known_versions = known_versions or {}
        versions = []
        key = self.get_key()
        for release in tarballs:
            tarball_url = release['url']

            # Look the release up with the configured key: the url was
            # used here before, which broke any overridden `get_key`.
            known = known_versions.get(release[key])
            time_last_visit = known[1] if known else None
            tarball_request = self._request(
                tarball_url, time_last_visit=time_last_visit,
                throw_error=False)

            status_code = tarball_request.status_code
            if status_code != 200:
                if status_code == 404:
                    self.log.debug('Tarball url %s returns a 404 error.',
                                   tarball_url)
                elif status_code != 304:  # 304: unchanged, nothing to log
                    self.log.debug('Tarball url %s returns a %s status, '
                                   'skipping it.', tarball_url, status_code)
                # A streamed response holds its connection until it is
                # read or closed.
                tarball_request.close()
                continue

            release['response'] = tarball_request
            release['time_last_visit'] = time.strftime(
                '%a, %d %b %Y %H:%M:%S GMT', time.gmtime())
            versions.append(release)

        return versions

    def _request(self, url, time_last_visit=None, throw_error=True):
        """Request the remote tarball url.

        Args:
            url (str): Url (file or http*)
            time_last_visit (str): when set, sent as the value of the
                `If-Modified-Since` header
            throw_error (bool): raise on any status other than 200

        Raises:
            ValueError in case of failing to query, when `throw_error` is
            set

        Returns:
            server response

        """
        # Per-request copy of the headers: writing If-Modified-Since into
        # the shared self.params made it leak into every later request.
        headers = dict(self.params.get('headers') or {})
        if time_last_visit:
            headers['If-Modified-Since'] = time_last_visit
        params = dict(self.params)
        params['headers'] = headers

        response = self.session.get(url, stream=True, **params)
        # throw_error used to be ignored, so the 304/404 handling of the
        # callers was never reached.
        if response.status_code != 200 and throw_error:
            raise ValueError("Fail to query '%s'. Reason: %s" % (
                url, response.status_code))
        return response

    def prepare_package_versions(self, tarballs, known_versions=None):
        """
        Instantiate a generator that will process a specific package released
        version at each iteration step. The tarballs already archived are
        filtered out, each remaining one is handed to
        `_prepare_package_version` along with its server response.

        Args:
            tarballs (list): a list of dicts containing information about the
                respective tarball that is provided by lister.
            known_versions (dict): may be provided by the loader, it enables
                to filter out versions already ingested in the archive.

        Yields:
            whatever `_prepare_package_version` returns for the package
            version

        """
        new_versions = self.filter_package_versions(tarballs, known_versions)
        for package_source_data in new_versions:
            # The response is taken out of the dict to make things simple
            # while creating revisions
            tarball_request = package_source_data.pop('response')
            yield self._prepare_package_version(package_source_data,
                                                tarball_request)
class compare_field:
    """Uses a field present in the metadata to check if the package is
    previously archived.

    This class is to be used to identify and avoid the reprocessing of
    previously archived package versions using a field provided by the
    API in the metadata of the package version.

    The following operations are performed:
        - Retrieve known versions and store them in a dict keyed by the
          value of the field named in `compare_field`
        - Check if the tarballs are present in known versions.
        - Instantiate a generator to process a specific package released
          version

    The class reads `self.storage` and `self.log` and calls
    `self._prepare_package_version`, none of which it defines itself.

    """
    compare_field = AbstractAttribute("Field used to identify if the package "
                                      "version is previously archived")
    # eg for pypi loader compare_field = 'sha'

    def __init__(self):
        self.session = requests.session()
        # Keyword arguments shared by every request of the session.
        self.params = {
            'headers': {
                'User-Agent': 'Software Heritage Tar Loader (%s)' % (
                    __version__
                )
            }
        }

    def _request(self, url, throw_error=True):
        """Request the remote tarball url.

        Args:
            url (str): Url (file or http*)
            throw_error (bool): raise on any status other than 200

        Raises:
            ValueError in case of failing to query, when `throw_error` is
            set

        Returns:
            server response

        """
        response = self.session.get(url, **self.params, stream=True)
        if response.status_code != 200 and throw_error:
            raise ValueError("Fail to query '%s'. Reason: %s" % (
                url, response.status_code))
        return response

    def known_versions(self, last_snapshot):
        """
        Retrieve the known release versions for the package
        (i.e. those already ingested into the archive).

        Args
            last_snapshot (dict): Last snapshot for the visit

        Returns:
            dict: Dict whose key is the value of field chosen for
            checking archived artifacts and values are revision ids.

        """
        if not last_snapshot or 'branches' not in last_snapshot:
            return {}

        # retrieve only revisions (e.g the alias we do not want here)
        revs = [rev['target']
                for rev in last_snapshot['branches'].values()
                if rev and rev['target_type'] == 'revision']
        known_revisions = self.storage.revision_get(revs)
        ret = {}
        for revision in known_revisions:
            if not revision:  # revision_get can return None
                continue
            metadata = revision.get('metadata') or {}
            # construct_revision.find_metadata stores the package
            # information under 'package'; 'original_artifact' is still
            # read for revisions using that layout.
            artifact = (metadata.get('package')
                        or metadata.get('original_artifact'))
            if not artifact or self.compare_field not in artifact:
                continue  # nothing to compare this revision with
            ret[artifact[self.compare_field]] = revision['id']
        return ret

    def filter_package_versions(self, tarballs, known_versions):
        """
        Return the available tarballs that are not previously archived.

        Args:
            tarballs (list): a list of dicts containing information about the
                respective tarball that is provided by lister.
            known_versions (dict): may be provided by the loader, it enables
                to filter out versions already ingested in the archive.
                `None` is handled like an empty dict.

        Returns:
            A list of dicts containing information about the respective
            tarballs that are not previously archived.

        """
        known_versions = known_versions or {}
        # A release without the comparison field cannot be matched against
        # the archive, it is kept as a new version.
        return [release for release in tarballs
                if release.get(self.compare_field) not in known_versions]

    def prepare_package_versions(self, tarballs, known_versions=None):
        """
        Instantiate a generator that will process a specific package released
        version at each iteration step. The tarballs already archived are
        filtered out, each remaining one is requested and handed to
        `_prepare_package_version` along with the server response.

        Args:
            tarballs (list): a list of dicts containing information about the
                respective tarball that is provided by lister.
            known_versions (dict): may be provided by the loader, it enables
                to filter out versions already ingested in the archive.

        Yields:
            whatever `_prepare_package_version` returns for the package
            version

        """
        new_versions = self.filter_package_versions(tarballs, known_versions)
        for package_source_data in new_versions:
            # filter out version with missing tarball,
            # package visit will be marked as partial at the end of
            # the loading process

            tarball_url = package_source_data['url']
            tarball_request = self._request(tarball_url,
                                            throw_error=False)
            status_code = tarball_request.status_code
            if status_code != 200:
                if status_code == 404:
                    self.log.debug('Tarball url %s returns a 404 error.',
                                   tarball_url)
                else:
                    self.log.debug('Tarball url %s returns a %s status, '
                                   'skipping it.', tarball_url, status_code)
                # A streamed response holds its connection until it is
                # read or closed.
                tarball_request.close()
                continue

            yield self._prepare_package_version(package_source_data,
                                                tarball_request)
class BaseLoader(BufferedLoader, construct_revision):
    """Base loader for package managers distributing tarballs.

    `known_versions` and `prepare_package_versions` are called but not
    defined here; they are expected to come from a version-filtering mixin
    of the subclass (such as `If_Modified_Since` or `compare_field`).

    Required Overrides:
        loader_name
        class_name
        def convert_to_standard_format

    Optional Overrides:
        def cleanup_artifact
        def extract_metadata

    """

    loader_name = AbstractAttribute("Name of the package manager")  # e.g pypi
    class_name = AbstractAttribute("Name of the loader class")  # eg PyPILoader

    def __init__(self):
        super().__init__(logging_class='swh.loader.%s.%s' % (self.loader_name,
                                                             self.class_name))
        # NOTE(review): the three attributes below are set after the parent
        # constructor ran; if the parent reads them while building
        # self.config they come too late -- confirm.
        self.TEMPORARY_DIR_PREFIX_PATTERN = 'swh.loader.%s.' % self.loader_name
        self.CONFIG_BASE_FILENAME = 'loader/%s' % self.loader_name

        self.ADDITIONAL_CONFIG = {
            'temp_directory': ('str',
                               '/tmp/swh.loader.%s/' % self.loader_name),
            'cache': ('bool', False),
            'cache_dir': ('str', ''),
            'debug': ('bool', False),  # NOT FOR PRODUCTION
        }

        self.local_cache = None
        self.dir_path = None

        temp_directory = self.config['temp_directory']
        os.makedirs(temp_directory, exist_ok=True)

        # Per-process working directory; the pid in its name tells the
        # workers apart.
        self.temp_directory = mkdtemp(
            suffix='-%s' % os.getpid(),
            prefix=self.TEMPORARY_DIR_PREFIX_PATTERN,
            dir=temp_directory)

        self.debug = self.config.get('debug', False)

    @abstractmethod
    def convert_to_standard_format(self, **kwargs):
        """Fetch the metadata and convert it into a standard format

        The standard format is a dict with keys
            `name` (str): Holding name of the package
            `origin_url` (str): Holding the origin_url of the package
            `tarballs` (list): A list of dicts where each dict contains
                information related to a single version of the
                package. The `url` key in the dict is necessary and will
                hold tarball url. Other keys are optional and as per
                availability of metadata.

        Note: Keys `nature` and `response` are reserved keywords and cannot be
              used in the dict that are present under key `tarballs`

        Args:
            **kwargs: Arbitrary keyword arguments passed by the lister.

        Returns:
            dict: Containing information as directed by the guidelines
                mentioned above

        Example:
            {
                name:'8sync',
                origin_url:'https://ftp.gnu.org/gnu/8sync/',
                tarballs:[{url: 'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz',
                           time_modified: 1562878592 },
                          {url: 'https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz',
                           time_modified: 1599887203 },
                          ...
                          ]
            }

        """
        pass

    def cleanup_artifact(self, uncompressed_path):
        """Clean up unnecessary files from the downloaded tarball
        also some special operation if needed.

        Implementation of this method depends on the file structure of the
        tarball. It is used to clean up files from the uncompressed tarball
        that are not to be archived(eg binaries).

        Args:
            uncompressed_path (str): Path of uncompressed tarball

        Returns:
            uncompressed_path (str): Path of uncompressed tarball after
                removing unnecessary files

        """
        return uncompressed_path

    def extract_metadata(self, package_path, package_source_data):
        """Fetch the metadata from the downloaded file.

        Args:
            package_path (str): Path of the uncompressed package
            package_source_data (dict): Information about the package
                release version

        Returns:
            dict: the package information, unchanged by default

        """
        return package_source_data

    # You probably don't need to override anything below this line.

    def prepare_origin_visit(self, **kwargs):
        """Prepare package visit.

        Args:
            **kwargs: Arbitrary keyword arguments passed by the lister.

        """
        # reset statuses
        self._load_status = 'uneventful'
        self._visit_status = 'full'
        self.done = False
        # fetch the package metadata in the standard format
        self.package_details = self.convert_to_standard_format(**kwargs)
        self.origin = {
            'url': self.package_details['origin_url'],
            'type': self.loader_name,
        }
        self.visit_date = None  # loader core will populate it

    def prepare(self, **kwargs):
        """Prepare effective loading of source tarballs for a package manager
           package.

        Args:
            **kwargs: Arbitrary keyword arguments passed by the lister.

        """
        self.contents = []
        self.directories = []
        self.revisions = []
        self.package_temp_dir = os.path.join(self.temp_directory,
                                             self.package_details['name'])

        last_snapshot = self.last_snapshot()
        # The result is stored under the name of the method computing it,
        # which hides that method on the instance. Going through the class
        # keeps a second call to prepare() working.
        self.known_versions = type(self).known_versions(self, last_snapshot)

        self.new_artifacts = \
            self.prepare_package_versions(self.package_details['tarballs'],
                                          self.known_versions)

    def _prepare_package_version(self, package_source_data, tarball_request):
        """Process the package release version.

        The following operations are performed:

            1. Download the tarball
            2. Uncompress the tarball
            3. Delete unnecessary files (optional)
            4. Parse the file associated to the package version to extract
               metadata (optional)

        Args:
            package_source_data (dict): information about the tarball, as
                provided by the lister; it gains a `nature` entry
            tarball_request: server response for the tarball url

        Return:
            Tuple[dict, str]: tuples containing the following
            members:

                * a dict holding package tarball information and metadata
                * a string holding the path of the uncompressed package to
                  load into the archive

            `(None, None)` when the tarball could not be uncompressed.

        """
        url = package_source_data['url']
        tarball_path, _hashes = self.download_generate_hash(tarball_request,
                                                            url)
        # Only the last component of the url names the extraction folder:
        # joining the whole url nested scheme and host under the temporary
        # directory, and a url starting with '/' dropped that directory
        # from the result altogether.
        uncompressed_path = os.path.join(self.package_temp_dir, 'uncompressed',
                                         os.path.basename(url))
        package_source_data['nature'] = self.uncompress_tarball(
            tarball_path, uncompressed_path)

        # remove tarball
        os.remove(tarball_path)

        if self.tarball_invalid:
            return None, None

        package_path = self.cleanup_artifact(uncompressed_path)
        package_source_data = self.extract_metadata(package_path,
                                                    package_source_data)
        return package_source_data, package_path

    def fetch_data(self):
        """Called once per release artifact version (can be many for one
        release).

        This will for each call:
        - retrieve a release artifact (associated to a release version)
        - Computes the swh objects

        Returns:
            True as long as data to fetch exist

        """
        if self.done:
            return False

        try:
            data = next(self.new_artifacts)
        except StopIteration:
            self.done = True
            return False
        self._load_status = 'eventful'

        package_source_data, dir_path = data
        # package release tarball was corrupted
        if self.tarball_invalid or dir_path is None:
            return not self.done

        dir_path = dir_path.encode('utf-8')
        directory = Directory.from_disk(path=dir_path, data=True)
        objects = directory.collect()

        # Real lists rather than dict views, flush() drains and resets them
        self.contents = list(objects.get('content', {}).values())
        self.directories = list(objects.get('directory', {}).values())

        revision = self.compute_revision(directory,
                                         package_source_data)

        revision['id'] = identifier_to_bytes(
            revision_identifier(revision))
        self.revisions.append(revision)

        self._remember_version(package_source_data, revision['id'])

        self.log.debug('Removing unpacked package files at %s', dir_path)
        shutil.rmtree(dir_path)

        return not self.done

    def _remember_version(self, package_source_data, revision_id):
        """Record a freshly built revision in `self.known_versions`."""
        # NOTE(review): the entry mirrors what the `known_versions` method
        # of each filtering class builds -- confirm against the mixin used.
        if hasattr(self, 'get_key'):
            # If_Modified_Since flavour: [revision id, time of last visit]
            package_key = package_source_data[self.get_key()]
            self.known_versions[package_key] = [
                revision_id, package_source_data.get('time_last_visit')]
        else:
            # compare_field flavour: revision id only
            package_key = package_source_data[self.compare_field]
            self.known_versions[package_key] = revision_id

    def last_snapshot(self):
        """Retrieve the last snapshot of the package if any.

        Returns:
            The branches of the snapshot of the latest visit having one,
            None when there is no such visit.

        """
        visit = self.storage.origin_visit_get_latest(
            self.origin['url'], require_snapshot=True)
        if visit:
            return snapshot_get_all_branches(self.storage, visit['snapshot'])
        return None

    def store_data(self):
        """Store fetched data in the database.

        """
        self.maybe_load_contents(self.contents)
        self.maybe_load_directories(self.directories)
        self.maybe_load_revisions(self.revisions)

        if self.done:
            self.generate_and_load_snapshot()
            self.flush()

    def generate_and_load_snapshot(self):
        """Build and load the snapshot of the visit (not implemented yet).

        """
        pass

    def download_generate_hash(self, response, url):
        """Store file in temp directory and computes hash of its filepath

        Args:
            response (Response): Server response of the url
            url (str): Url of the tarball

        Raises:
            ValueError when the size of the downloaded file differs from
            the `content-length` header

        Returns:
            Tuple of local (filepath, hashes of filepath)

        """
        length = int(response.headers['content-length'])

        filepath = os.path.join(self.package_temp_dir, os.path.basename(url))
        # The per-package directory is only named in prepare(), it has to
        # exist before the tarball can be written into it.
        os.makedirs(os.path.dirname(filepath), exist_ok=True)

        h = MultiHash(length=length)
        with open(filepath, 'wb') as f:
            for chunk in response.iter_content(chunk_size=HASH_BLOCK_SIZE):
                h.update(chunk)
                f.write(chunk)

        actual_length = os.path.getsize(filepath)
        if length != actual_length:
            raise ValueError('Error when checking size: %s != %s' % (
                length, actual_length))

        hashes = {
            'length': length,
            **h.hexdigest()
        }
        return filepath, hashes

    def uncompress_tarball(self, filepath, path):
        """Uncompress a tarball.

        Sets `self.tarball_invalid` according to the outcome.

        Args:
            filepath (str): Path of tarball to uncompress
            path (str): The destination folder where to uncompress the tarball
        Returns:
            The nature of the tarball, zip or tar; None when the tarball
            could not be uncompressed.

        """
        try:
            self.tarball_invalid = False
            return tarball.uncompress(filepath, path)
        except Exception:
            # Best effort: a broken tarball must not abort the whole visit,
            # but the failure is worth a trace.
            self.log.exception('Failed to uncompress tarball %s', filepath)
            self.tarball_invalid = True
            return None

    def pre_cleanup(self):
        """To prevent disk explosion if some other workers exploded
        in mid-air (OOM killed), we try and clean up dangling files.

        """
        if self.debug:
            self.log.warning('%s Will not pre-clean up temp dir %s',
                             DEBUG_MODE, self.temp_directory)
            return
        clean_dangling_folders(self.config['temp_directory'],
                               pattern_check=self.TEMPORARY_DIR_PREFIX_PATTERN,
                               log=self.log)

    def flush(self):
        """Flush any potential dangling data not sent to swh-storage.

        Bypass the maybe_load_* methods which awaits threshold reached
        signal. We actually want to store those as we are done
        loading.

        """
        # prepare() and fetch_data() keep plain lists in these attributes:
        # send them whole and start over with empty ones (`.pop()` only
        # took the last element).
        contents, self.contents = list(self.contents), []
        directories, self.directories = list(self.directories), []
        revisions, self.revisions = list(self.revisions), []
        # NOTE(review): `releases` is never assigned in this class, it is
        # presumably a parent buffer whose pop() returns its whole content
        # -- confirm.
        releases = self.releases.pop()

        # and send those to storage if asked
        if self.config['send_contents']:
            self.send_batch_contents(contents)
        # NOTE(review): was gated on 'send_contents' as well; assumes the
        # configuration has a 'send_directories' entry -- confirm.
        if self.config['send_directories']:
            self.send_batch_directories(directories)
        if self.config['send_revisions']:
            self.send_batch_revisions(revisions)
        if self.config['send_releases']:
            self.send_batch_releases(releases)
        # generate_and_load_snapshot() is still a stub, the snapshot may
        # not be there at all
        snapshot = getattr(self, 'snapshot', None)
        if self.config['send_snapshot'] and snapshot:
            self.send_snapshot(snapshot)

    def cleanup(self):
        """Clean up temporary disk use after downloading and extracting
        package tarballs.

        """
        if self.debug:
            self.log.warning('%s Will not clean up temp dir %s',
                             DEBUG_MODE, self.temp_directory)
            return
        if os.path.exists(self.temp_directory):
            self.log.debug('Clean up %s', self.temp_directory)
            shutil.rmtree(self.temp_directory)