diff --git a/swh/loader/base/__init__.py b/swh/loader/base/__init__.py
new file mode 100644
diff --git a/swh/loader/base/build_revision.py b/swh/loader/base/build_revision.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/base/build_revision.py
@@ -0,0 +1,104 @@
+# Copyright (C) 2019  The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from datetime import datetime
+
+from dateutil import parser as date_parser
+
+from swh.model.identifiers import normalize_timestamp
+
+
+class construct_revision:
+    """Construct a revision from the metadata of a package version.
+
+    Build the revision for a package version from the metadata provided.
+    Several hookpoints can be overridden to fit the needs of a given
+    package manager.
+
+    """
+    UTC_OFFSET = 0
+    SWH_PERSON = {
+        'name': b'Software Heritage',
+        'fullname': b'Software Heritage',
+        'email': b'robot@softwareheritage.org'
+    }
+    REVISION_MESSAGE = b'swh-loader-base: synthetic revision message'
+
+    def modify_revision(self, revision):
+        """Modify the revision that was just created.
+
+        If the revision of a package manager needs to be in a specific
+        format, this method can be overridden to perform that operation
+        instead of overriding `compute_revision()`.
+
+        Args:
+            revision (dict): Created revision
+
+        Returns:
+            dict: Modified revision
+
+        """
+        return revision
+
+    def compute_revision(self, directory, package_source_data):
+        """Compute a revision.
+
+        Args:
+            directory (Directory): Directory object computed from the
+                uncompressed tarball
+            package_source_data (dict): Information about the package
+                release version
+
+        Returns:
+            dict: Revision
+
+        """
+        revision = {
+            'metadata': self.find_metadata(package_source_data),
+            'date': {
+                'timestamp': self.find_date(package_source_data),
+                'offset': self.UTC_OFFSET,
+            },
+            'committer_date': {
+                'timestamp': self.find_date(package_source_data),
+                'offset': self.UTC_OFFSET,
+            },
+            'author': self.find_author(package_source_data),
+            'committer': self.find_author(package_source_data),
+            'type': self.find_type(package_source_data),
+            'message': self.find_message(package_source_data),
+            'directory': self.directory(directory),
+            'synthetic': self.find_synthetic(),
+            'parents': [],
+        }
+
+        return self.modify_revision(revision)
+
+    def find_synthetic(self):
+        return True
+
+    def find_type(self, package_source_data):
+        return package_source_data['nature']
+
+    def find_message(self, package_source_data):
+        return self.REVISION_MESSAGE
+
+    def directory(self, directory):
+        return directory.hash
+
+    def find_author(self, package_source_data):
+        if 'author' in package_source_data:
+            return package_source_data['author']
+        return self.SWH_PERSON
+
+    def find_metadata(self, package_source_data):
+        return {
+            'package': package_source_data
+        }
+
+    def find_date(self, package_source_data):
+        try:
+            date = date_parser.parse(package_source_data['date'])
+            return normalize_timestamp(int(date.timestamp()))
+        except (KeyError, TypeError, ValueError, OverflowError):
+            # no usable date in the metadata: fall back to the current time
+            now = datetime.now()
+            return normalize_timestamp(int(datetime.timestamp(now)))
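
The hookpoints above are meant to be overridden per package manager. As an
illustration only (the GNURevision name and the 'uploader'/'version' metadata
keys are hypothetical, not part of this patch), a subclass could look like:

    from swh.loader.base.build_revision import construct_revision


    class GNURevision(construct_revision):

        def find_author(self, package_source_data):
            # prefer the uploader recorded in the metadata, if any
            uploader = package_source_data.get('uploader')
            if uploader:
                return {
                    'name': uploader.encode('utf-8'),
                    'fullname': uploader.encode('utf-8'),
                    'email': b'',
                }
            return self.SWH_PERSON

        def modify_revision(self, revision):
            # tag the synthetic message with the archived version
            version = revision['metadata']['package'].get('version', '')
            revision['message'] += b' ' + version.encode('utf-8')
            return revision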
diff --git a/swh/loader/base/download.py b/swh/loader/base/download.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/base/download.py
@@ -0,0 +1,370 @@
+# Copyright (C) 2019  The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import time
+
+import requests
+
+try:
+    from _version import __version__
+except ImportError:
+    __version__ = 'devel'
+
+
+# This file contains methods to detect and skip package versions that are
+# already archived.
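
The first of the two strategies below leans on HTTP conditional requests. As
background, the handshake it builds on looks roughly like this (illustrative
sketch; the url and date are made up):

    import requests

    # Ask the server whether the tarball changed since our last visit; a
    # 304 Not Modified response means it did not, so the version can be
    # skipped without downloading it again.
    response = requests.get(
        'https://example.org/pkg/pkg-1.0.tar.gz',
        headers={'If-Modified-Since': 'Mon, 15 Jul 2019 10:00:00 GMT'})
    if response.status_code == 304:
        print('already archived, skipping')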
Reason: %s" % ( + tarball_url, tarball_request.status_code)) + continue + + new_release = self.update_release_info(release, tarball_request) + versions.append(new_release) + + return versions + + def update_release_info(self, release, tarball_request): + """ + + """ + release['response'] = tarball_request + time_now = time.time() + time_now = time.strftime('%a, %d %b %Y %H:%M:%S GMT', + time.gmtime(time_now)) + release['time_last_visit'] = time_now + return release + + def _request(self, url, time_last_visit=None, throw_error=True): + """Request the remote tarball url. + + Args: + url (str): Url (file or http*) + + Raises: + ValueError in case of failing to query + + Returns: + server response + + """ + # Done + if time_last_visit: + self.params['headers']['If-Modified-Since'] = time_last_visit + response = self.session.get(url, **self.params, stream=True) + return response + + def prepare_package_versions(self, tarballs, known_versions=None): + """ + Instantiate a generator that will process a specific package released + version at each iteration step. The following operations will be + performed: + + 1. Create a temporary directory to download and extract the + release tarball + 2. Download the tarball + 3. Uncompress the tarball + 4. Parse the file associated to the package version to extract + metadata (optional) + 5. Delete unnecessary files (optional) + + Args: + tarballs (list): a list of dicts containing information about the + respective tarball that is provided by lister. + known_versions (dict): may be provided by the loader, it enables + to filter out versions already ingested in the archive. + + Yields: + Tuple[dict, str]: tuples containing the following + members: + + * a dict holding package tarball information and metadata + * a string holding the path of the uncompressed package to + load into the archive + + """ + new_versions = self.filter_package_versions(tarballs, known_versions) + for package_source_data in new_versions: + tarball_request = package_source_data['response'] + + # To make things simple while creating revisions + del package_source_data['response'] + yield self._prepare_package_version(package_source_data, + tarball_request) + + +class compareField: + """Uses a field present in the metadata to check for if the package is + previously archived. + + This class is to be used to identify and avoid the reprocessing of + previously archived package version using a field provided by the + API in the metadata of the package version + + The following operations are performed: + - Retrive known versions and store then in a dict with key of same + field that is mentioned in compare field + - Check if the tarballs are present in knowns versions. + - Instantiate a generator to process a specific package released + version + + """ + compare_field = """Field used to identify if the package + version is previously archived""" + # eg for pypi loader compare_field = 'sha' + + def __init__(self): + self.session = requests.session() + self.params = { + 'headers': { + 'User-Agent': 'Software Heritage Loader (%s)' % ( + __version__ + ) + } + } + + def get_key(self): + """Returns the key to be used to identify known revisions + + """ + return self.compare_field + + def _request(self, url, throw_error=True): + """Request the remote tarball url. 
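
For reference, the state built by known_versions() and update_release_info()
above has the following shape (illustrative values only):

    # mapping returned by known_versions(): tarball url -> revision id
    known_versions = {
        'https://example.org/pkg/pkg-1.0.tar.gz': b'\x01' * 20,
    }
    # companion mapping feeding the If-Modified-Since header next visit
    time_last_visit = {
        'https://example.org/pkg/pkg-1.0.tar.gz':
            'Mon, 15 Jul 2019 10:00:00 GMT',
    }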
+
+
+class compareField:
+    """Use a field present in the metadata to check whether a package
+    version was previously archived.
+
+    This class identifies, and avoids reprocessing, previously archived
+    package versions using a field provided by the API in the metadata
+    of each package version.
+
+    The following operations are performed:
+
+    - Retrieve the known versions and store them in a dict keyed by the
+      field named in `compare_field`
+    - Check whether the tarballs are present in the known versions
+    - Instantiate a generator to process each package release version
+
+    """
+    compare_field = """Field used to identify whether the package
    version was previously archived"""
+    # e.g. for the PyPI loader: compare_field = 'sha'
+
+    def __init__(self):
+        self.session = requests.session()
+        self.params = {
+            'headers': {
+                'User-Agent': 'Software Heritage Loader (%s)' % (
+                    __version__
+                )
+            }
+        }
+
+    def get_key(self):
+        """Return the key used to identify known revisions.
+
+        """
+        return self.compare_field
+
+    def _request(self, url, throw_error=True):
+        """Request the remote tarball url.
+
+        Args:
+            url (str): Url (file or http*)
+
+        Raises:
+            ValueError in case of failing to query
+
+        Returns:
+            Server response
+
+        """
+        response = self.session.get(url, **self.params, stream=True)
+        if response.status_code != 200 and throw_error:
+            raise ValueError("Fail to query '%s'. Reason: %s" % (
+                url, response.status_code))
+        return response
+
+    def known_versions(self, last_snapshot):
+        """Retrieve the release versions of the package that are already
+        ingested into the archive.
+
+        Args:
+            last_snapshot (dict): Last snapshot for the visit
+
+        Returns:
+            dict: Dict whose keys are the values of the field chosen for
+                checking archived artifacts and whose values are revision
+                ids.
+
+        """
+        if not last_snapshot or 'branches' not in last_snapshot:
+            return {}
+
+        # retrieve only the revisions (we do not want the aliases here)
+        revs = [rev['target']
+                for rev in last_snapshot['branches'].values()
+                if rev and rev['target_type'] == 'revision']
+        known_revisions = self.storage.revision_get(revs)
+        ret = {}
+        for revision in known_revisions:
+            if not revision:  # revision_get can return None
+                continue
+            artifact = self.artifact_from_revision(revision)
+            ret[artifact[self.compare_field]] = revision['id']
+        return ret
+
+    def artifact_from_revision(self, revision):
+        """Find the artifact from the revision.
+
+        Can be overridden if the revision deviates from the standard
+        revision pattern.
+
+        """
+        if 'package' in revision['metadata']:
+            return revision['metadata']['package']
+
+    def filter_package_versions(self, tarballs, known_versions):
+        """Return the available tarballs that are not previously archived.
+
+        Args:
+            tarballs (list): a list of dicts containing information about
+                the respective tarballs, as provided by the lister.
+            known_versions (dict): may be provided by the loader, it
+                enables to filter out versions already ingested in the
+                archive.
+
+        Returns:
+            A list of dicts containing information about the tarballs
+            that are not previously archived.
+
+        """
+        versions = []
+
+        for release in tarballs:
+            if release[self.compare_field] in known_versions:
+                continue
+            versions.append(release)
+
+        return versions
+
+    def prepare_package_versions(self, tarballs, known_versions=None):
+        """Instantiate a generator that processes a specific package release
+        version at each iteration step. The following operations are
+        performed:
+
+        1. Create a temporary directory to download and extract the
+           release tarball
+        2. Download the tarball
+        3. Uncompress the tarball
+        4. Parse the file associated to the package version to extract
+           metadata (optional)
+        5. Delete unnecessary files (optional)
+
+        Args:
+            tarballs (list): a list of dicts containing information about
+                the respective tarballs, as provided by the lister.
+            known_versions (dict): may be provided by the loader, it
+                enables to filter out versions already ingested in the
+                archive.
+
+        Yields:
+            Tuple[dict, str]: tuples containing the following members:
+
+            * a dict holding package tarball information and metadata
+            * a string holding the path of the uncompressed package to
+              load into the archive
+
+        """
+        new_versions = self.filter_package_versions(tarballs, known_versions)
+        for package_source_data in new_versions:
+            # filter out versions with missing tarballs; the package visit
+            # will be marked as partial at the end of the loading process
+            tarball_url = package_source_data['url']
+            tarball_request = self._request(tarball_url,
+                                            throw_error=False)
+            if tarball_request.status_code == 404:
+                self.log.debug('Tarball url %s returns a 404 error.',
+                               tarball_url)
+                continue
+
+            yield self._prepare_package_version(package_source_data,
+                                                tarball_request)
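
A loader whose upstream API exposes a stable per-artifact field would mix in
compareField instead, mirroring the CRAN loader at the end of this patch. A
minimal hypothetical sketch (the class name and the 'sha' field are
assumptions, not part of this patch):

    from swh.loader.base.loader import BaseLoader
    from swh.loader.base.download import compareField


    class HypotheticalPyPILoader(BaseLoader, compareField):
        loader_name = 'pypi'
        class_name = 'HypotheticalPyPILoader'
        compare_field = 'sha'

        def __init__(self):
            BaseLoader.__init__(self)
            compareField.__init__(self)

        def convert_to_standard_format(self, **kwargs):
            # each dict under 'tarballs' must carry the 'sha' field
            return {
                'name': kwargs['name'],
                'origin_url': kwargs['origin_url'],
                'tarballs': kwargs['tarballs'],
            }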
diff --git a/swh/loader/base/loader.py b/swh/loader/base/loader.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/base/loader.py
@@ -0,0 +1,471 @@
+# Copyright (C) 2019  The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import os
+import shutil
+
+from abc import abstractmethod
+from tempfile import mkdtemp
+
+from swh.core import tarball
+from swh.loader.core.utils import clean_dangling_folders
+from swh.loader.core.loader import BufferedLoader
+from swh.model.hashutil import MultiHash, HASH_BLOCK_SIZE
+from swh.storage.algos.snapshot import snapshot_get_all_branches
+from swh.model.from_disk import Directory
+
+from .build_revision import construct_revision
+
+from swh.model.identifiers import (
+    identifier_to_bytes, revision_identifier, snapshot_identifier
+)
+
+DEBUG_MODE = '** DEBUG MODE **'
+
+
+class BaseLoader(BufferedLoader, construct_revision):
+    """Base class for package-manager loaders.
+
+    Required overrides:
+        loader_name
+        class_name
+        convert_to_standard_format()
+
+    Optional overrides:
+        cleanup_artifact()
+        extract_metadata()
+
+    """
+    loader_name = """Name of the package manager"""  # e.g. pypi
+    class_name = """Name of the loader class"""  # e.g. PyPILoader
+
+    def __init__(self):
+        self.TEMPORARY_DIR_PREFIX_PATTERN = ('swh.loader.%s.'
+                                             % self.loader_name)
+        self.CONFIG_BASE_FILENAME = 'loader/%s' % self.loader_name
+
+        self.ADDITIONAL_CONFIG = {
+            'temp_directory': ('str',
+                               '/tmp/swh.loader.%s/' % self.loader_name),
+            'debug': ('bool', False),  # NOT FOR PRODUCTION
+        }
+        super().__init__(logging_class='swh.loader.%s.%s'
+                         % (self.loader_name, self.class_name))
+
+        self.local_cache = None
+        self.dir_path = None
+
+        temp_directory = self.config['temp_directory']
+        os.makedirs(temp_directory, exist_ok=True)
+
+        self.temp_directory = mkdtemp(
+            suffix='-%s' % os.getpid(),
+            prefix=self.TEMPORARY_DIR_PREFIX_PATTERN,
+            dir=temp_directory)
+
+        self.debug = self.config.get('debug', False)
+
+    @abstractmethod
+    def convert_to_standard_format(self, **kwargs):
+        """Fetch the metadata and convert it into a standard format.
+
+        The standard format is a dict with the keys:
+
+        `name` (str): Name of the package
+        `origin_url` (str): Origin url of the package
+        `tarballs` (list): A list of dicts where each dict contains
+            information related to a single version of the package. The
+            `url` key in each dict is mandatory and holds the tarball
+            url. Other keys are optional, depending on the availability
+            of metadata.
+
+        Note: the keys `nature` and `response` are reserved and cannot be
+        used in the dicts under the `tarballs` key.
+
+        Args:
+            **kwargs: Arbitrary keyword arguments passed by the lister.
+
+        Returns:
+            dict: Containing the information as directed by the
+                guidelines mentioned above
+
+        Example:
+            {
+                'name': '8sync',
+                'origin_url': 'https://ftp.gnu.org/gnu/8sync/',
+                'tarballs': [
+                    {'url': 'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz',
+                     'time_modified': 1562878592},
+                    {'url': 'https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz',
+                     'time_modified': 1599887203},
+                    ...
+                ]
+            }
+
+        """
+        pass
+
+    def cleanup_artifact(self, uncompressed_path):
+        """Clean up unnecessary files from the downloaded tarball, and
+        perform any special operation if needed.
+
+        The implementation of this method depends on the file structure
+        of the tarball. It is used to remove files that are not to be
+        archived (e.g. binaries) from the uncompressed tarball.
+
+        Args:
+            uncompressed_path (str): Path of the uncompressed tarball
+
+        Returns:
+            uncompressed_path (str): Path of the uncompressed tarball
+                after removing unnecessary files
+
+        """
+        return uncompressed_path
+
+    def extract_metadata(self, package_path, package_source_data):
+        """Fetch the metadata from the downloaded file.
+
+        """
+        return package_source_data
+
+    def prepare_origin_visit(self, **kwargs):
+        """Prepare the package visit.
+
+        Args:
+            **kwargs: Arbitrary keyword arguments passed by the lister.
+
+        """
+        # reset statuses
+        self._load_status = 'uneventful'
+        self._visit_status = 'full'
+        self.done = False
+        # fetch the package metadata from the upstream registry
+        self.package_details = self.convert_to_standard_format(**kwargs)
+        self.set_origin()
+        self.visit_date = None  # loader core will populate it
+
+    def set_origin(self):
+        """Assign a value to self.origin.
+
+        """
+        self.origin = {
+            'url': self.package_details['origin_url'],
+            'type': self.loader_name,
+        }
+
+    def prepare(self, **kwargs):
+        """Prepare the effective loading of the source tarballs of a
+        package.
+
+        Args:
+            **kwargs: Arbitrary keyword arguments passed by the lister.
+
+        """
+        self.package_contents = []
+        self.package_directories = []
+        self.package_revisions = []
+        self.package_source_data = []
+        self.package_temp_dir = os.path.join(self.temp_directory,
+                                             self.package_details['name'])
+
+        last_snapshot = self.last_snapshot()
+        self.known_versions = self.known_versions(last_snapshot)
+
+        self.new_artifacts = \
+            self.prepare_package_versions(self.package_details['tarballs'],
+                                          self.known_versions)
+
+    def last_snapshot(self):
+        """Retrieve the last snapshot of the package, if any.
+
+        """
+        visit = self.storage.origin_visit_get_latest(
+            self.origin['url'], require_snapshot=True)
+        if visit:
+            return snapshot_get_all_branches(self.storage, visit['snapshot'])
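
For orientation, the loader core (BufferedLoader) drives the hooks defined
here roughly as follows (simplified sketch of the control flow, not code
from this patch):

    loader.prepare_origin_visit(**kwargs)  # resolve origin, fetch metadata
    loader.prepare(**kwargs)               # build the artifact generator
    while loader.fetch_data():             # one release version per call
        loader.store_data()                # flush swh objects to storage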
+
+    def _prepare_package_version(self, package_source_data, tarball_request):
+        """Process a package release version.
+
+        The following operations are performed:
+
+        1. Download the tarball
+        2. Uncompress the tarball
+        3. Delete unnecessary files (optional)
+        4. Parse the file associated to the package version to extract
+           metadata (optional)
+
+        Args:
+            package_source_data (dict): Information about the tarball,
+                as provided by the lister.
+            tarball_request (Response): Server response for the tarball
+                url
+
+        Returns:
+            Tuple[dict, str]: tuples containing the following members:
+
+            * a dict holding package tarball information and metadata
+            * a string holding the path of the uncompressed package to
+              load into the archive
+
+        """
+        url = package_source_data['url']
+        tarball_path, hashes = self.download_generate_hash(tarball_request,
+                                                           url)
+        uncompressed_path = os.path.join(self.package_temp_dir,
+                                         'uncompressed',
+                                         os.path.basename(url))
+        package_source_data['nature'] = self.uncompress_tarball(
+            tarball_path, uncompressed_path)
+
+        # remove tarball
+        os.remove(tarball_path)
+
+        if self.tarball_invalid:
+            return None, None
+
+        package_path = self.cleanup_artifact(uncompressed_path)
+        package_source_data = self.extract_metadata(package_path,
+                                                    package_source_data)
+        self.package_source_data.append(package_source_data)
+        return package_source_data, package_path
+
+    def download_generate_hash(self, response, url):
+        """Store the file in a temporary directory and compute the hashes
+        of its content.
+
+        Args:
+            response (Response): Server response for the url
+            url (str): Url of the tarball
+
+        Returns:
+            Tuple of local (filepath, hashes of filepath)
+
+        """
+        length = int(response.headers['content-length'])
+        os.makedirs(self.package_temp_dir, exist_ok=True)
+        filepath = os.path.join(self.package_temp_dir, os.path.basename(url))
+
+        h = self.write_file(filepath, length, response)
+        self.check_file(filepath, length)
+
+        hashes = {
+            'length': length,
+            **h.hexdigest()
+        }
+        return filepath, hashes
+
+    def write_file(self, filepath, length, response):
+        """Stream the response to disk while feeding a MultiHash.
+
+        """
+        h = MultiHash(length=length)
+        with open(filepath, 'wb') as f:
+            for chunk in response.iter_content(chunk_size=HASH_BLOCK_SIZE):
+                h.update(chunk)
+                f.write(chunk)
+        return h
+
+    def check_file(self, filepath, length):
+        """Check that the size of the file on disk matches the announced
+        content length.
+
+        """
+        actual_length = os.path.getsize(filepath)
+        if length != actual_length:
+            raise ValueError('Error when checking size: %s != %s' % (
+                length, actual_length))
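
The write-and-check pair above streams the response through a MultiHash so
the checksums are computed while downloading. The same swh.model API, in a
standalone sketch:

    from swh.model.hashutil import MultiHash

    data = b'some tarball bytes'
    h = MultiHash(length=len(data))
    h.update(data)
    # dict of checksums keyed by algorithm name, e.g. 'sha1', 'sha256',
    # 'sha1_git', 'blake2s256'
    print(h.hexdigest())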
+
+    def uncompress_tarball(self, filepath, path):
+        """Uncompress a tarball.
+
+        Args:
+            filepath (str): Path of the tarball to uncompress
+            path (str): The destination folder where to uncompress the
+                tarball
+
+        Returns:
+            The nature of the tarball, zip or tar.
+
+        """
+        try:
+            self.tarball_invalid = False
+            return tarball.uncompress(filepath, path)
+        except Exception:
+            self.tarball_invalid = True
+            return None
+
+    def fetch_data(self):
+        """Called once per release artifact version (can be many for one
+        release).
+
+        Each call will:
+
+        - retrieve a release artifact (associated to a release version)
+        - compute the swh objects
+
+        Returns:
+            True as long as data to fetch exists
+
+        """
+        data = None
+        if self.done:
+            return False
+
+        try:
+            data = next(self.new_artifacts)
+            self._load_status = 'eventful'
+        except StopIteration:
+            self.done = True
+            return False
+
+        package_source_data, dir_path = data
+
+        # the package release tarball was corrupted
+        if self.tarball_invalid:
+            return not self.done
+
+        dir_path = dir_path.encode('utf-8')
+        directory = Directory.from_disk(path=dir_path, data=True)
+        objects = directory.collect()
+        objects = self.check_objects(objects)
+
+        self.package_contents = objects['content'].values()
+        self.package_directories = objects['directory'].values()
+
+        revision = self.compute_revision(directory,
+                                         package_source_data)
+
+        revision['id'] = identifier_to_bytes(
+            revision_identifier(revision))
+        self.package_revisions.append(revision)
+
+        self.update_known_version(package_source_data, revision)
+
+        self.log.debug('Removing unpacked package files at %s', dir_path)
+        shutil.rmtree(dir_path)
+
+        return not self.done
+
+    def update_known_version(self, package_source_data, revision):
+        """Add the freshly loaded version to the known versions, keyed
+        as per get_key().
+
+        """
+        key = self.get_key()
+        package_key = package_source_data[key]
+        self.known_versions[package_key] = revision['id']
+
+    def check_objects(self, objects):
+        """Ensure the 'content' and 'directory' keys are present in the
+        collected objects.
+
+        """
+        if 'content' not in objects:
+            objects['content'] = {}
+        if 'directory' not in objects:
+            objects['directory'] = {}
+        return objects
+
+    def store_data(self):
+        """Store the fetched data in the database.
+
+        """
+        self.maybe_load_contents(self.package_contents)
+        self.maybe_load_directories(self.package_directories)
+        self.maybe_load_revisions(self.package_revisions)
+
+        if self.done:
+            self.generate_and_load_snapshot()
+            self.flush()
+
+    def generate_and_load_snapshot(self):
+        """Generate and load the snapshot for the package visit.
+
+        """
+        snapshot = {
+            'branches': self.generate_branches(),
+        }
+        snapshot['id'] = identifier_to_bytes(snapshot_identifier(snapshot))
+        self.maybe_load_snapshot(snapshot)
+
+    def generate_branches(self):
+        """Build the snapshot branches from the loaded versions.
+
+        """
+        branches = {}
+        key = self.get_key()
+        for version in self.package_source_data:
+            branch_name = self.branch_name(version)
+            target = self.target_from_version(version[key])
+            branches[branch_name] = target
+            branches = self.find_head(branches, branch_name)
+
+            if not target:
+                self._visit_status = 'partial'
+
+        return branches
+
+    def find_head(self, branches, branch_name):
+        """Designate the HEAD alias among the branches.
+
+        The default implementation aliases HEAD to the branch processed
+        last; package managers with a notion of a latest version should
+        override this hookpoint.
+
+        """
+        branches[b'HEAD'] = {
+            'target_type': 'alias',
+            'target': branch_name,
+        }
+        return branches
+
+    def target_from_version(self, key_value):
+        target = self.known_versions.get(key_value)
+        return {
+            'target': target,
+            'target_type': 'revision',
+        } if target else None
+
+    def branch_name(self, version):
+        """Compute the branch name for a release version.
+
+        Must be overridden by the package-manager loader.
+
+        Args:
+            version (dict): All the data related to a particular version
+
+        Returns:
+            Branch name encoded in ascii
+
+        """
+        raise NotImplementedError
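
The branches mapping assembled by generate_branches() follows the usual
Software Heritage snapshot layout; an illustrative instance (branch name and
revision id are made up):

    branches = {
        b'releases/0.1.0': {
            'target': b'\x01' * 20,       # revision id
            'target_type': 'revision',
        },
        b'HEAD': {
            'target': b'releases/0.1.0',  # alias to a branch name
            'target_type': 'alias',
        },
    }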
+
+    def pre_cleanup(self):
+        """To prevent disk explosion if some other workers exploded in
+        mid-air (OOM killed), we try and clean up dangling files.
+
+        """
+        if self.debug:
+            self.log.warning('%s Will not pre-clean up temp dir %s' % (
+                DEBUG_MODE, self.temp_directory
+            ))
+            return
+        clean_dangling_folders(self.config['temp_directory'],
+                               pattern_check=self.TEMPORARY_DIR_PREFIX_PATTERN,
+                               log=self.log)
+
+    def cleanup(self):
+        """Clean up the temporary disk space used for downloading and
+        extracting the package tarballs.
+
+        """
+        if self.debug:
+            self.log.warning('%s Will not clean up temp dir %s' % (
+                DEBUG_MODE, self.temp_directory
+            ))
+            return
+        if os.path.exists(self.temp_directory):
+            self.log.debug('Clean up %s' % self.temp_directory)
+            shutil.rmtree(self.temp_directory)
diff --git a/swh/loader/base/tests/__init__.py b/swh/loader/base/tests/__init__.py
new file mode 100644
diff --git a/swh/loader/base/tests/test_build_revision.py b/swh/loader/base/tests/test_build_revision.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/base/tests/test_build_revision.py
@@ -0,0 +1,41 @@
+# Copyright (C) 2015-2018  The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import unittest
+from unittest.mock import Mock
+
+from swh.model.identifiers import normalize_timestamp
+
+from swh.loader.base import build_revision as build
+
+
+class TestConstructRevision(unittest.TestCase):
+    def test_compute_revision(self):
+        builder = build.construct_revision()
+        directory = Mock()
+        directory.hash = b'some-hash'
+        package_source_data = {
+            'nature': 'tar',
+            'date': '2019-01-01T00:00:00+00:00',
+        }
+
+        actual_revision = builder.compute_revision(
+            directory, package_source_data)
+
+        expected_date = normalize_timestamp(1546300800)
+        expected_revision = {
+            'metadata': {'package': package_source_data},
+            'date': {
+                'timestamp': expected_date,
+                'offset': build.construct_revision.UTC_OFFSET,
+            },
+            'committer_date': {
+                'timestamp': expected_date,
+                'offset': build.construct_revision.UTC_OFFSET,
+            },
+            'author': build.construct_revision.SWH_PERSON,
+            'committer': build.construct_revision.SWH_PERSON,
+            'type': 'tar',
+            'message': build.construct_revision.REVISION_MESSAGE,
+            'directory': b'some-hash',
+            'synthetic': True,
+            'parents': [],
+        }
+
+        self.assertEqual(actual_revision, expected_revision)
diff --git a/swh/loader/base/tests/test_download.py b/swh/loader/base/tests/test_download.py
new file mode 100644
diff --git a/swh/loader/base/tests/test_loader.py b/swh/loader/base/tests/test_loader.py
new file mode 100644
diff --git a/swh/loader/cran/loader.py b/swh/loader/cran/loader.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/cran/loader.py
@@ -0,0 +1,18 @@
+# Copyright (C) 2019  The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.loader.base.loader import BaseLoader
+from swh.loader.base.download import If_Modified_Since
+
+
+class CRANLoader(BaseLoader, If_Modified_Since):
+    loader_name = 'cran'
+    class_name = 'CRANLoader'
+
+    def __init__(self):
+        BaseLoader.__init__(self)
+        If_Modified_Since.__init__(self)
+
+    def convert_to_standard_format(self, **kwargs):
+        return {
+            'name': kwargs['name'],
+            'origin_url': kwargs['origin_url'],
+            'tarballs': kwargs['tarballs']
+        }
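
For completeness, a loader defined this way is driven like any other swh
loader, through the load() entry point inherited from the core loader (a
usage sketch; the argument values are made up):

    loader = CRANLoader()
    loader.load(
        name='A3',
        origin_url='https://cran.r-project.org/package=A3',
        tarballs=[{'url': 'https://cran.r-project.org/src/contrib/'
                          'A3_1.0.0.tar.gz'}])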