diff --git a/debian/control b/debian/control
index 3456ab0..b0d9f99 100644
--- a/debian/control
+++ b/debian/control
@@ -1,21 +1,24 @@
 Source: swh-loader-svn
 Maintainer: Software Heritage developers
 Section: python
 Priority: optional
 Build-Depends: debhelper (>= 9),
                dh-python,
                python3-all,
                python3-nose,
                python3-setuptools,
                python3-swh.core (>= 0.0.18),
                python3-swh.storage,
                python3-swh.model (>= 0.0.4),
                python3-swh.scheduler (>= 0.0.6),
+               python3-swh.loader.core,
+               python3-retrying,
+               python3-click,
                python3-vcversioner
 Standards-Version: 3.9.6
 Homepage: https://forge.softwareheritage.org/diffusion/DLDSVN/
 
 Package: python3-swh.loader.svn
 Architecture: all
 Depends: ${misc:Depends}, ${python3:Depends}
 Description: Software Heritage
diff --git a/requirements.txt b/requirements.txt
index 4a37d36..9914282 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,10 @@
 # Add here external Python modules dependencies, one per line. Module names
 # should match https://pypi.python.org/pypi names. For the full spec or
 # dependency lines, see https://pip.readthedocs.org/en/1.1/requirements.html
 vcversioner
 swh.core >= 0.0.18
 swh.model >= 0.0.4
 swh.scheduler >= 0.0.6
+swh.loader.core
+Click
+retrying
diff --git a/swh/loader/svn/libloader.py b/swh/loader/svn/libloader.py
deleted file mode 100644
index 0cb76a1..0000000
--- a/swh/loader/svn/libloader.py
+++ /dev/null
@@ -1,426 +0,0 @@
-# Copyright (C) 2015 The Software Heritage developers
-# See the AUTHORS file at the top-level directory of this distribution
-# License: GNU General Public License version 3, or any later version
-# See top-level LICENSE file for more information
-
-import logging
-import psycopg2
-import requests
-import traceback
-import uuid
-
-from retrying import retry
-
-from swh.core import config
-
-from swh.loader.dir import converters
-from swh.model.git import GitType
-from swh.storage import get_storage
-
-from swh.loader.svn.queue import QueuePerSizeAndNbUniqueElements
-from swh.loader.svn.queue import QueuePerNbUniqueElements
-from swh.loader.svn.queue import QueuePerNbElements
-
-
-def retry_loading(error):
-    """Retry policy when the database raises an integrity error"""
-    exception_classes = [
-        # raised when two parallel insertions insert the same data.
-        psycopg2.IntegrityError,
-        # raised when uWSGI restarts and hungs up on the worker.
-        requests.exceptions.ConnectionError,
-    ]
-
-    if not any(isinstance(error, exc) for exc in exception_classes):
-        return False
-
-    logger = logging.getLogger('swh.loader')
-
-    error_name = error.__module__ + '.' + error.__class__.__name__
-    logger.warning('Retry loading a batch', exc_info=False, extra={
-        'swh_type': 'storage_retry',
-        'swh_exception_type': error_name,
-        'swh_exception': traceback.format_exception(
-            error.__class__,
-            error,
-            error.__traceback__,
-        ),
-    })
-
-    return True
-
-
-def shallow_blob(obj):
-    """Convert a full swh content/blob to just what's needed by
-    swh-storage for filtering.
-
-    Returns:
-        A shallow copy of a full swh content/blob object.
-
-    """
-    return {
-        'sha1': obj['sha1'],
-        'sha256': obj['sha256'],
-        'sha1_git': obj['sha1_git'],
-        'length': obj['length']
-    }
-
-
-def shallow_tree(tree):
-    """Convert a full swh directory/tree to just what's needed by
-    swh-storage for filtering.
-
-    Returns:
-        A shallow copy of a full swh directory/tree object.
-
-    """
-    return tree['sha1_git']
-
-
-def shallow_commit(commit):
-    """Convert a full swh revision/commit to just what's needed by
-    swh-storage for filtering.
-
-    Returns:
-        A shallow copy of a full swh revision/commit object.
-
-    """
-    return commit['id']
-
-
-def shallow_tag(tag):
-    """Convert a full swh release/tag to just what's needed by
-    swh-storage for filtering.
-
-    Returns:
-        A shallow copy of a full swh release/tag object.
-
-    """
-    return tag['id']
-
-
-class SWHLoader(config.SWHConfig):
-    """A svn loader.
-
-    This will load the svn repository.
-
-    """
-    def __init__(self, config, revision_type, origin_id, logging_class):
-        self.config = config
-
-        self.origin_id = origin_id
-        self.storage = get_storage(config['storage_class'],
-                                   config['storage_args'])
-        self.revision_type = revision_type
-
-        self.log = logging.getLogger(logging_class)
-
-        self.contents = QueuePerSizeAndNbUniqueElements(
-            key='sha1',
-            max_nb_elements=self.config['content_packet_size'],
-            max_size=self.config['content_packet_block_size_bytes'])
-
-        self.contents_seen = set()
-
-        self.directories = QueuePerNbUniqueElements(
-            key='id',
-            max_nb_elements=self.config['directory_packet_size'])
-
-        self.directories_seen = set()
-
-        self.revisions = QueuePerNbUniqueElements(
-            key='id',
-            max_nb_elements=self.config['revision_packet_size'])
-
-        self.revisions_seen = set()
-
-        self.releases = QueuePerNbUniqueElements(
-            key='id',
-            max_nb_elements=self.config['release_packet_size'])
-
-        self.releases_seen = set()
-
-        self.occurrences = QueuePerNbElements(
-            self.config['occurrence_packet_size'])
-
-        l = logging.getLogger('requests.packages.urllib3.connectionpool')
-        l.setLevel(logging.WARN)
-
-    @retry(retry_on_exception=retry_loading, stop_max_attempt_number=3)
-    def send_contents(self, content_list):
-        """Actually send properly formatted contents to the database"""
-        num_contents = len(content_list)
-        if num_contents > 0:
-            log_id = str(uuid.uuid4())
-            self.log.debug("Sending %d contents" % num_contents,
-                           extra={
-                               'swh_type': 'storage_send_start',
-                               'swh_content_type': 'content',
-                               'swh_num': num_contents,
-                               'swh_id': log_id,
-                           })
-            self.storage.content_add(content_list)
-            self.log.debug("Done sending %d contents" % num_contents,
-                           extra={
-                               'swh_type': 'storage_send_end',
-                               'swh_content_type': 'content',
-                               'swh_num': num_contents,
-                               'swh_id': log_id,
-                           })
-
-    @retry(retry_on_exception=retry_loading, stop_max_attempt_number=3)
-    def send_directories(self, directory_list):
-        """Actually send properly formatted directories to the database"""
-        num_directories = len(directory_list)
-        if num_directories > 0:
-            log_id = str(uuid.uuid4())
-            self.log.debug("Sending %d directories" % num_directories,
-                           extra={
-                               'swh_type': 'storage_send_start',
-                               'swh_content_type': 'directory',
-                               'swh_num': num_directories,
-                               'swh_id': log_id,
-                           })
-            self.storage.directory_add(directory_list)
-            self.log.debug("Done sending %d directories" % num_directories,
-                           extra={
-                               'swh_type': 'storage_send_end',
-                               'swh_content_type': 'directory',
-                               'swh_num': num_directories,
-                               'swh_id': log_id,
-                           })
-
-    @retry(retry_on_exception=retry_loading, stop_max_attempt_number=3)
-    def send_revisions(self, revision_list):
-        """Actually send properly formatted revisions to the database"""
-        num_revisions = len(revision_list)
-        if num_revisions > 0:
-            log_id = str(uuid.uuid4())
-            self.log.debug("Sending %d revisions" % num_revisions,
-                           extra={
-                               'swh_type': 'storage_send_start',
-                               'swh_content_type': 'revision',
-                               'swh_num': num_revisions,
-                               'swh_id': log_id,
-                           })
-            self.storage.revision_add(revision_list)
-            self.log.debug("Done sending %d revisions" % num_revisions,
-                           extra={
-                               'swh_type': 'storage_send_end',
-                               'swh_content_type': 'revision',
-                               'swh_num': num_revisions,
-                               'swh_id': log_id,
-                           })
-
-    @retry(retry_on_exception=retry_loading, stop_max_attempt_number=3)
-    def send_releases(self, release_list):
-        """Actually send properly formatted releases to the database"""
-        num_releases = len(release_list)
-        if num_releases > 0:
-            log_id = str(uuid.uuid4())
-            self.log.debug("Sending %d releases" % num_releases,
-                           extra={
-                               'swh_type': 'storage_send_start',
-                               'swh_content_type': 'release',
-                               'swh_num': num_releases,
-                               'swh_id': log_id,
-                           })
-            self.storage.release_add(release_list)
-            self.log.debug("Done sending %d releases" % num_releases,
-                           extra={
-                               'swh_type': 'storage_send_end',
-                               'swh_content_type': 'release',
-                               'swh_num': num_releases,
-                               'swh_id': log_id,
-                           })
-
-    @retry(retry_on_exception=retry_loading, stop_max_attempt_number=3)
-    def send_occurrences(self, occurrence_list):
-        """Actually send properly formatted occurrences to the database"""
-        num_occurrences = len(occurrence_list)
-        if num_occurrences > 0:
-            log_id = str(uuid.uuid4())
-            self.log.debug("Sending %d occurrences" % num_occurrences,
-                           extra={
-                               'swh_type': 'storage_send_start',
-                               'swh_content_type': 'occurrence',
-                               'swh_num': num_occurrences,
-                               'swh_id': log_id,
-                           })
-            self.storage.occurrence_add(occurrence_list)
-            self.log.debug("Done sending %d occurrences" % num_occurrences,
-                           extra={
-                               'swh_type': 'storage_send_end',
-                               'swh_content_type': 'occurrence',
-                               'swh_num': num_occurrences,
-                               'swh_id': log_id,
-                           })
-
-    def filter_missing_blobs(self, blobs):
-        """Filter missing blob from swh.
-
-        """
-        max_content_size = self.config['content_packet_size_bytes']
-        blobs_per_sha1 = {}
-        shallow_blobs = []
-        for key, blob in ((b['sha1'], b) for b in blobs
-                          if b['sha1'] not in self.contents_seen):
-            blobs_per_sha1[key] = blob
-            shallow_blobs.append(shallow_blob(blob))
-            self.contents_seen.add(key)
-
-        for sha1 in self.storage.content_missing(shallow_blobs,
-                                                 key_hash='sha1'):
-            yield converters.blob_to_content(blobs_per_sha1[sha1],
-                                             max_content_size=max_content_size,
-                                             origin_id=self.origin_id)
-
-    def bulk_send_blobs(self, blobs):
-        """Format blobs as swh contents and send them to the database"""
-        threshold_reached = self.contents.add(
-            self.filter_missing_blobs(blobs))
-        if threshold_reached:
-            self.send_contents(self.contents.pop())
-
-    def filter_missing_trees(self, trees, objects):
-        """Filter missing tree from swh.
-
-        """
-        trees_per_sha1 = {}
-        shallow_trees = []
-        for key, tree in ((t['sha1_git'], t) for t in trees
-                          if t['sha1_git'] not in self.directories_seen):
-            trees_per_sha1[key] = tree
-            shallow_trees.append(shallow_tree(tree))
-            self.directories_seen.add(key)
-
-        for sha in self.storage.directory_missing(shallow_trees):
-            yield converters.tree_to_directory(trees_per_sha1[sha], objects)
-
-    def bulk_send_trees(self, objects, trees):
-        """Format trees as swh directories and send them to the database"""
-        threshold_reached = self.directories.add(
-            self.filter_missing_trees(trees, objects))
-        if threshold_reached:
-            self.send_contents(self.contents.pop())
-            self.send_directories(self.directories.pop())
-
-    def filter_missing_commits(self, commits):
-        """Filter missing commit from swh.
- - """ - commits_per_sha1 = {} - shallow_commits = [] - for key, commit in ((c['id'], c) for c in commits - if c['id'] not in self.revisions_seen): - commits_per_sha1[key] = commit - shallow_commits.append(shallow_commit(commit)) - self.revisions_seen.add(key) - - for sha in self.storage.revision_missing(shallow_commits, - type=self.revision_type): - yield commits_per_sha1[sha] - - def bulk_send_commits(self, commits): - """Format commits as swh revisions and send them to the database. - - """ - threshold_reached = self.revisions.add( - self.filter_missing_commits(commits)) - if threshold_reached: - self.send_contents(self.contents.pop()) - self.send_directories(self.directories.pop()) - self.send_revisions(self.revisions.pop()) - - def filter_missing_tags(self, tags): - """Filter missing tags from swh. - - """ - tags_per_sha1 = {} - shallow_tags = [] - for key, tag in ((t['id'], t) for t in tags - if t['id'] not in self.releases_seen): - tags_per_sha1[key] = tag - shallow_tags.append(shallow_tag(tag)) - self.releases_seen.add(key) - - for sha in self.storage.release_missing(shallow_tags, - type=self.revision_type): - yield tags_per_sha1[sha] - - def bulk_send_annotated_tags(self, tags): - """Format annotated tags (pygit2.Tag objects) as swh releases and send - them to the database. - - """ - threshold_reached = self.releases.add( - self.filter_missing_tags(tags)) - if threshold_reached: - self.send_contents(self.contents.pop()) - self.send_directories(self.directories.pop()) - self.send_revisions(self.revisions.pop()) - self.send_releases(self.releases.pop()) - - def bulk_send_refs(self, refs): - """Format git references as swh occurrences and send them to the - database. - - """ - threshold_reached = self.occurrences.add( - (converters.ref_to_occurrence(r) for r in refs)) - if threshold_reached: - self.send_contents(self.contents.pop()) - self.send_directories(self.directories.pop()) - self.send_revisions(self.revisions.pop()) - self.send_releases(self.releases.pop()) - self.send_occurrences(self.occurrences.pop()) - - def maybe_load_contents(self, contents): - if self.config['send_contents']: - self.bulk_send_blobs(contents) - else: - self.log.info('Not sending contents') - - def maybe_load_directories(self, trees, objects_per_path): - if self.config['send_directories']: - self.bulk_send_trees(objects_per_path, trees) - else: - self.log.info('Not sending directories') - - def maybe_load_revisions(self, revisions): - if self.config['send_revisions']: - self.bulk_send_commits(revisions) - else: - self.log.info('Not sending revisions') - - def maybe_load_releases(self, releases): - if self.config['send_releases']: - self.bulk_send_annotated_tags(releases) - else: - self.log.info('Not sending releases') - - def maybe_load_occurrences(self, occurrences): - if self.config['send_occurrences']: - self.bulk_send_refs(occurrences) - else: - self.log.info('Not sending occurrences') - - def load(self, objects_per_type, objects_per_path): - self.maybe_load_contents(objects_per_type[GitType.BLOB]) - self.maybe_load_directories(objects_per_type[GitType.TREE], - objects_per_path) - self.maybe_load_revisions(objects_per_type[GitType.COMM]) - self.maybe_load_releases(objects_per_type[GitType.RELE]) - self.maybe_load_occurrences(objects_per_type[GitType.REFS]) - - def flush(self): - if self.config['send_contents']: - self.send_contents(self.contents.pop()) - if self.config['send_directories']: - self.send_directories(self.directories.pop()) - if self.config['send_revisions']: - 
-            self.send_revisions(self.revisions.pop())
-        if self.config['send_occurrences']:
-            self.send_occurrences(self.occurrences.pop())
-        if self.config['send_releases']:
-            self.send_releases(self.releases.pop())
diff --git a/swh/loader/svn/loader.py b/swh/loader/svn/loader.py
index 66b53ad..d360358 100644
--- a/swh/loader/svn/loader.py
+++ b/swh/loader/svn/loader.py
@@ -1,211 +1,212 @@
 # Copyright (C) 2015 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import datetime
 
 from swh.core import utils
 from swh.model import git, hashutil
 from swh.model.git import GitType
-from swh.loader.svn import libloader, svn, converters
+from swh.loader.core import loader
+from swh.loader.svn import svn, converters
 
 
 def objects_per_type(objects_per_path):
     """Given an object dictionary returned by
     `swh.model.git.walk_and_compute_sha1_from_directory`, return a map
     grouped by type.
 
     Returns:
         Dictionary with keys:
         - GitType.BLOB: list of blobs
         - GitType.TREE: list of directories
 
     """
     objects = {
         GitType.BLOB: [],
         GitType.TREE: [],
     }
     for tree_path in objects_per_path:
         objs = objects_per_path[tree_path]
         for obj in objs:
             objects[obj['type']].append(obj)
 
     return objects
 
 
-class SvnLoader(libloader.SWHLoader):
+class SvnLoader(loader.SWHLoader):
     """Svn loader to load one svn repository.
 
     """
     def __init__(self, config, origin_id):
         super().__init__(config,
                          revision_type='svn',
                          origin_id=origin_id,
                          logging_class='swh.loader.svn.SvnLoader')
 
     def check_history_not_altered(self, svnrepo, revision_start, swh_rev):
         """Given a svn repository, check if the history was not tampered with.
 
         """
         revision_id = swh_rev['id']
         parents = swh_rev['parents']
 
         hash_data_per_revs = svnrepo.swh_hash_data_per_revision(revision_start,
                                                                 revision_start)
         rev, _, commit, objects_per_path = list(hash_data_per_revs)[0]
 
         dir_id = objects_per_path[git.ROOT_TREE_KEY][0]['sha1_git']
         swh_revision = converters.build_swh_revision(svnrepo.uuid,
                                                      commit,
                                                      rev,
                                                      dir_id,
                                                      parents)
         swh_revision_id = git.compute_revision_sha1_git(swh_revision)
 
         return swh_revision_id == revision_id
 
     def process_svn_revisions(self, svnrepo, revision_start, revision_end,
                               revision_parents):
         """Process revisions from revision_start to revision_end and send to
         swh for storage.
 
         At each svn revision, checkout the repository, compute the
         tree hash and blobs and send for swh storage to store.
         Then computes and yields the swh revision.
 
         Yields:
             swh revision
 
         """
         gen_revs = svnrepo.swh_hash_data_per_revision(revision_start,
                                                       revision_end)
         for rev, nextrev, commit, objects_per_path in gen_revs:
             # compute the fs tree's checksums
             dir_id = objects_per_path[git.ROOT_TREE_KEY][0]['sha1_git']
             swh_revision = converters.build_swh_revision(svnrepo.uuid,
                                                          commit,
                                                          rev,
                                                          dir_id,
                                                          revision_parents[rev])
             swh_revision['id'] = git.compute_revision_sha1_git(swh_revision)
 
             self.log.debug('rev: %s, swhrev: %s' % (
                 rev, hashutil.hash_to_hex(swh_revision['id'])))
 
             if nextrev:
                 revision_parents[nextrev] = [swh_revision['id']]
 
             objects = objects_per_type(objects_per_path)
 
             self.maybe_load_contents(objects[GitType.BLOB])
             self.maybe_load_directories(objects[GitType.TREE],
                                         objects_per_path)
 
             yield swh_revision
 
     def process_swh_revisions(self, svnrepo, revision_start, revision_end,
                               revision_parents):
         """Process and store revision to swh (sent by by blocks of
         'revision_packet_size')
 
         Returns:
             The latest revision stored.
""" for revisions in utils.grouper( self.process_svn_revisions(svnrepo, revision_start, revision_end, revision_parents), self.config['revision_packet_size']): revs = list(revisions) self.maybe_load_revisions(revs) return revs[-1] def process_swh_occurrence(self, revision, origin): """Process and load the occurrence pointing to the latest revision. """ occ = converters.build_swh_occurrence(revision['id'], origin['id'], datetime.datetime.utcnow()) self.log.debug('occ: %s' % occ) self.maybe_load_occurrences([occ]) def process(self, svn_url, origin, destination_path): """Load a svn repository in swh. Checkout the svn repository locally in destination_path. Args: - svn_url: svn repository url to import - origin: Dictionary origin - id: origin's id - url: url origin we fetched - type: type of the origin Returns: Dictionary with the following keys: - status: mandatory, the status result as a boolean - stderr: optional when status is True, mandatory otherwise """ svnrepo = svn.SvnRepo(svn_url, origin['id'], self.storage, destination_path) try: swh_rev = svnrepo.swh_previous_revision() if swh_rev: extra_headers = dict(swh_rev['metadata']['extra_headers']) revision_start = extra_headers['svn_revision'] revision_parents = { revision_start: swh_rev['parents'] } else: revision_start = 1 revision_parents = { revision_start: [] } svnrepo.fork(revision_start) self.log.debug('svn co %s@%s' % (svn_url, revision_start)) if swh_rev and not self.check_history_not_altered(svnrepo, revision_start, swh_rev): msg = 'History of svn %s@%s history modified. Skipping...' % ( svn_url, revision_start) self.log.warn(msg) return {'status': False, 'stderr': msg} revision_end = svnrepo.head_revision() self.log.debug('[revision_start-revision_end]: [%s-%s]' % ( revision_start, revision_end)) if revision_start == revision_end and revision_start is not 1: self.log.info('%s@%s already injected.' % (svn_url, revision_end)) return {'status': True} self.log.info('Repo %s ready to be processed.' % svnrepo) # process and store revision to swh (sent by by blocks of # 'revision_packet_size') latest_rev = self.process_swh_revisions(svnrepo, revision_start, revision_end, revision_parents) self.process_swh_occurrence(latest_rev, origin) # flush eventual remaining data self.flush() finally: svnrepo.clean_fs() return {'status': True} diff --git a/swh/loader/svn/queue.py b/swh/loader/svn/queue.py deleted file mode 100644 index f32e354..0000000 --- a/swh/loader/svn/queue.py +++ /dev/null @@ -1,101 +0,0 @@ -# Copyright (C) 2015-2016 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - - -class QueuePerNbElements(): - """Basic queue which holds the nb of elements it contains. - - """ - def __init__(self, max_nb_elements): - self.reset() - self.max_nb_elements = max_nb_elements - - def add(self, elements): - if not isinstance(elements, list): - elements = list(elements) - self.elements.extend(elements) - self.count += len(elements) - return self.count >= self.max_nb_elements - - def pop(self): - elements = self.elements - self.reset() - return elements - - def reset(self): - self.elements = [] - self.count = 0 - - -class QueuePerSizeAndNbUniqueElements(): - """Queue which permits to add unknown elements and holds the current - size of the queue. 
- - """ - def __init__(self, max_nb_elements, max_size, key): - self.reset() - self.max_nb_elements = max_nb_elements - self.max_size = max_size - self.key = key - self.keys = set() - - def _add_element(self, e): - k = e[self.key] - if k not in self.keys: - self.keys.add(k) - self.elements.append(e) - self.size += e['length'] - self.count += 1 - - def add(self, elements): - for e in elements: - self._add_element(e) - return self.size >= self.max_size or \ - self.count >= self.max_nb_elements - - def pop(self): - elements = self.elements - self.reset() - return elements - - def reset(self): - self.elements = [] - self.keys = set() - self.size = 0 - self.count = 0 - - -class QueuePerNbUniqueElements(): - """Queue which permits to add unknown elements and knows the actual - count of elements it held. - - """ - def __init__(self, max_nb_elements, key): - self.reset() - self.max_nb_elements = max_nb_elements - self.key = key - self.keys = set() - - def _add_element(self, e): - k = e[self.key] - if k not in self.keys: - self.keys.add(k) - self.elements.append(e) - self.count += 1 - - def add(self, elements): - for e in elements: - self._add_element(e) - return self.count >= self.max_nb_elements - - def pop(self): - elements = self.elements - self.reset() - return elements - - def reset(self): - self.elements = [] - self.keys = set() - self.count = 0 diff --git a/swh/loader/svn/tests/test_libloader.py b/swh/loader/svn/tests/test_libloader.py deleted file mode 100644 index d6ba91c..0000000 --- a/swh/loader/svn/tests/test_libloader.py +++ /dev/null @@ -1,85 +0,0 @@ -# Copyright (C) 2015-2016 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import unittest - -from nose.tools import istest - -from swh.loader.svn import libloader - - -class TestLibLoader(unittest.TestCase): - @istest - def shallow_blob(self): - # when - actual_blob = libloader.shallow_blob({ - 'length': 1451, - 'sha1_git': - b'\xd1\xdd\x9a@\xeb\xf6!\x99\xd4[S\x05\xa8Y\xa3\x80\xa7\xb1;\x9c', - 'name': b'LDPCL', - 'type': b'blob', - 'sha256': - b'\xe6it!\x99\xb37UT\x8f\x0e\x8f\xd7o\x92"\xce\xa3\x1d\xd2\xe5D>M\xaaj/\x03\x138\xad\x1b', # noqa - 'perms': b'100644', - 'sha1': - b'.\x18Y\xd6M\x8c\x9a\xa4\xe1\xf1\xc7\x95\x082\xcf\xc9\xd8\nV)', - 'path': - b'/tmp/tmp.c86tq5o9.swh.loader/pkg-doc-linux/copyrights/non-free/LDPCL' # noqa - }) - - # then - self.assertEqual(actual_blob, { - 'sha1': - b'.\x18Y\xd6M\x8c\x9a\xa4\xe1\xf1\xc7\x95\x082\xcf\xc9\xd8\nV)', - 'sha1_git': - b'\xd1\xdd\x9a@\xeb\xf6!\x99\xd4[S\x05\xa8Y\xa3\x80\xa7\xb1;\x9c', - 'sha256': - b'\xe6it!\x99\xb37UT\x8f\x0e\x8f\xd7o\x92"\xce\xa3\x1d\xd2\xe5D>M\xaaj/\x03\x138\xad\x1b', # noqa - 'length': 1451, - }) - - @istest - def shallow_tree(self): - # when - actual_shallow_tree = libloader.shallow_tree({ - 'length': 1451, - 'sha1_git': - b'tree-id', - 'type': b'tree', - 'sha256': - b'\xe6it!\x99\xb37UT\x8f\x0e\x8f\xd7o\x92"\xce\xa3\x1d\xd2\xe5D>M\xaaj/\x03\x138\xad\x1b', # noqa - 'perms': b'100644', - 'sha1': - b'.\x18Y\xd6M\x8c\x9a\xa4\xe1\xf1\xc7\x95\x082\xcf\xc9\xd8\nV)', - }) - - # then - self.assertEqual(actual_shallow_tree, b'tree-id') - - @istest - def shallow_commit(self): - # when - actual_shallow_commit = libloader.shallow_commit({ - 'sha1_git': - b'\xd1\xdd\x9a@\xeb\xf6!\x99\xd4[S\x05\xa8Y\xa3\x80\xa7\xb1;\x9c', - 'type': b'commit', - 'id': b'let-me-see-some-id', - }) - - # then - 
-        self.assertEqual(actual_shallow_commit, b'let-me-see-some-id')
-
-    @istest
-    def shallow_tag(self):
-        # when
-        actual_shallow_tag = libloader.shallow_tag({
-            'sha1':
-            b'\xd1\xdd\x9a@\xeb\xf6!\x99\xd4[S\x05\xa8Y\xa3\x80\xa7\xb1;\x9c',
-            'type': b'tag',
-            'id': b'this-is-not-the-id-you-are-looking-for',
-        })
-
-        # then
-        self.assertEqual(actual_shallow_tag, b'this-is-not-the-id-you-are-looking-for')  # noqa
diff --git a/swh/loader/svn/tests/test_queue.py b/swh/loader/svn/tests/test_queue.py
deleted file mode 100644
index c9e6962..0000000
--- a/swh/loader/svn/tests/test_queue.py
+++ /dev/null
@@ -1,141 +0,0 @@
-# Copyright (C) 2015-2016 The Software Heritage developers
-# See the AUTHORS file at the top-level directory of this distribution
-# License: GNU General Public License version 3, or any later version
-# See top-level LICENSE file for more information
-
-import unittest
-
-from nose.tools import istest
-
-from swh.loader.svn.queue import QueuePerNbElements
-from swh.loader.svn.queue import QueuePerNbUniqueElements
-from swh.loader.svn.queue import QueuePerSizeAndNbUniqueElements
-
-
-class TestQueuePerNbElements(unittest.TestCase):
-    @istest
-    def simple_queue_behavior(self):
-        max_nb_elements = 10
-        queue = QueuePerNbElements(max_nb_elements=max_nb_elements)
-
-        elements = [1, 3, 4, 9, 20, 30, 40]
-        actual_threshold = queue.add(elements)
-
-        self.assertFalse(actual_threshold, len(elements) > max_nb_elements)
-
-        # pop returns the content and reset the queue
-        actual_elements = queue.pop()
-        self.assertEquals(actual_elements, elements)
-        self.assertEquals(queue.pop(), [])
-
-        # duplicates can be integrated
-        new_elements = [1, 1, 3, 4, 9, 20, 30, 40, 12, 14, 2]
-        actual_threshold = queue.add(new_elements)
-
-        self.assertTrue(actual_threshold)
-        self.assertEquals(queue.pop(), new_elements)
-
-        # reset is destructive too
-        queue.add(new_elements)
-        queue.reset()
-
-        self.assertEquals(queue.pop(), [])
-
-
-def to_some_objects(elements, key):
-    for elt in elements:
-        yield {key: elt}
-
-
-class TestQueuePerNbUniqueElements(unittest.TestCase):
-    @istest
-    def queue_with_unique_key_behavior(self):
-        max_nb_elements = 5
-        queue = QueuePerNbUniqueElements(max_nb_elements=max_nb_elements,
-                                         key='id')
-
-        # no duplicates
-        elements = list(to_some_objects([1, 1, 3, 4, 9], key='id'))
-        actual_threshold = queue.add(elements)
-
-        self.assertFalse(actual_threshold, len(elements) > max_nb_elements)
-
-        # pop returns the content and reset the queue
-        actual_elements = queue.pop()
-        self.assertEquals(actual_elements,
-                          [{'id': 1}, {'id': 3}, {'id': 4}, {'id': 9}])
-        self.assertEquals(queue.pop(), [])
-
-        new_elements = list(to_some_objects(
-            [1, 3, 4, 9, 20],
-            key='id'))
-        actual_threshold = queue.add(new_elements)
-
-        self.assertTrue(actual_threshold)
-
-        # reset is destructive too
-        queue.add(new_elements)
-        queue.reset()
-
-        self.assertEquals(queue.pop(), [])
-
-
-def to_some_complex_objects(elements, key):
-    for elt, size in elements:
-        yield {key: elt, 'length': size}
-
-
-class TestQueuePerSizeAndNbUniqueElements(unittest.TestCase):
-    @istest
-    def queue_with_unique_key_and_size_behavior(self):
-        max_nb_elements = 5
-        max_size = 100
-        queue = QueuePerSizeAndNbUniqueElements(
-            max_nb_elements=max_nb_elements,
-            max_size=max_size,
-            key='k')
-
-        # size total exceeded, nb elements not reached, still the
-        # threshold is deemed reached
-        elements = list(to_some_complex_objects([(1, 10),
-                                                 (2, 20),
-                                                 (3, 30),
-                                                 (4, 100)], key='k'))
-        actual_threshold = queue.add(elements)
-
-        self.assertTrue(actual_threshold)
-
-        # pop returns the content and reset the queue
-        actual_elements = queue.pop()
-        self.assertEquals(actual_elements,
-                          [{'k': 1, 'length': 10},
-                           {'k': 2, 'length': 20},
-                           {'k': 3, 'length': 30},
-                           {'k': 4, 'length': 100}])
-        self.assertEquals(queue.pop(), [])
-
-        # size threshold not reached, nb elements reached, the
-        # threshold is considered reached
-        new_elements = list(to_some_complex_objects(
-            [(1, 10), (3, 5), (4, 2), (9, 1), (20, 0)],
-            key='k'))
-        actual_threshold = queue.add(new_elements)
-
-        queue.reset()
-
-        self.assertTrue(actual_threshold)
-
-        # nb elements threshold not reached, nor the top number of
-        # elements, the threshold is not reached
-        new_elements = list(to_some_complex_objects(
-            [(1, 10)],
-            key='k'))
-        actual_threshold = queue.add(new_elements)
-
-        self.assertFalse(actual_threshold)
-
-        # reset is destructive too
-        queue.add(new_elements)
-        queue.reset()
-
-        self.assertEquals(queue.pop(), [])
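
Review note (appended commentary, not part of the patch above): this change replaces the
repo-local libloader.SWHLoader and its queue module with swh.loader.core's loader.SWHLoader.
The deleted queue.py pins down the buffering contract that the loader's maybe_load_* and
flush() methods rely on: each buffer deduplicates on a key, returns True from add() once a
count (or byte-size) threshold is crossed, and drains on pop(). The sketch below restates
the simplest of those buffers in condensed form so the behaviour can be compared against
swh.loader.core; the class name and semantics are taken from the deleted
swh/loader/svn/queue.py, and it is only an assumption here that swh.loader.core provides
an equivalent.

# Condensed restatement of the deleted QueuePerNbUniqueElements.
# Illustrative sketch only -- not part of the patch.

class QueuePerNbUniqueElements:
    """Buffer that deduplicates elements on `key` and signals once at
    least `max_nb_elements` unique elements are held."""

    def __init__(self, max_nb_elements, key):
        self.max_nb_elements = max_nb_elements
        self.key = key
        self.reset()

    def add(self, elements):
        for e in elements:
            k = e[self.key]
            if k not in self.keys:   # duplicates are dropped silently
                self.keys.add(k)
                self.elements.append(e)
        # True tells the caller to pop() and send the batch to storage.
        return len(self.elements) >= self.max_nb_elements

    def pop(self):
        # Drain the buffer and reset it for the next batch.
        elements = self.elements
        self.reset()
        return elements

    def reset(self):
        self.elements = []
        self.keys = set()

# Behaviour pinned down by the deleted test_queue.py:
q = QueuePerNbUniqueElements(max_nb_elements=3, key='id')
assert q.add([{'id': 1}, {'id': 1}]) is False    # duplicate counted once
assert q.add([{'id': 2}, {'id': 3}]) is True     # threshold reached
assert q.pop() == [{'id': 1}, {'id': 2}, {'id': 3}]
assert q.pop() == []                             # pop resets the queue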