diff --git a/requirements-test.txt b/requirements-test.txt --- a/requirements-test.txt +++ b/requirements-test.txt @@ -1,4 +1,4 @@ pytest -swh.model +swh.model >= 0.0.32 pytest-kafka hypothesis diff --git a/swh/journal/backfill.py b/swh/journal/backfill.py new file mode 100644 --- /dev/null +++ b/swh/journal/backfill.py @@ -0,0 +1,463 @@ +# Copyright (C) 2017-2019 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +"""The backfiller's goal is to produce part or all of the swh objects +from the storage to the journal. + +At the moment, a first implementation is the JournalBackfiller. It +reads the objects from the storage and sends every inflated object to +the journal. + +""" + +import logging + +from .direct_writer import DirectKafkaWriter + +from swh.core.db import BaseDb +from swh.storage.converters import db_to_release, db_to_revision + + +logger = logging.getLogger(__name__) + +PARTITION_KEY = { + 'content': 'sha1', + 'skipped_content': None, # unused + 'directory': 'id', + 'revision': 'revision.id', + 'release': 'release.id', + 'snapshot': 'id', + 'origin': 'id', + 'origin_visit': 'origin_visit.origin', +} + +COLUMNS = { + 'content': [ + 'sha1', 'sha1_git', 'sha256', 'blake2s256', 'length', 'status', + 'ctime' + ], + 'skipped_content': [ + 'sha1', 'sha1_git', 'sha256', 'blake2s256', 'length', 'ctime', + 'status', 'reason', + ], + 'directory': ['id', 'dir_entries', 'file_entries', 'rev_entries'], + 'revision': [ + ("revision.id", "id"), + "date", + "date_offset", + "committer_date", + "committer_date_offset", + "type", + "directory", + "message", + "synthetic", + "metadata", + "date_neg_utc_offset", + "committer_date_neg_utc_offset", + ("array(select parent_id::bytea from revision_history rh " + "where rh.id = revision.id order by rh.parent_rank asc)", + "parents"), + ("a.id", "author_id"), + ("a.name", "author_name"), + ("a.email", "author_email"), + ("a.fullname", "author_fullname"), + ("c.id", "committer_id"), + ("c.name", "committer_name"), + ("c.email", "committer_email"), + ("c.fullname", "committer_fullname"), + ], + 'release': [ + ("release.id", "id"), + "date", + "date_offset", + "comment", + ("release.name", "name"), + "synthetic", + "date_neg_utc_offset", + "target", + "target_type", + ("a.id", "author_id"), + ("a.name", "author_name"), + ("a.email", "author_email"), + ("a.fullname", "author_fullname"), + ], + 'snapshot': ['id', 'object_id'], + 'origin': ['type', 'url'], + 'origin_visit': ['type', 'url', 'date', 'snapshot', 'status'], +} + + +JOINS = { + 'release': ['person a on release.author=a.id'], + 'revision': ['person a on revision.author=a.id', + 'person c on revision.committer=c.id'], + 'origin_visit': ['origin on origin_visit.origin=origin.id'], +} + + +def directory_converter(db, directory): + """Convert directory from the flat representation to swh model + compatible objects. 
+ + """ + columns = ['target', 'name', 'perms'] + query_template = ''' + select %(columns)s + from directory_entry_%(type)s + where id in %%s + ''' + + types = ['file', 'dir', 'rev'] + + entries = [] + with db.cursor() as cur: + for type in types: + ids = directory.pop('%s_entries' % type) + if not ids: + continue + query = query_template % { + 'columns': ','.join(columns), + 'type': type, + } + cur.execute(query, (tuple(ids), )) + for row in cur: + entry = dict(zip(columns, row)) + entry['type'] = type + entries.append(entry) + + directory['entries'] = entries + return directory + + +def revision_converter(db, revision): + """Convert revision from the flat representation to swh model + compatible objects. + + """ + revision = db_to_revision(revision) + if 'author' in revision and revision['author']: + del revision['author']['id'] + if 'committer' in revision and revision['committer']: + del revision['committer']['id'] + return revision + + +def release_converter(db, release): + """Convert release from the flat representation to swh model + compatible objects. + + """ + release = db_to_release(release) + if 'author' in release and release['author']: + del release['author']['id'] + return release + + +def snapshot_converter(db, snapshot): + """Convert snapshot from the flat representation to swh model + compatible objects. + + """ + columns = ['name', 'target', 'target_type'] + query = ''' + select %s + from snapshot_branches sbs + inner join snapshot_branch sb on sb.object_id=sbs.branch_id + where sbs.snapshot_id=%%s + ''' % ', '.join(columns) + with db.cursor() as cur: + cur.execute(query, (snapshot.pop('object_id'), )) + branches = {} + for name, *row in cur: + branch = dict(zip(columns[1:], row)) + if not branch['target'] and not branch['target_type']: + branch = None + branches[name] = branch + + snapshot['branches'] = branches + return snapshot + + +def origin_visit_converter(db, origin_visit): + origin = { + 'type': origin_visit.pop('type'), + 'url': origin_visit.pop('url'), + } + origin_visit['origin'] = origin + return origin_visit + + +CONVERTERS = { + 'directory': directory_converter, + 'revision': revision_converter, + 'release': release_converter, + 'snapshot': snapshot_converter, + 'origin_visit': origin_visit_converter, +} + +def _compute_shift_bits(numbits): + """Compute length and shift bits from numbits + + """ + q, r = divmod(numbits, 8) + length = q + (r != 0) + shift_bits = 8 - r if r else 0 + return length, shift_bits + + +def object_to_offset(object_id, numbits): + """Compute the index of the range containing object id, when dividing + space into 2^numbits. + + Args: + object_id (str): The hex representation of object_id + numbits (int): Number of bits in which we divide input space + + Returns: + The index of the range containing object id + + """ + length, shift_bits = _compute_shift_bits(numbits) + truncated_id = object_id[:length * 2] + if len(truncated_id) < length * 2: + truncated_id += '0' * (length * 2 - len(truncated_id)) + + truncated_id_bytes = bytes.fromhex(truncated_id) + return int.from_bytes(truncated_id_bytes, byteorder='big') >> shift_bits + + +def byte_ranges(numbits, start_object=None, end_object=None): + """Generate start/end pairs of bytes spanning numbits bits and + constrained by optional start_object and end_object. 
+ + Args: + numbits (int): Number of bits in which we divide input space + start_object (str): Hex object id contained in the first range + returned + end_object (str): Hex object id contained in the last range + returned + + Yields: + 2^numbits pairs of bytes + + """ + length, shift_bits = _compute_shift_bits(numbits) + + def to_bytes(i): + return int.to_bytes(i << shift_bits, length=length, byteorder='big') + + start_offset = 0 + end_offset = 1 << numbits + + if start_object is not None: + start_offset = object_to_offset(start_object, numbits) + if end_object is not None: + end_offset = object_to_offset(end_object, numbits) + 1 + + for start in range(start_offset, end_offset): + end = start + 1 + + if start == 0: + yield None, to_bytes(end) + elif end == 1 << numbits: + yield to_bytes(start), None + else: + yield to_bytes(start), to_bytes(end) + + +def integer_ranges(start, end, block_size=1000): + """Compute range of integers between [start, end]. + + """ + for start in range(start, end, block_size): + if start == 0: + yield None, block_size + elif start + block_size > end: + yield start, end + else: + yield start, start + block_size + + +RANGE_GENERATORS = { + 'content': lambda start, end: byte_ranges(24, start, end), + 'skipped_content': lambda start, end: [(None, None)], + 'directory': lambda start, end: byte_ranges(24, start, end), + 'revision': lambda start, end: byte_ranges(24, start, end), + 'release': lambda start, end: byte_ranges(16, start, end), + 'snapshot': lambda start, end: byte_ranges(16, start, end), + 'origin': integer_ranges, + 'origin_visit': integer_ranges, +} + + +def compute_query(obj_type, start, end): + """Compute the query for object_type between start and end range. + + """ + columns = COLUMNS.get(obj_type) + join_specs = JOINS.get(obj_type, []) + join_clause = '\n'.join('left join %s' % clause for clause in join_specs) + + where = [] + where_args = [] + if start: + where.append('%(keys)s >= %%s') + where_args.append(start) + if end: + where.append('%(keys)s < %%s') + where_args.append(end) + + where_clause = '' + if where: + where_clause = ('where ' + ' and '.join(where)) % { + 'keys': '(%s)' % PARTITION_KEY[obj_type] + } + + column_specs = [] + column_aliases = [] + for column in columns: + if isinstance(column, str): + column_specs.append(column) + column_aliases.append(column) + else: + column_specs.append('%s as %s' % column) + column_aliases.append(column[1]) + + query = ''' +select %(columns)s +from %(table)s +%(join)s +%(where)s + ''' % { + 'columns': ','.join(column_specs), + 'table': obj_type, + 'join': join_clause, + 'where': where_clause, + } + + return query, where_args, column_aliases + + +def fetch(db, obj_type, start, end): + """Fetch all obj_type's identifiers from db. + + This opens one connection, stream objects and when done, close + the connection. 
+ + Args: + db (BaseDb): Db connection object + obj_type (str): Object type + start (Union[bytes|Tuple]): Range start identifier + end (Union[bytes|Tuple]): Range end identifier + + Raises: + ValueError if obj_type is not supported + + Yields: + Objects in the given range + + """ + query, where_args, column_aliases = compute_query(obj_type, start, end) + converter = CONVERTERS.get(obj_type) + with db.cursor() as cursor: + logger.debug('Fetching data for table %s', obj_type) + logger.debug('query: %s %s', query, where_args) + cursor.execute(query, where_args) + for row in cursor: + record = dict(zip(column_aliases, row)) + if converter: + record = converter(db, record) + + logger.debug('record: %s' % record) + yield record + + +MANDATORY_KEYS = ['brokers', 'storage_dbconn', 'final_prefix', 'client_id'] + + +class JournalBackfiller: + """Read the storage's objects and send to the journal's topics. + + This is designed to be run periodically. + + """ + def __init__(self, config=None): + self.config = config + self.check_config(config) + + def check_config(self, config): + missing_keys = [] + for key in MANDATORY_KEYS: + if not config.get(key): + missing_keys.append(key) + + if missing_keys: + raise ValueError( + 'Configuration error: The following keys must be' + ' provided: %s' % (','.join(missing_keys), )) + + def parse_arguments(self, object_type, start_object, end_object): + """Parse arguments + + Raises: + ValueError for unsupported object type + ValueError if object ids are not parseable + + Returns: + Parsed start and end object ids + + """ + if object_type not in COLUMNS: + raise ValueError('Object type %s is not supported. ' + 'The only possible values are %s' % ( + object_type, ', '.join(COLUMNS.keys()))) + + if object_type in ['origin', 'origin_visit']: + if start_object: + start_object = int(start_object) + else: + start_object = 0 + if end_object: + end_object = int(end_object) + else: + end_object = 100 * 1000 * 1000 # hard-coded limit + + return start_object, end_object + + def run(self, object_type, start_object, end_object, dry_run=False): + """Read storage's subscribed object types and send them to the + journal's reading topic. + + """ + start_object, end_object = self.parse_arguments( + object_type, start_object, end_object) + + db = BaseDb.connect(self.config['storage_dbconn']) + writer = DirectKafkaWriter( + brokers=self.config['brokers'], + prefix=self.config['final_prefix'], + client_id=self.config['client_id'] + ) + for range_start, range_end in RANGE_GENERATORS[object_type]( + start_object, end_object): + logger.info('Processing %s range %s to %s', object_type, + range_start, range_end) + + data_sent = False + for obj in fetch( + db, object_type, start=range_start, end=range_end, + ): + if dry_run: + continue + data_sent = True + writer.write_addition(object_type=object_type, + object_=obj) + + if data_sent: + writer.producer.flush() + + +if __name__ == '__main__': + print('Please use the "swh-journal backfiller run" command') diff --git a/swh/journal/checker.py b/swh/journal/checker.py deleted file mode 100644 --- a/swh/journal/checker.py +++ /dev/null @@ -1,137 +0,0 @@ -# Copyright (C) 2017 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -"""Module defining journal checker classes. - -Those checker goal is to send back all, or missing objects from the -journal queues. 
- -At the moment, a first naive implementation is the -SimpleCheckerProducer. It simply reads the objects from the -storage and sends every object identifier back to the journal. - -""" - -import psycopg2 - -from kafka import KafkaProducer - -from swh.core.config import SWHConfig -from .serializers import key_to_kafka - - -TYPE_TO_PRIMARY_KEY = { - 'origin': ['id'], - 'content': ['sha1', 'sha1_git', 'sha256'], - 'directory': ['id'], - 'revision': ['id'], - 'release': ['id'], - 'origin_visit': ['origin', 'visit'], - 'skipped_content': ['sha1', 'sha1_git', 'sha256'], -} - - -def entry_to_bytes(entry): - """Convert an entry coming from the database to bytes""" - if isinstance(entry, memoryview): - return entry.tobytes() - if isinstance(entry, tuple): - return [entry_to_bytes(value) for value in entry] - return entry - - -def fetch(db_conn, obj_type): - """Fetch all obj_type's identifiers from db. - - This opens one connection, stream objects and when done, close - the connection. - - Raises: - ValueError if obj_type is not supported - - Yields: - Identifiers for the specific object_type - - """ - primary_key = TYPE_TO_PRIMARY_KEY.get(obj_type) - if not primary_key: - raise ValueError('The object type %s is not supported. ' - 'Only possible values are %s' % ( - obj_type, TYPE_TO_PRIMARY_KEY.keys())) - - primary_key_str = ','.join(primary_key) - query = 'select %s from %s order by %s' % ( - primary_key_str, obj_type, primary_key_str) - server_side_cursor_name = 'swh.journal.%s' % obj_type - - with psycopg2.connect(db_conn) as db: - cursor = db.cursor(name=server_side_cursor_name) - cursor.execute(query) - for o in cursor: - yield dict(zip(primary_key, entry_to_bytes(o))) - - -class SimpleCheckerProducer(SWHConfig): - """Class in charge of reading the storage's objects and sends those - back to the publisher queue. - - This is designed to be run periodically. - - """ - DEFAULT_CONFIG = { - 'brokers': ('list[str]', ['getty.internal.softwareheritage.org']), - 'temporary_prefix': ('str', 'swh.tmp_journal.new'), - 'publisher_id': ('str', 'swh.journal.publisher.test'), - 'object_types': ('list[str]', ['content', 'revision', 'release']), - 'storage_dbconn': ('str', 'service=swh-dev'), - } - - CONFIG_BASE_FILENAME = 'journal/checker' - - def __init__(self, extra_configuration=None): - self.config = config = self.parse_config_file() - if extra_configuration: - config.update(extra_configuration) - - self.object_types = self.config['object_types'] - for obj_type in self.object_types: - if obj_type not in TYPE_TO_PRIMARY_KEY: - raise ValueError('The object type %s is not supported. ' - 'Possible values are %s' % ( - obj_type, - ', '.join(TYPE_TO_PRIMARY_KEY))) - - self.storage_dbconn = self.config['storage_dbconn'] - - self.producer = KafkaProducer( - bootstrap_servers=config['brokers'], - value_serializer=key_to_kafka, - client_id=config['publisher_id'], - ) - - def _read_storage(self): - """Read storage's objects and generates tuple as object_type, dict of - object. - - Yields: - tuple of object_type, object as dict - - """ - for obj_type in self.object_types: - for obj in fetch(self.storage_dbconn, obj_type): - yield obj_type, obj - - def run(self): - """Reads storage's subscribed object types and send them to the - publisher's reading queue. 
- - """ - for obj_type, obj in self._read_storage(): - topic = '%s.%s' % (self.config['temporary_prefix'], obj_type) - self.producer.send(topic, value=obj) - - -if __name__ == '__main__': - SimpleCheckerProducer().run() diff --git a/swh/journal/cli.py b/swh/journal/cli.py --- a/swh/journal/cli.py +++ b/swh/journal/cli.py @@ -10,8 +10,8 @@ from swh.core import config from swh.storage import get_storage -from swh.journal.publisher import JournalPublisher from swh.journal.replay import StorageReplayer +from swh.journal.backfill import JournalBackfiller CONTEXT_SETTINGS = dict(help_option_names=['-h', '--help']) @@ -44,8 +44,10 @@ conf = config.read(config_file) ctx.ensure_object(dict) - logger = logging.getLogger(__name__) - logger.setLevel(log_level) + logging.basicConfig( + level=log_level, + format='%(asctime)s %(levelname)s %(name)s %(message)s', + ) _log = logging.getLogger('kafka') _log.setLevel(logging.INFO) @@ -54,21 +56,6 @@ ctx.obj['loglevel'] = log_level -@cli.command() -@click.pass_context -def publisher(ctx): - """Manipulate publisher - - """ - conf = ctx.obj['config'] - publisher = JournalPublisher(conf) - try: - while True: - publisher.poll() - except KeyboardInterrupt: - ctx.exit(0) - - @cli.command() @click.option('--max-messages', '-m', default=None, type=int, help='Maximum number of objects to replay. Default is to ' @@ -95,6 +82,27 @@ print('Done.') +@cli.command() +@click.argument('object_type') +@click.option('--start-object', default=None) +@click.option('--end-object', default=None) +@click.option('--dry-run', is_flag=True, default=False) +@click.pass_context +def backfiller(ctx, object_type, start_object, end_object, dry_run): + """Manipulate backfiller + + """ + conf = ctx.obj['config'] + backfiller = JournalBackfiller(conf) + try: + backfiller.run( + object_type=object_type, + start_object=start_object, end_object=end_object, + dry_run=dry_run) + except KeyboardInterrupt: + ctx.exit(0) + + def main(): return cli(auto_envvar_prefix='SWH_JOURNAL') diff --git a/swh/journal/direct_writer.py b/swh/journal/direct_writer.py --- a/swh/journal/direct_writer.py +++ b/swh/journal/direct_writer.py @@ -7,6 +7,8 @@ from kafka import KafkaProducer +from swh.model.hashutil import DEFAULT_ALGORITHMS + from .serializers import key_to_kafka, value_to_kafka logger = logging.getLogger(__name__) @@ -34,6 +36,11 @@ return object_['id'] elif object_type == 'content': return object_['sha1'] # TODO: use a dict of hashes + elif object_type == 'skipped_content': + return { + hash: object_[hash] + for hash in DEFAULT_ALGORITHMS + } elif object_type == 'origin': return {'url': object_['url'], 'type': object_['type']} elif object_type == 'origin_visit': @@ -46,7 +53,6 @@ def _sanitize_object(self, object_type, object_): if object_type == 'origin_visit': - # Compatibility with the publisher's format return { **object_, 'date': str(object_['date']), diff --git a/swh/journal/publisher.py b/swh/journal/publisher.py deleted file mode 100644 --- a/swh/journal/publisher.py +++ /dev/null @@ -1,222 +0,0 @@ -# Copyright (C) 2016-2019 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -from collections import defaultdict -import logging - -from kafka import KafkaProducer, KafkaConsumer - -from swh.storage import get_storage -from swh.storage.algos import snapshot - -from .serializers import kafka_to_key, key_to_kafka - -logger = 
logging.getLogger(__name__) - - -MANDATORY_KEYS = [ - 'brokers', 'temporary_prefix', 'final_prefix', 'consumer_id', - 'publisher_id', 'object_types', 'storage' -] - - -class JournalPublisher: - """The journal publisher is a layer in charge of: - - - consuming messages from topics (1 topic per object_type) - - reify the object ids read from those topics (using the storage) - - producing those reified objects to output topics (1 topic per - object type) - - The main entry point for this class is the 'poll' method. - - """ - def __init__(self, config): - self.config = config - self.check_config(config) - self._prepare_storage(config) - self._prepare_journal(config) - self.max_messages = self.config['max_messages'] - logger.setLevel(logging.DEBUG) - - def check_config(self, config): - """Check the configuration is fine. - - If not raise an error. - - """ - missing_keys = [] - for key in MANDATORY_KEYS: - if not config.get(key): - missing_keys.append(key) - - if missing_keys: - raise ValueError( - 'Configuration error: The following keys must be' - ' provided: %s' % (','.join(missing_keys), )) - - def _prepare_journal(self, config): - """Prepare the consumer and subscriber instances for the publisher to - actually be able to discuss with the journal. - - """ - # yes, the temporary topics contain values that are actually _keys_ - self.consumer = KafkaConsumer( - bootstrap_servers=config['brokers'], - value_deserializer=kafka_to_key, - auto_offset_reset='earliest', - enable_auto_commit=False, - group_id=config['consumer_id'], - ) - self.producer = KafkaProducer( - bootstrap_servers=config['brokers'], - key_serializer=key_to_kafka, - value_serializer=key_to_kafka, - client_id=config['publisher_id'], - ) - - logger.info('Subscribing to object types event: %s' % ( - config['object_types'], )) - self.consumer.subscribe( - topics=['%s.%s' % (config['temporary_prefix'], object_type) - for object_type in config['object_types']], - ) - - def _prepare_storage(self, config): - """Prepare the storage instance needed for the publisher to be able to - discuss with the storage to retrieve the objects. - - """ - self.storage = get_storage(**config['storage']) - - def poll(self, max_messages=None): - """Process a batch of messages from the consumer's topics. Use the - storage to reify those ids. Produces back those reified - objects to the production topics. - - This method polls a given amount of message then stops. - The number of messages to consume is either provided or - configured as fallback. - - The following method is expected to be called from within a - loop. - - """ - messages = defaultdict(list) - if max_messages is None: - max_messages = self.max_messages - - for num, message in enumerate(self.consumer): - object_type = message.topic.split('.')[-1] - logger.debug('num: %s, object_type: %s, message: %s' % ( - num+1, object_type, message)) - messages[object_type].append(message.value) - if num + 1 >= self.max_messages: - break - - logger.debug('number of messages: %s', num+1) - - new_objects = self.process_objects(messages) - self.produce_messages(new_objects) - self.consumer.commit() - - def process_objects(self, messages): - """Given a dict of messages {object type: [object id]}, reify those - ids to swh object from the storage and returns a - corresponding dict. - - Args: - messages (dict): Dict of {object_type: [id-as-bytes]} - - Returns: - Dict of {object_type: [tuple]}. - - object_type (str): content, revision, release - tuple (bytes, dict): object id as bytes, object as swh dict. 
- - """ - processors = { - 'content': self.process_contents, - 'revision': self.process_revisions, - 'release': self.process_releases, - 'snapshot': self.process_snapshots, - 'origin': self.process_origins, - 'origin_visit': self.process_origin_visits, - } - - return { - key: processors[key](value) - for key, value in messages.items() - } - - def produce_messages(self, messages): - """Produce new swh object to the producer topic. - - Args: - messages ([dict]): Dict of {object_type: [tuple]}. - - object_type (str): content, revision, release - tuple (bytes, dict): object id as bytes, object as swh dict. - - """ - for object_type, objects in messages.items(): - topic = '%s.%s' % (self.config['final_prefix'], object_type) - for key, object in objects: - logger.debug('topic: %s, key: %s, value: %s' % ( - topic, key, object)) - self.producer.send(topic, key=key, value=object) - - self.producer.flush() - - def process_contents(self, content_objs): - logger.debug('contents: %s' % content_objs) - metadata = self.storage.content_get_metadata( - (c[b'sha1'] for c in content_objs)) - return [(content['sha1'], content) for content in metadata] - - def process_revisions(self, revision_objs): - logger.debug('revisions: %s' % revision_objs) - metadata = self.storage.revision_get((r[b'id'] for r in revision_objs)) - return [(revision['id'], revision) - for revision in metadata if revision] - - def process_releases(self, release_objs): - logger.debug('releases: %s' % release_objs) - metadata = self.storage.release_get((r[b'id'] for r in release_objs)) - return [(release['id'], release) for release in metadata] - - def process_origins(self, origin_objs): - logger.debug('origins: %s' % origin_objs) - r = [] - for o in origin_objs: - origin = {'url': o[b'url'], 'type': o[b'type']} - r.append((origin, origin)) - return r - - def process_origin_visits(self, origin_visits): - logger.debug('origin_visits: %s' % origin_visits) - metadata = [] - for ov in origin_visits: - origin_visit = self.storage.origin_visit_get_by( - ov[b'origin'], ov[b'visit']) - if origin_visit: - pk = ov[b'origin'], ov[b'visit'] - origin_visit['date'] = str(origin_visit['date']) - metadata.append((pk, origin_visit)) - return metadata - - def process_snapshots(self, snapshot_objs): - logger.debug('snapshots: %s' % snapshot_objs) - metadata = [] - for snap in snapshot_objs: - full_obj = snapshot.snapshot_get_all_branches( - self.storage, snap[b'id']) - metadata.append((full_obj['id'], full_obj)) - - return metadata - - -if __name__ == '__main__': - print('Please use the "swh-journal publisher run" command') diff --git a/swh/journal/replay.py b/swh/journal/replay.py --- a/swh/journal/replay.py +++ b/swh/journal/replay.py @@ -7,6 +7,8 @@ from kafka import KafkaConsumer +from swh.storage import HashCollision + from .serializers import kafka_to_value logger = logging.getLogger(__name__) @@ -60,15 +62,16 @@ if object_type in ('content', 'directory', 'revision', 'release', 'snapshot', 'origin'): if object_type == 'content': - method = storage.content_add_metadata + try: + storage.content_add_metadata([object_]) + except HashCollision as e: + logger.error('Hash collision: %s', e.args) else: method = getattr(storage, object_type + '_add') - method([object_]) + method([object_]) elif object_type == 'origin_visit': - origin_id = storage.origin_add_one(object_.pop('origin')) - visit = storage.origin_visit_add( - origin=origin_id, date=object_.pop('date')) - storage.origin_visit_update( - origin_id, visit['visit'], **object_) + 
storage.origin_visit_upsert([{ + **object_, + 'origin': storage.origin_add_one(object_['origin'])}]) else: assert False diff --git a/swh/journal/tests/conftest.py b/swh/journal/tests/conftest.py --- a/swh/journal/tests/conftest.py +++ b/swh/journal/tests/conftest.py @@ -9,7 +9,7 @@ import random import string -from kafka import KafkaProducer, KafkaConsumer +from kafka import KafkaConsumer from subprocess import Popen from typing import Tuple, Dict @@ -18,10 +18,9 @@ make_zookeeper_process, make_kafka_server, constants ) -from swh.journal.publisher import JournalPublisher from swh.model.hashutil import hash_to_bytes -from swh.journal.serializers import kafka_to_key, key_to_kafka, kafka_to_value +from swh.journal.serializers import kafka_to_key, kafka_to_value CONTENTS = [ @@ -134,26 +133,6 @@ } -class JournalPublisherTest(JournalPublisher): - """A journal publisher which override the default configuration - parsing setup. - - """ - def _prepare_storage(self, config): - super()._prepare_storage(config) - self.storage.content_add({'data': b'42', **c} for c in CONTENTS) - self.storage.revision_add(REVISIONS) - self.storage.release_add(RELEASES) - origins = self.storage.origin_add(ORIGINS) - origin_visits = [] - for i, ov in enumerate(ORIGIN_VISITS): - origin_id = origins[i]['id'] - ov = self.storage.origin_visit_add(origin_id, ov['date']) - origin_visits.append(ov) - self.origins = origins - self.origin_visits = origin_visits - - KAFKA_ROOT = os.environ.get('SWH_KAFKA_ROOT') KAFKA_ROOT = KAFKA_ROOT if KAFKA_ROOT else os.path.dirname(__file__) + '/kafka' if not os.path.exists(KAFKA_ROOT): @@ -186,8 +165,7 @@ TEST_CONFIG = { 'temporary_prefix': 'swh.tmp_journal.new', 'final_prefix': 'swh.journal.objects', - 'consumer_id': 'swh.journal.publisher', - 'publisher_id': 'swh.journal.publisher', + 'consumer_id': 'swh.journal.consumer', 'object_types': OBJECT_TYPE_KEYS.keys(), 'max_messages': 1, # will read 1 message and stops 'storage': {'cls': 'memory', 'args': {}}, @@ -197,7 +175,7 @@ @pytest.fixture def test_config(kafka_server: Tuple[Popen, int], kafka_prefix: str): - """Test configuration needed for publisher/producer/consumer + """Test configuration needed for producer/consumer """ _, port = kafka_server @@ -210,26 +188,8 @@ @pytest.fixture -def producer_to_publisher( - kafka_server: Tuple[Popen, int], - test_config: Dict, -) -> KafkaProducer: # noqa - """Producer to send message to the publisher's consumer. - - """ - _, port = kafka_server - producer = KafkaProducer( - bootstrap_servers='localhost:{}'.format(port), - key_serializer=key_to_kafka, - value_serializer=key_to_kafka, - client_id=test_config['consumer_id'], - ) - return producer - - -@pytest.fixture -def consumer_from_publisher(kafka_server: Tuple[Popen, int], - test_config: Dict) -> KafkaConsumer: +def consumer( + kafka_server: Tuple[Popen, int], test_config: Dict) -> KafkaConsumer: """Get a connected Kafka consumer. """ @@ -255,16 +215,3 @@ consumer.seek_to_beginning() return consumer - - -@pytest.fixture -def publisher(kafka_server: Tuple[Popen, int], - test_config: Dict) -> JournalPublisher: - """Test Publisher factory. We cannot use a fixture here as we need to - modify the sample. 
- - """ - # consumer and producer of the publisher needs to discuss with the - # right instance - publisher = JournalPublisherTest(test_config) - return publisher diff --git a/swh/journal/tests/test_backfill.py b/swh/journal/tests/test_backfill.py new file mode 100644 --- /dev/null +++ b/swh/journal/tests/test_backfill.py @@ -0,0 +1,123 @@ +# Copyright (C) 2019 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import pytest + +from swh.journal.backfill import ( + JournalBackfiller, compute_query, PARTITION_KEY +) + + +TEST_CONFIG = { + 'brokers': ['localhost'], + 'final_prefix': 'swh.tmp_journal.new', + 'client_id': 'swh.journal.client.test', + 'storage_dbconn': 'service=swh-dev', +} + + +def test_config_ko_missing_mandatory_key(): + """Missing configuration key will make the initialization fail + + """ + for key in TEST_CONFIG.keys(): + config = TEST_CONFIG.copy() + config.pop(key) + + with pytest.raises(ValueError) as e: + JournalBackfiller(config) + + error = ('Configuration error: The following keys must be' + ' provided: %s' % (','.join([key]), )) + assert e.value.args[0] == error + + +def test_config_ko_unknown_object_type(): + """Parse arguments will fail if the object type is unknown + + """ + backfiller = JournalBackfiller(TEST_CONFIG) + with pytest.raises(ValueError) as e: + backfiller.parse_arguments('unknown-object-type', 1, 2) + + error = ('Object type unknown-object-type is not supported. ' + 'The only possible values are %s' % ( + ', '.join(PARTITION_KEY))) + assert e.value.args[0] == error + + +def test_compute_query_content(): + query, where_args, column_aliases = compute_query( + 'content', '\x000000', '\x000001') + + assert where_args == ['\x000000', '\x000001'] + + assert column_aliases == [ + 'sha1', 'sha1_git', 'sha256', 'blake2s256', 'length', 'status', + 'ctime' + ] + + assert query == ''' +select sha1,sha1_git,sha256,blake2s256,length,status,ctime +from content + +where (sha1) >= %s and (sha1) < %s + ''' + + +def test_compute_query_skipped_content(): + query, where_args, column_aliases = compute_query( + 'skipped_content', None, None) + + assert where_args == [] + + assert column_aliases == [ + 'sha1', 'sha1_git', 'sha256', 'blake2s256', 'length', 'ctime', + 'status', 'reason', + ] + + assert query == ''' +select sha1,sha1_git,sha256,blake2s256,length,ctime,status,reason +from skipped_content + + + ''' + + +def test_compute_query_origin_visit(): + query, where_args, column_aliases = compute_query( + 'origin_visit', 1, 10) + + assert where_args == [1, 10] + + assert column_aliases == [ + 'type', 'url', 'date', 'snapshot', 'status' + ] + + assert query == ''' +select type,url,date,snapshot,status +from origin_visit +left join origin on origin_visit.origin=origin.id +where (origin_visit.origin) >= %s and (origin_visit.origin) < %s + ''' + + +def test_compute_query_release(): + query, where_args, column_aliases = compute_query( + 'release', '\x000002', '\x000003') + + assert where_args == ['\x000002', '\x000003'] + + assert column_aliases == [ + 'id', 'date', 'date_offset', 'comment', 'name', 'synthetic', + 'date_neg_utc_offset', 'target', 'target_type', 'author_id', + 'author_name', 'author_email', 'author_fullname'] + + assert query == ''' +select release.id as id,date,date_offset,comment,release.name as name,synthetic,date_neg_utc_offset,target,target_type,a.id as author_id,a.name as 
author_name,a.email as author_email,a.fullname as author_fullname +from release +left join person a on release.author=a.id +where (release.id) >= %s and (release.id) < %s + ''' # noqa diff --git a/swh/journal/tests/test_direct_writer.py b/swh/journal/tests/test_direct_writer.py --- a/swh/journal/tests/test_direct_writer.py +++ b/swh/journal/tests/test_direct_writer.py @@ -48,7 +48,7 @@ def test_direct_writer( kafka_prefix: str, kafka_server: Tuple[Popen, int], - consumer_from_publisher: KafkaConsumer): + consumer: KafkaConsumer): kafka_prefix += '.swh.journal.objects' config = { @@ -67,13 +67,13 @@ object_ = {**object_, 'ctime': datetime.datetime.now()} writer.write_addition(object_type, object_) - assert_written(consumer_from_publisher, kafka_prefix) + assert_written(consumer, kafka_prefix) def test_storage_direct_writer( kafka_prefix: str, kafka_server: Tuple[Popen, int], - consumer_from_publisher: KafkaConsumer): + consumer: KafkaConsumer): kafka_prefix += '.swh.journal.objects' config = { @@ -102,4 +102,4 @@ else: assert False, object_type - assert_written(consumer_from_publisher, kafka_prefix) + assert_written(consumer, kafka_prefix) diff --git a/swh/journal/tests/test_publisher_kafka.py b/swh/journal/tests/test_publisher_kafka.py deleted file mode 100644 --- a/swh/journal/tests/test_publisher_kafka.py +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright (C) 2018-2019 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - - -from kafka import KafkaConsumer, KafkaProducer -from subprocess import Popen -from typing import Tuple - -from swh.journal.serializers import value_to_kafka, kafka_to_value -from swh.journal.publisher import JournalPublisher - -from .conftest import OBJECT_TYPE_KEYS - - -def assert_publish_ok(publisher: JournalPublisher, - consumer_from_publisher: KafkaConsumer, - producer_to_publisher: KafkaProducer, - test_config: dict, - object_type: str): - """Assert that publishing object in the publisher is reified and - published in output topics. - - Args: - publisher (JournalPublisher): publisher to read and write data - consumer_from_publisher (KafkaConsumer): To read data from the - publisher - producer_to_publisher (KafkaProducer): To send data to the publisher - object_type (str): Object type to look for (e.g content, revision, - etc...) 
- - """ - # object type's id label key - object_key_id, expected_objects = OBJECT_TYPE_KEYS[object_type] - # objects to send to the publisher - if object_key_id: - objects = [{object_key_id: c[object_key_id]} - for c in expected_objects] - else: - # TODO: add support for origin and origin_visit - return - - # send message to the publisher - for obj in objects: - producer_to_publisher.send( - '%s.%s' % (test_config['temporary_prefix'], object_type), - obj - ) - - nb_messages = len(objects) - - for _ in range(nb_messages): - publisher.poll(max_messages=1) - - # then (client reads from the messages from output topic) - expected_topic = '%s.%s' % (test_config['final_prefix'], object_type) - expected_msgs = [ - ( - object_[object_key_id], - kafka_to_value(value_to_kafka(object_)) - ) - for object_ in expected_objects] - - msgs = list(consumer_from_publisher) - assert all(msg.topic == expected_topic for msg in msgs) - assert [(msg.key, msg.value) for msg in msgs] == expected_msgs - - -def test_publish( - publisher: JournalPublisher, - kafka_server: Tuple[Popen, int], - test_config: dict, - consumer_from_publisher: KafkaConsumer, - producer_to_publisher: KafkaProducer): - """ - Reading from and writing to the journal publisher should work (contents) - - Args: - journal_publisher (JournalPublisher): publisher to read and write data - consumer_from_publisher (KafkaConsumer): To read data from publisher - producer_to_publisher (KafkaProducer): To send data to publisher - - """ - # retrieve the object types we want to test - object_types = OBJECT_TYPE_KEYS.keys() - # Now for each object type, we'll send data to the publisher and - # check that data is indeed fetched and reified in the publisher's - # output topics - for object_type in object_types: - assert_publish_ok( - publisher, consumer_from_publisher, producer_to_publisher, - test_config, object_type) diff --git a/swh/journal/tests/test_publisher_no_kafka.py b/swh/journal/tests/test_publisher_no_kafka.py deleted file mode 100644 --- a/swh/journal/tests/test_publisher_no_kafka.py +++ /dev/null @@ -1,151 +0,0 @@ -# Copyright (C) 2018-2019 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import pytest -import unittest - -from .conftest import ( - JournalPublisherTest, TEST_CONFIG, - CONTENTS, REVISIONS, RELEASES, ORIGINS -) -from swh.journal.publisher import MANDATORY_KEYS - - -class JournalPublisherNoKafkaInMemoryStorage(JournalPublisherTest): - """A journal publisher with: - - no kafka dependency - - in-memory storage - - """ - def check_config(self, config): - """No need to check the configuration here as we do not use kafka - - """ - pass - - def _prepare_journal(self, config): - """No journal for now - - """ - pass - - -class TestPublisherNoKafka(unittest.TestCase): - """This tests only the part not using any kafka instance - - """ - def setUp(self): - self.publisher = JournalPublisherNoKafkaInMemoryStorage(TEST_CONFIG) - self.contents = [{b'sha1': c['sha1']} for c in CONTENTS] - self.revisions = [{b'id': c['id']} for c in REVISIONS] - self.releases = [{b'id': c['id']} for c in RELEASES] - # those needs id generation from the storage - # so initialization is different than other entities - self.origins = [{b'url': o['url'], - b'type': o['type']} - for o in self.publisher.origins] - self.origin_visits = [{b'origin': ov['origin'], - b'visit': ov['visit']} - for ov in 
self.publisher.origin_visits] - # full objects - storage = self.publisher.storage - ovs = [] - for ov in self.origin_visits: - _ov = storage.origin_visit_get_by( - ov[b'origin'], ov[b'visit']) - _ov['date'] = str(_ov['date']) - ovs.append(_ov) - self.expected_origin_visits = ovs - - def test_process_contents(self): - actual_contents = self.publisher.process_contents(self.contents) - expected_contents = [(c['sha1'], c) for c in CONTENTS] - self.assertEqual(actual_contents, expected_contents) - - def test_process_revisions(self): - actual_revisions = self.publisher.process_revisions(self.revisions) - expected_revisions = [(c['id'], c) for c in REVISIONS] - self.assertEqual(actual_revisions, expected_revisions) - - def test_process_releases(self): - actual_releases = self.publisher.process_releases(self.releases) - expected_releases = [(c['id'], c) for c in RELEASES] - self.assertEqual(actual_releases, expected_releases) - - def test_process_origins(self): - actual_origins = self.publisher.process_origins(self.origins) - expected_origins = [({'url': o[b'url'], 'type': o[b'type']}, - {'url': o[b'url'], 'type': o[b'type']}) - for o in self.origins] - self.assertEqual(actual_origins, expected_origins) - - def test_process_origin_visits(self): - actual_ovs = self.publisher.process_origin_visits(self.origin_visits) - expected_ovs = [((ov['origin'], ov['visit']), ov) - for ov in self.expected_origin_visits] - self.assertEqual(actual_ovs, expected_ovs) - - def test_process_objects(self): - messages = { - 'content': self.contents, - 'revision': self.revisions, - 'release': self.releases, - 'origin': self.origins, - 'origin_visit': self.origin_visits, - } - - actual_objects = self.publisher.process_objects(messages) - - expected_contents = [(c['sha1'], c) for c in CONTENTS] - expected_revisions = [(c['id'], c) for c in REVISIONS] - expected_releases = [(c['id'], c) for c in RELEASES] - expected_origins = [(o, o) for o in ORIGINS] - expected_ovs = [((ov['origin'], ov['visit']), ov) - for ov in self.expected_origin_visits] - expected_objects = { - 'content': expected_contents, - 'revision': expected_revisions, - 'release': expected_releases, - 'origin': expected_origins, - 'origin_visit': expected_ovs, - } - - self.assertEqual(actual_objects, expected_objects) - - -class JournalPublisherCheckTest(JournalPublisherTest): - """A journal publisher with: - - no kafka dependency - - in-memory storage - - """ - def _prepare_journal(self, config): - """No journal for now - - """ - pass - - -def test_check_config_ok(test_config): - """Instantiate a publisher with the right config is fine - - """ - publisher = JournalPublisherCheckTest(test_config) - assert publisher is not None - - -def test_check_config_ko(test_config): - """Instantiate a publisher with the wrong config should raise - - """ - for k in MANDATORY_KEYS: - conf = test_config.copy() - conf.pop(k) - with pytest.raises(ValueError) as e: - JournalPublisherCheckTest(conf) - - error = ('Configuration error: The following keys must be' - ' provided: %s' % (','.join([k]), )) - assert e.value.args[0] == error diff --git a/swh/journal/tests/test_replay.py b/swh/journal/tests/test_replay.py --- a/swh/journal/tests/test_replay.py +++ b/swh/journal/tests/test_replay.py @@ -38,6 +38,7 @@ # Fill Kafka nb_sent = 0 + nb_visits = 0 for (object_type, (_, objects)) in OBJECT_TYPE_KEYS.items(): topic = kafka_prefix + '.' 
+ object_type for object_ in objects: @@ -45,6 +46,9 @@ object_ = object_.copy() if object_type == 'content': object_['ctime'] = now + elif object_type == 'origin_visit': + nb_visits += 1 + object_['visit'] = nb_visits producer.send(topic, key=key, value=object_) nb_sent += 1 diff --git a/swh/journal/tests/test_write_replay.py b/swh/journal/tests/test_write_replay.py --- a/swh/journal/tests/test_write_replay.py +++ b/swh/journal/tests/test_write_replay.py @@ -6,12 +6,11 @@ from collections import namedtuple from hypothesis import given, settings, HealthCheck -from hypothesis.strategies import lists, one_of, composite +from hypothesis.strategies import lists -from swh.model.hashutil import MultiHash +from swh.model.hypothesis_strategies import object_dicts from swh.storage.in_memory import Storage -from swh.storage.tests.algos.test_snapshot import snapshots, origins -from swh.storage.tests.generate_data_test import gen_raw_content +from swh.storage import HashCollision from swh.journal.serializers import ( key_to_kafka, kafka_to_key, value_to_kafka, kafka_to_value) @@ -31,36 +30,7 @@ self._object_types = object_types -@composite -def contents(draw): - """Generate valid and consistent content. - - Context: Test purposes - - Args: - **draw**: Used by hypothesis to generate data - - Returns: - dict representing a content. - - """ - raw_content = draw(gen_raw_content()) - return { - 'data': raw_content, - 'length': len(raw_content), - 'status': 'visible', - **MultiHash.from_data(raw_content).digest() - } - - -objects = lists(one_of( - origins().map(lambda x: ('origin', x)), - snapshots().map(lambda x: ('snapshot', x)), - contents().map(lambda x: ('content', x)), -)) - - -@given(objects) +@given(lists(object_dicts())) @settings(suppress_health_check=[HealthCheck.too_slow]) def test_write_replay_same_order(objects): queue = [] @@ -78,14 +48,28 @@ storage1.journal_writer.send = send for (obj_type, obj) in objects: - method = getattr(storage1, obj_type + '_add') - method([obj]) + obj = obj.copy() + if obj_type == 'origin_visit': + origin_id = storage1.origin_add_one(obj.pop('origin')) + if 'visit' in obj: + del obj['visit'] + storage1.origin_visit_add(origin_id, **obj) + else: + method = getattr(storage1, obj_type + '_add') + try: + method([obj]) + except HashCollision: + pass storage2 = Storage() replayer = MockedStorageReplayer() replayer.poll = poll replayer.fill(storage2) - for attr in ('_contents', '_directories', '_revisions', '_releases', - '_snapshots', '_origin_visits', '_origins'): - assert getattr(storage1, attr) == getattr(storage2, attr), attr + for attr_name in ('_contents', '_directories', '_revisions', '_releases', + '_snapshots', '_origin_visits', '_origins'): + assert getattr(storage1, attr_name) == getattr(storage2, attr_name), \ + attr_name + + +# TODO: add test for hash collision
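
The sketch below is editorial commentary, not part of the patch. It illustrates how the new backfiller carves the object-id space into ranges: object_to_offset() maps a hex identifier to the index of its 2^numbits-wide bucket, byte_ranges() enumerates the byte prefixes bounding each bucket, and compute_query() turns one such pair into the paginated SQL sent to the storage database. It assumes the patch is applied so that swh.journal.backfill is importable; the sha1 value is an arbitrary example, not one from the test data.

# Illustrative sketch only; requires this diff to be applied.
from swh.journal.backfill import byte_ranges, compute_query, object_to_offset

# Index of the 2^24-wide bucket that contains this (arbitrary) sha1:
sha1 = 'da39a3ee5e6b4b0d3255bfef95601890afd80709'
assert object_to_offset(sha1, 24) == 0xda39a3

# Byte prefixes bounding each bucket between two hex ids (inclusive):
ranges = list(byte_ranges(24, start_object='da39a3', end_object='da39a4'))
assert ranges == [(b'\xda\x39\xa3', b'\xda\x39\xa4'),
                  (b'\xda\x39\xa4', b'\xda\x39\xa5')]

# One such pair becomes the paginated query run against the storage DB,
# matching the expectations in test_compute_query_content above:
query, where_args, column_aliases = compute_query('content', *ranges[0])
assert '(sha1) >= %s and (sha1) < %s' in query
assert where_args == [b'\xda\x39\xa3', b'\xda\x39\xa4']
assert column_aliases[0] == 'sha1'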