diff --git a/swh/journal/publisher.py b/swh/journal/publisher.py
index 1c3b935..48a838a 100644
--- a/swh/journal/publisher.py
+++ b/swh/journal/publisher.py
@@ -1,195 +1,199 @@
 # Copyright (C) 2016-2018 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 from collections import defaultdict
 import logging

 from kafka import KafkaProducer, KafkaConsumer

 from swh.core.config import SWHConfig
 from swh.storage import get_storage
 from swh.storage.algos import snapshot

 from .serializers import kafka_to_key, key_to_kafka


 class SWHJournalPublisher(SWHConfig):
     """The journal publisher is a layer in charge of:

     - consuming messages from topics (1 topic per object_type)
     - reify the object ids read from those topics (using the storage)
     - producing those reified objects to output topics (1 topic per
       object type)

     The main entry point for this class is the 'poll' method.

     """
     DEFAULT_CONFIG = {
         'brokers': ('list[str]', ['getty.internal.softwareheritage.org']),
         'temporary_prefix': ('str', 'swh.tmp_journal.new'),
         'final_prefix': ('str', 'swh.journal.objects'),
         'consumer_id': ('str', 'swh.journal.publisher'),
         'publisher_id': ('str', 'swh.journal.publisher'),
         'object_types': ('list[str]', ['content', 'revision', 'release']),
         'storage': ('dict', {
             'cls': 'remote',
             'args': {
                 'url': 'http://localhost:5002/',
             }
         }),
         'max_messages': ('int', 10000),
     }

     CONFIG_BASE_FILENAME = 'journal/publisher'

     def __init__(self, extra_configuration=None):
         self.config = config = self.parse_config_file()
         if extra_configuration:
             config.update(extra_configuration)

         self._prepare_storage(config)
         self._prepare_journal(config)

         self.max_messages = self.config['max_messages']

     def _prepare_journal(self, config):
         """Prepare the consumer and subscriber instances for the publisher to
            actually be able to discuss with the journal.

         """
         # yes, the temporary topics contain values that are actually _keys_
         self.consumer = KafkaConsumer(
             bootstrap_servers=config['brokers'],
             value_deserializer=kafka_to_key,
             auto_offset_reset='earliest',
             enable_auto_commit=False,
             group_id=config['consumer_id'],
         )
         self.producer = KafkaProducer(
             bootstrap_servers=config['brokers'],
             key_serializer=key_to_kafka,
             value_serializer=key_to_kafka,
             client_id=config['publisher_id'],
         )

         self.consumer.subscribe(
             topics=['%s.%s' % (config['temporary_prefix'], object_type)
                     for object_type in config['object_types']],
         )

     def _prepare_storage(self, config):
         """Prepare the storage instance needed for the publisher to be able to
            discuss with the storage to retrieve the objects.

         """
         self.storage = get_storage(**config['storage'])

     def poll(self, max_messages=None):
         """Process a batch of messages from the consumer's topics. Use the
            storage to reify those ids. Produces back those reified
            objects to the production topics.

            This method polls a given amount of message then stops.
            The number of messages to consume is either provided or
            configured as fallback.

            The following method is expected to be called from within a
            loop.

""" messages = defaultdict(list) if max_messages is None: max_messages = self.max_messages for num, message in enumerate(self.consumer): object_type = message.topic.split('.')[-1] messages[object_type].append(message.value) if num >= max_messages: break new_objects = self.process_objects(messages) self.produce_messages(new_objects) self.consumer.commit() def process_objects(self, messages): """Given a dict of messages {object type: [object id]}, reify those ids to swh object from the storage and returns a corresponding dict. Args: messages (dict): Dict of {object_type: [id-as-bytes]} Returns: Dict of {object_type: [tuple]}. object_type (str): content, revision, release tuple (bytes, dict): object id as bytes, object as swh dict. """ processors = { 'content': self.process_contents, 'revision': self.process_revisions, 'release': self.process_releases, 'snapshot': self.process_snapshots, + 'origin': self.process_origins, } return { key: processors[key](value) for key, value in messages.items() } def produce_messages(self, messages): """Produce new swh object to the producer topic. Args: messages ([dict]): Dict of {object_type: [tuple]}. object_type (str): content, revision, release tuple (bytes, dict): object id as bytes, object as swh dict. """ for object_type, objects in messages.items(): topic = '%s.%s' % (self.config['final_prefix'], object_type) for key, object in objects: self.producer.send(topic, key=key, value=object) self.producer.flush() def process_contents(self, content_objs): metadata = self.storage.content_get_metadata( (c[b'sha1'] for c in content_objs)) return [(content['sha1'], content) for content in metadata] def process_revisions(self, revision_objs): metadata = self.storage.revision_get((r[b'id'] for r in revision_objs)) return [(revision['id'], revision) for revision in metadata] def process_releases(self, release_objs): metadata = self.storage.release_get((r[b'id'] for r in release_objs)) return [(release['id'], release) for release in metadata] + def process_origins(self, origin_objs): + return origin_objs + def process_snapshots(self, snapshot_objs): metadata = [] for snap in snapshot_objs: full_obj = snapshot.snapshot_get_all_branches( self.storage, snap[b'id']) metadata.append((full_obj['id'], full_obj)) return metadata if __name__ == '__main__': logging.basicConfig( level=logging.INFO, format='%(asctime)s %(process)d %(levelname)s %(message)s' ) publisher = SWHJournalPublisher() while True: publisher.poll() diff --git a/swh/journal/tests/test_publisher.py b/swh/journal/tests/test_publisher.py index 5d2969c..b2f1b06 100644 --- a/swh/journal/tests/test_publisher.py +++ b/swh/journal/tests/test_publisher.py @@ -1,137 +1,159 @@ # Copyright (C) 2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import unittest from swh.model.hashutil import hash_to_bytes from swh.journal.publisher import SWHJournalPublisher from swh.storage.in_memory import Storage CONTENTS = [ { 'length': 3, 'sha1': hash_to_bytes( '34973274ccef6ab4dfaaf86599792fa9c3fe4689'), 'sha1_git': b'foo', 'blake2s256': b'bar', 'sha256': b'baz', 'status': 'visible', }, ] REVISIONS = [ { 'id': hash_to_bytes('7026b7c1a2af56521e951c01ed20f255fa054238'), 'message': b'hello', 'date': { 'timestamp': { 'seconds': 1234567891, 'microseconds': 0, }, 'offset': 120, 'negative_utc': None, }, 'committer': None, 'committer_date': None, }, { 'id': 
             '368a48fe15b7db2383775f97c6b247011b3f14f4'),
         'message': b'hello again',
         'date': {
             'timestamp': {
                 'seconds': 1234567892,
                 'microseconds': 0,
             },
             'offset': 120,
             'negative_utc': None,
         },
         'committer': None,
         'committer_date': None,
     },
 ]

 RELEASES = [
     {
         'id': hash_to_bytes('d81cc0710eb6cf9efd5b920a8453e1e07157b6cd'),
         'name': b'v0.0.1',
         'date': {
             'timestamp': {
                 'seconds': 1234567890,
                 'microseconds': 0,
             },
             'offset': 120,
             'negative_utc': None,
         },
     },
 ]

+ORIGINS = [
+    {
+        'id': 1,
+        'url': 'https://somewhere.org/den/fox',
+        'type': 'git',
+    },
+    {
+        'id': 2,
+        'url': 'https://overtherainbow.org/fox/den',
+        'type': 'svn',
+    }
+]
+

 class JournalPublisherTest(SWHJournalPublisher):
     def parse_config_file(self):
         return {
             'brokers': ['localhost'],
             'temporary_prefix': 'swh.tmp_journal.new',
             'final_prefix': 'swh.journal.objects',
             'consumer_id': 'swh.journal.test.publisher',
             'publisher_id': 'swh.journal.test.publisher',
             'object_types': ['content'],
             'max_messages': 3,
         }

     def _prepare_storage(self, config):
         self.storage = Storage()
         self.storage.content_add({'data': b'42', **c} for c in CONTENTS)
         self.storage.revision_add(REVISIONS)
         self.storage.release_add(RELEASES)

     def _prepare_journal(self, config):
         """No journal for now

         """
         pass


 class TestPublisher(unittest.TestCase):
     def setUp(self):
         self.publisher = JournalPublisherTest()
         self.contents = [{b'sha1': c['sha1']} for c in CONTENTS]
         self.revisions = [{b'id': c['id']} for c in REVISIONS]
         self.releases = [{b'id': c['id']} for c in RELEASES]
+        self.origins = ORIGINS

     def test_process_contents(self):
         actual_contents = self.publisher.process_contents(self.contents)
         expected_contents = [(c['sha1'], c) for c in CONTENTS]
         self.assertEqual(actual_contents, expected_contents)

     def test_process_revisions(self):
         actual_revisions = self.publisher.process_revisions(self.revisions)
         expected_revisions = [(c['id'], c) for c in REVISIONS]
         self.assertEqual(actual_revisions, expected_revisions)

     def test_process_releases(self):
         actual_releases = self.publisher.process_releases(self.releases)
         expected_releases = [(c['id'], c) for c in RELEASES]
         self.assertEqual(actual_releases, expected_releases)

+    def test_process_origins(self):
+        actual_origins = self.publisher.process_origins(self.origins)
+        expected_origins = self.origins
+        self.assertEqual(actual_origins, expected_origins)
+
     def test_process_objects(self):
         messages = {
             'content': self.contents,
             'revision': self.revisions,
             'release': self.releases,
+            'origin': self.origins,
         }
         actual_objects = self.publisher.process_objects(messages)

         expected_contents = [(c['sha1'], c) for c in CONTENTS]
         expected_revisions = [(c['id'], c) for c in REVISIONS]
         expected_releases = [(c['id'], c) for c in RELEASES]
+        expected_origins = ORIGINS

         expected_objects = {
             'content': expected_contents,
             'revision': expected_revisions,
             'release': expected_releases,
+            'origin': expected_origins,
         }
         self.assertEqual(actual_objects, expected_objects)
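
Note on the new origin path: unlike contents, revisions and releases, the origin messages on the temporary topics already carry the full object (id, url, type) rather than a bare id, so process_origins forwards them unchanged instead of reifying them through the storage. The following sketch is illustrative only and not part of the patch; DemoPublisher and its stubbed-out configuration are hypothetical, mirroring how JournalPublisherTest above bypasses Kafka and the config file:

    from swh.journal.publisher import SWHJournalPublisher

    ORIGINS = [
        {'id': 1, 'url': 'https://somewhere.org/den/fox', 'type': 'git'},
    ]

    class DemoPublisher(SWHJournalPublisher):
        # Hypothetical stub: bypass the config file, storage and journal,
        # the same way JournalPublisherTest does in the tests above.
        def parse_config_file(self):
            return {'max_messages': 1}

        def _prepare_storage(self, config):
            pass  # origins are passed through; no storage lookup happens

        def _prepare_journal(self, config):
            pass  # no Kafka consumer/producer needed for this sketch

    publisher = DemoPublisher()
    # process_objects dispatches on the object type derived from the topic
    # name; for 'origin' the messages come back as-is (no reification).
    print(publisher.process_objects({'origin': ORIGINS}))
    # {'origin': [{'id': 1, 'url': 'https://somewhere.org/den/fox', 'type': 'git'}]}

The sketch deliberately stops at process_objects: produce_messages unpacks each item as a (key, object) pair, which these pass-through origin dicts are not, so as written the origins would still need to be paired with a key before they could be produced to the output topic.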