Skip to content

Commit 6a3b1ae

Browse files
Add refresh_cycles parameter to from_kafka_batched
1 parent f9f0c60 commit 6a3b1ae

2 files changed

Lines changed: 82 additions & 6 deletions

File tree

streamz/sources.py

Lines changed: 30 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -453,7 +453,8 @@ def _close_consumer(self):
453453
class FromKafkaBatched(Stream):
454454
"""Base class for both local and cluster-based batched kafka processing"""
455455
def __init__(self, topic, consumer_params, poll_interval='1s',
456-
npartitions=None, max_batch_size=10000, keys=False,
456+
npartitions=None, refresh_cycles=None,
457+
max_batch_size=10000, keys=False,
457458
engine=None, **kwargs):
458459
self.consumer_params = consumer_params
459460
# Override the auto-commit config to enforce custom streamz checkpointing
@@ -462,6 +463,7 @@ def __init__(self, topic, consumer_params, poll_interval='1s',
462463
consumer_params['auto.offset.reset'] = 'latest'
463464
self.topic = topic
464465
self.npartitions = npartitions
466+
self.refresh_cycles = refresh_cycles
465467
if self.npartitions is not None and self.npartitions <= 0:
466468
raise ValueError("Number of Kafka topic partitions must be > 0.")
467469
self.poll_interval = convert_interval(poll_interval)
@@ -509,8 +511,21 @@ def checkpoint_emit(_part):
509511
break
510512

511513
try:
514+
if self.refresh_cycles is not None:
515+
cycles = 0
512516
while not self.stopped:
513517
out = []
518+
519+
if self.refresh_cycles is not None and cycles == 0:
520+
kafka_cluster_metadata = self.consumer.list_topics(self.topic)
521+
if self.engine == "cudf": # pragma: no cover
522+
new_partitions = len(kafka_cluster_metadata[self.topic.encode('utf-8')])
523+
else:
524+
new_partitions = len(kafka_cluster_metadata.topics[self.topic].partitions)
525+
if new_partitions > self.npartitions:
526+
self.positions.extend([-1001] * (new_partitions - self.npartitions))
527+
self.npartitions = new_partitions
528+
514529
for partition in range(self.npartitions):
515530
tp = ck.TopicPartition(self.topic, partition, 0)
516531
try:
@@ -533,6 +548,9 @@ def checkpoint_emit(_part):
533548
self.positions[partition] = high
534549
self.consumer_params['auto.offset.reset'] = 'earliest'
535550

551+
if self.refresh_cycles is not None:
552+
cycles = (cycles + 1) % self.refresh_cycles
553+
536554
for part in out:
537555
yield self.loop.add_callback(checkpoint_emit, part)
538556

@@ -562,7 +580,8 @@ def start(self):
562580

563581
@Stream.register_api(staticmethod)
564582
def from_kafka_batched(topic, consumer_params, poll_interval='1s',
565-
npartitions=None, start=False, dask=False,
583+
npartitions=None, refresh_cycles=None,
584+
start=False, dask=False,
566585
max_batch_size=10000, keys=False,
567586
engine=None, **kwargs):
568587
""" Get messages and keys (optional) from Kafka in batches
@@ -598,8 +617,14 @@ def from_kafka_batched(topic, consumer_params, poll_interval='1s',
598617
npartitions: int (None)
599618
| Number of partitions in the topic.
600619
| If None, streamz will poll Kafka to get the number of partitions.
601-
| As of now, streamz does not support changing number of partitions on the fly.
602-
| It is recommended to restart the stream after changing the number of partitions.
620+
refresh_cycles: int (None)
621+
| Useful if the user expects to increase the number of partitions on the fly,
622+
| for example to handle spikes in load. Streamz polls Kafka after every
623+
| `refresh_cycles` batches to determine the current number of topic
624+
| partitions. If partitions have been added, streamz will automatically start
625+
| reading data from the new partitions as well.
626+
| If set to None, streamz will not accommodate changing partitions on the fly.
627+
| It is recommended to restart the stream after decreasing the number of partitions.
603628
start: bool (False)
604629
Whether to start polling upon instantiation
605630
max_batch_size: int
@@ -631,7 +656,6 @@ def from_kafka_batched(topic, consumer_params, poll_interval='1s',
631656
| More information at: https://rapids.ai/start.html
632657
633658
Important Kafka Configurations
634-
----------
635659
By default, a stream will start reading from the latest offsets
636660
available. Please set 'auto.offset.reset': 'earliest' in the
637661
consumer configs, if the stream needs to start processing from
@@ -651,6 +675,7 @@ def from_kafka_batched(topic, consumer_params, poll_interval='1s',
651675
source = FromKafkaBatched(topic, consumer_params,
652676
poll_interval=poll_interval,
653677
npartitions=npartitions,
678+
refresh_cycles=refresh_cycles,
654679
max_batch_size=max_batch_size,
655680
keys=keys,
656681
engine=engine,

streamz/tests/test_kafka.py

Lines changed: 52 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,8 @@ def kafka_service():
8787
"Kafka not available. "
8888
"To launch kafka use `export STREAMZ_LAUNCH_KAFKA=true`")
8989

90-
producer = ck.Producer({'bootstrap.servers': 'localhost:9092'})
90+
producer = ck.Producer({'bootstrap.servers': 'localhost:9092',
91+
'topic.metadata.refresh.interval.ms': '5000'})
9192
producer.produce('test-start-kafka', b'test')
9293
out = producer.flush(10)
9394
if out > 0:
@@ -289,6 +290,56 @@ def test_kafka_batch_npartitions():
289290
stream3.upstream.stopped = True
290291

291292

293+
def test_kafka_refresh_cycles():
294+
j1 = random.randint(0, 10000)
295+
ARGS = {'bootstrap.servers': 'localhost:9092',
296+
'group.id': 'streamz-test%i' % j1,
297+
'enable.auto.commit': False,
298+
'auto.offset.reset': 'earliest'}
299+
with kafka_service() as kafka:
300+
kafka, TOPIC = kafka
301+
TOPIC = "test-partitions"
302+
subprocess.call(shlex.split("docker exec streamz-kafka "
303+
"/opt/kafka_2.11-0.10.1.0/bin/kafka-topics.sh "
304+
"--create --zookeeper localhost:2181 "
305+
"--replication-factor 1 --partitions 2 "
306+
"--topic test-partitions"))
307+
time.sleep(2)
308+
309+
for i in range(10):
310+
if i % 2 == 0:
311+
kafka.produce(TOPIC, b'value-%d' % i, partition=0)
312+
else:
313+
kafka.produce(TOPIC, b'value-%d' % i, partition=1)
314+
kafka.flush()
315+
316+
stream = Stream.from_kafka_batched(TOPIC, ARGS,
317+
asynchronous=True,
318+
refresh_cycles=1,
319+
poll_interval='2s')
320+
out = stream.gather().sink_to_list()
321+
stream.start()
322+
time.sleep(5)
323+
assert (len(out) == 2 and (len(out[0]) + len(out[1])) == 10)
324+
325+
subprocess.call(shlex.split("docker exec streamz-kafka "
326+
"/opt/kafka_2.11-0.10.1.0/bin/kafka-topics.sh "
327+
"--alter --zookeeper localhost:2181 "
328+
"--topic test-partitions --partitions 4"))
329+
time.sleep(5)
330+
331+
for i in range(10,20):
332+
if i % 2 == 0:
333+
kafka.produce(TOPIC, b'value-%d' % i, partition=2)
334+
else:
335+
kafka.produce(TOPIC, b'value-%d' % i, partition=3)
336+
kafka.flush()
337+
time.sleep(5)
338+
339+
assert (len(out) == 4 and (len(out[2]) + len(out[3])) == 10)
340+
stream.upstream.stopped = True
341+
342+
292343
def test_kafka_batch_checkpointing_sync_nodes():
293344
'''
294345
Streams 1 and 3 have different consumer groups, while Stream 2

0 commit comments

Comments
 (0)