Skip to content

Commit 906cea8

Browse files
authored
Merge pull request #358 from chinmaychandak/master
Make npartitions optional when streaming from Kafka, and set auto.offset.reset to "latest" by default.
2 parents 2c18ef2 + f9f0c60 commit 906cea8

2 files changed

Lines changed: 90 additions & 28 deletions

File tree

streamz/sources.py

Lines changed: 26 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -453,16 +453,17 @@ def _close_consumer(self):
453453
class FromKafkaBatched(Stream):
454454
"""Base class for both local and cluster-based batched kafka processing"""
455455
def __init__(self, topic, consumer_params, poll_interval='1s',
456-
npartitions=1, max_batch_size=10000, keys=False,
456+
npartitions=None, max_batch_size=10000, keys=False,
457457
engine=None, **kwargs):
458458
self.consumer_params = consumer_params
459459
# Override the auto-commit config to enforce custom streamz checkpointing
460460
self.consumer_params['enable.auto.commit'] = 'false'
461461
if 'auto.offset.reset' not in self.consumer_params.keys():
462-
consumer_params['auto.offset.reset'] = 'earliest'
462+
consumer_params['auto.offset.reset'] = 'latest'
463463
self.topic = topic
464464
self.npartitions = npartitions
465-
self.positions = [0] * npartitions
465+
if self.npartitions is not None and self.npartitions <= 0:
466+
raise ValueError("Number of Kafka topic partitions must be > 0.")
466467
self.poll_interval = convert_interval(poll_interval)
467468
self.max_batch_size = max_batch_size
468469
self.keys = keys
@@ -485,6 +486,14 @@ def checkpoint_emit(_part):
485486
ref = RefCounter(cb=lambda: commit(_part))
486487
yield self._emit(_part, metadata=[{'ref': ref}])
487488

489+
if self.npartitions is None:
490+
kafka_cluster_metadata = self.consumer.list_topics(self.topic)
491+
if self.engine == "cudf": # pragma: no cover
492+
self.npartitions = len(kafka_cluster_metadata[self.topic.encode('utf-8')])
493+
else:
494+
self.npartitions = len(kafka_cluster_metadata.topics[self.topic].partitions)
495+
self.positions = [0] * self.npartitions
496+
488497
tps = []
489498
for partition in range(self.npartitions):
490499
tps.append(ck.TopicPartition(self.topic, partition))
@@ -510,7 +519,9 @@ def checkpoint_emit(_part):
510519
except (RuntimeError, ck.KafkaException):
511520
continue
512521
if 'auto.offset.reset' in self.consumer_params.keys():
513-
if self.consumer_params['auto.offset.reset'] == 'latest':
522+
if self.consumer_params['auto.offset.reset'] == 'latest' and \
523+
(self.positions == [-1001] * self.npartitions
524+
or self.positions == [0] * self.npartitions):
514525
self.positions[partition] = high
515526
current_position = self.positions[partition]
516527
lowest = max(current_position, low)
@@ -551,7 +562,7 @@ def start(self):
551562

552563
@Stream.register_api(staticmethod)
553564
def from_kafka_batched(topic, consumer_params, poll_interval='1s',
554-
npartitions=1, start=False, dask=False,
565+
npartitions=None, start=False, dask=False,
555566
max_batch_size=10000, keys=False,
556567
engine=None, **kwargs):
557568
""" Get messages and keys (optional) from Kafka in batches
@@ -584,8 +595,11 @@ def from_kafka_batched(topic, consumer_params, poll_interval='1s',
584595
| group, each message will be passed to only one of them.
585596
poll_interval: number
586597
Seconds that elapse between polling Kafka for new messages
587-
npartitions: int
588-
Number of partitions in the topic
598+
npartitions: int (None)
599+
| Number of partitions in the topic.
600+
| If None, streamz will poll Kafka to get the number of partitions.
601+
| As of now, streamz does not support changing number of partitions on the fly.
602+
| It is recommended to restart the stream after changing the number of partitions.
589603
start: bool (False)
590604
Whether to start polling upon instantiation
591605
max_batch_size: int
@@ -616,20 +630,19 @@ def from_kafka_batched(topic, consumer_params, poll_interval='1s',
616630
617631
| More information at: https://rapids.ai/start.html
618632
619-
620633
Important Kafka Configurations
621634
----------
622-
If 'auto.offset.reset': 'latest' is set in the consumer configs,
623-
the stream starts reading messages from the latest offset. Else,
624-
if it's set to 'earliest', it will read from the start offset.
625-
635+
By default, a stream will start reading from the latest offsets
636+
available. Please set 'auto.offset.reset': 'earliest' in the
637+
consumer configs, if the stream needs to start processing from
638+
the earliest offsets.
626639
627640
Examples
628641
----------
629642
630643
>>> source = Stream.from_kafka_batched('mytopic',
631644
... {'bootstrap.servers': 'localhost:9092',
632-
... 'group.id': 'streamz'}, npartitions=4) # doctest: +SKIP
645+
... 'group.id': 'streamz'}) # doctest: +SKIP
633646
634647
"""
635648
if dask:

streamz/tests/test_kafka.py

Lines changed: 64 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
import os
55
import pytest
66
import random
7-
import requests
87
import shlex
98
import subprocess
109
import time
@@ -22,16 +21,6 @@
2221
ck = pytest.importorskip('confluent_kafka')
2322

2423

25-
def download_kafka(target):
26-
r = requests.get('http://apache.mirror.globo.tech/kafka/1.0.0/'
27-
'%s.tgz' % KAFKA_FILE, stream=True)
28-
with open(target, 'wb') as f:
29-
for chunk in r.iter_content(2 ** 20):
30-
f.write(chunk)
31-
subprocess.check_call(['tar', 'xzf', KAFKA_FILE],
32-
cwd=os.path.dirname(target))
33-
34-
3524
def stop_docker(name='streamz-kafka', cid=None, let_fail=False):
3625
"""Stop docker container with given name tag
3726
@@ -61,6 +50,7 @@ def stop_docker(name='streamz-kafka', cid=None, let_fail=False):
6150

6251
def launch_kafka():
6352
stop_docker(let_fail=True)
53+
subprocess.call(shlex.split("docker pull spotify/kafka"))
6454
cmd = ("docker run -d -p 2181:2181 -p 9092:9092 --env "
6555
"ADVERTISED_HOST=127.0.0.1 --env ADVERTISED_PORT=9092 "
6656
"--name streamz-kafka spotify/kafka")
@@ -244,6 +234,61 @@ def test_kafka_dask_batch(c, s, w1, w2):
244234
stream.upstream.stopped = True
245235

246236

237+
def test_kafka_batch_npartitions():
238+
j1 = random.randint(0, 10000)
239+
ARGS1 = {'bootstrap.servers': 'localhost:9092',
240+
'group.id': 'streamz-test%i' % j1,
241+
'enable.auto.commit': False,
242+
'auto.offset.reset': 'earliest'}
243+
j2 = j1 + 1
244+
ARGS2 = {'bootstrap.servers': 'localhost:9092',
245+
'group.id': 'streamz-test%i' % j2,
246+
'enable.auto.commit': False,
247+
'auto.offset.reset': 'earliest'}
248+
with kafka_service() as kafka:
249+
kafka, TOPIC = kafka
250+
251+
TOPIC = "test-partitions"
252+
subprocess.call(shlex.split("docker exec streamz-kafka "
253+
"/opt/kafka_2.11-0.10.1.0/bin/kafka-topics.sh "
254+
"--create --zookeeper localhost:2181 "
255+
"--replication-factor 1 --partitions 2 "
256+
"--topic test-partitions"))
257+
time.sleep(5)
258+
259+
for i in range(10):
260+
if i % 2 == 0:
261+
kafka.produce(TOPIC, b'value-%d' % i, partition=0)
262+
else:
263+
kafka.produce(TOPIC, b'value-%d' % i, partition=1)
264+
kafka.flush()
265+
266+
with pytest.raises(ValueError):
267+
stream1 = Stream.from_kafka_batched(TOPIC, ARGS1,
268+
asynchronous=True,
269+
npartitions=0)
270+
stream1.gather().sink_to_list()
271+
stream1.start()
272+
273+
stream2 = Stream.from_kafka_batched(TOPIC, ARGS1,
274+
asynchronous=True,
275+
npartitions=1)
276+
out2 = stream2.gather().sink_to_list()
277+
stream2.start()
278+
time.sleep(5)
279+
assert (len(out2) == 1 and len(out2[0]) == 5)
280+
stream2.upstream.stopped = True
281+
282+
stream3 = Stream.from_kafka_batched(TOPIC, ARGS2,
283+
asynchronous=True,
284+
npartitions=4)
285+
out3 = stream3.gather().sink_to_list()
286+
stream3.start()
287+
time.sleep(5)
288+
assert (len(out3) == 2 and (len(out3[0]) + len(out3[1])) == 10)
289+
stream3.upstream.stopped = True
290+
291+
247292
def test_kafka_batch_checkpointing_sync_nodes():
248293
'''
249294
Streams 1 and 3 have different consumer groups, while Stream 2
@@ -254,11 +299,13 @@ def test_kafka_batch_checkpointing_sync_nodes():
254299
j1 = random.randint(0, 10000)
255300
ARGS1 = {'bootstrap.servers': 'localhost:9092',
256301
'group.id': 'streamz-test%i' % j1,
257-
'enable.auto.commit': False}
302+
'enable.auto.commit': False,
303+
'auto.offset.reset': 'earliest'}
258304
j2 = j1 + 1
259305
ARGS2 = {'bootstrap.servers': 'localhost:9092',
260306
'group.id': 'streamz-test%i' % j2,
261-
'enable.auto.commit': False}
307+
'enable.auto.commit': False,
308+
'auto.offset.reset': 'earliest'}
262309
with kafka_service() as kafka:
263310
kafka, TOPIC = kafka
264311
for i in range(10):
@@ -291,11 +338,13 @@ def test_kafka_dask_checkpointing_sync_nodes(c, s, w1, w2):
291338
j1 = random.randint(0, 10000)
292339
ARGS1 = {'bootstrap.servers': 'localhost:9092',
293340
'group.id': 'streamz-test%i' % j1,
294-
'enable.auto.commit': False}
341+
'enable.auto.commit': False,
342+
'auto.offset.reset': 'earliest'}
295343
j2 = j1 + 1
296344
ARGS2 = {'bootstrap.servers': 'localhost:9092',
297345
'group.id': 'streamz-test%i' % j2,
298-
'enable.auto.commit': False}
346+
'enable.auto.commit': False,
347+
'auto.offset.reset': 'earliest'}
299348
with kafka_service() as kafka:
300349
kafka, TOPIC = kafka
301350
for i in range(10):

0 commit comments

Comments (0)