Commit caad325 (2 parents: 55e5fd7 + d410316)

Merge pull request #363 from jbednar/periodicdataframe
Added PeriodicDataFrame

5 files changed: 169 additions & 31 deletions

docs/source/collections-api.rst

Lines changed: 3 additions & 0 deletions
@@ -86,6 +86,9 @@ Dataframes
    Rolling.sum
    Rolling.var
 
+.. autosummary::
+   PeriodicDataFrame
+
 .. autosummary::
    Random
 

docs/source/dataframes.rst

Lines changed: 76 additions & 2 deletions
@@ -4,7 +4,7 @@ DataFrames
 When handling large volumes of streaming tabular data it is often more
 efficient to pass around larger Pandas dataframes with many rows each rather
 than pass around individual Python tuples or dicts. Handling and computing on
-data with Pandas can be much faster than operating on Python objects.
+data with Pandas can be much faster than operating on individual Python objects.
 
 So one could imagine building streaming dataframe pipelines using the ``.map``
 and ``.accumulate`` streaming operators with functions that consume and produce
@@ -178,5 +178,79 @@ and ``DaskStream`` objects.
 Not Yet Supported
 -----------------
 
-Streaming dataframes algorithms do not currently pay special attention to data
+Streaming dataframe algorithms do not currently pay special attention to data
 arriving out-of-order.
+
+
+PeriodicDataFrame
+-----------------
+
+As you have seen above, Streamz can handle arbitrarily complex pipelines,
+events, and topologies, but what if you simply want to run some Python
+function periodically and collect or plot the results?
+
+streamz provides a high-level convenience class for this purpose, called
+a PeriodicDataFrame. A PeriodicDataFrame uses Python's asyncio event loop
+(used as part of Tornado in Jupyter and other interactive frameworks) to
+call a user-provided function at a regular interval, collecting the results
+and making them available for later processing.
+
+In the simplest case, you can use a PeriodicDataFrame by first writing
+a callback function like:
+
+.. code-block:: python
+
+    import numpy as np
+
+    def random_datapoint(**kwargs):
+        return pd.DataFrame({'a': np.random.random(1)}, index=[pd.Timestamp.now()])
+
+You can then make a streaming dataframe to poll this function
+e.g. every 300 milliseconds:
+
+.. code-block:: python
+
+    df = PeriodicDataFrame(random_datapoint, interval='300ms')
+
+``df`` will now be a steady stream of whatever values are returned by
+the ``datafn``, which can of course be any Python code as long as it
+returns a DataFrame.
+
+Here we returned only a single point, appropriate for streaming the
+results of system calls or other isolated actions, but any number of
+entries can be returned by the dataframe in a single batch. To
+facilitate collecting such batches, the callback is invoked with
+keyword arguments ``last`` (the time of the previous invocation) and
+``now`` (the time of the current invocation) as Pandas Timestamp
+objects. The callback can then generate or query for just the values
+in that time range.
+
+Arbitrary keyword arguments can be provided to the PeriodicDataFrame
+constructor, which will be passed into the callback so that its behavior
+can be parameterized.
+
+For instance, you can write a callback to return a suitable number of
+datapoints to keep a regularly updating stream, generated randomly
+as a batch since the last call:
+
+.. code-block:: python
+
+    def datablock(last, now, **kwargs):
+        freq = kwargs.get("freq", pd.Timedelta("50ms"))
+        index = pd.date_range(start=last + freq, end=now, freq=freq)
+        return pd.DataFrame({'x': np.random.random(len(index))}, index=index)
+
+    df = PeriodicDataFrame(datablock, interval='300ms')
+
+The callback will now be invoked every 300ms, each time generating
+datapoints at a rate of 1 every 50ms, returned as a batch. If you
+wished, you could override the 50ms value by passing
+``freq=pd.Timedelta("100ms")`` to the PeriodicDataFrame constructor.
+
+Similar code could e.g. query an external database for the time range
+since the last update, returning all datapoints since then.
+
+Once you have a PeriodicDataFrame defined using such callbacks, you
+can then use all the rest of the functionality supported by streamz,
+including aggregations, rolling windows, etc., and streaming
+`visualization <plotting>`_.
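The batch-sizing behavior documented above can be sanity-checked without any streaming machinery. A minimal sketch (pandas and numpy only, with hypothetical timestamps) showing how the rows per batch follow from the polling interval and ``freq``:

```python
import numpy as np
import pandas as pd

def datablock(last, now, **kwargs):
    # Batch of rows covering (last, now], one row per `freq`.
    freq = kwargs.get("freq", pd.Timedelta("50ms"))
    index = pd.date_range(start=last + freq, end=now, freq=freq)
    return pd.DataFrame({'x': np.random.random(len(index))}, index=index)

# Pretend the poller last fired 300ms ago (matching interval='300ms').
last = pd.Timestamp("2024-01-01 00:00:00")
now = last + pd.Timedelta("300ms")

print(len(datablock(last, now)))                              # 6 rows at 50ms spacing
print(len(datablock(last, now, freq=pd.Timedelta("100ms"))))  # 3 rows at 100ms spacing
```

A 300ms window at 50ms spacing yields six timestamps (``last + 50ms`` through ``now`` inclusive), so each 300ms tick emits a six-row batch; passing a larger ``freq`` through the constructor's kwargs shrinks the batch accordingly.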

streamz/dataframe/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1,3 +1,3 @@
 from .core import (DataFrame, DataFrames, Frame, Frames, Series, Seriess, Index,
-                   Rolling, Window, Random, GroupBy)
+                   Rolling, Window, PeriodicDataFrame, Random, GroupBy)
 from .aggregations import Aggregation

streamz/dataframe/core.py

Lines changed: 74 additions & 28 deletions
@@ -2,7 +2,6 @@
 
 import operator
 from collections import OrderedDict
-from time import time
 import numpy as np
 import pandas as pd
 import toolz
@@ -802,10 +801,35 @@ def _accumulate(self, Agg, **kwargs):
         return Streaming(outstream, example, stream_type=stream_type)
 
 
-def _random_df(tup):
-    last, now, freq = tup
-    index = pd.date_range(start=(last + freq.total_seconds()) * 1e9,
-                          end=now * 1e9, freq=freq)
+def random_datapoint(now, **kwargs):
+    """Example of querying a single current value"""
+    return pd.DataFrame(
+        {'a': np.random.random(1)}, index=[now])
+
+
+def random_datablock(last, now, **kwargs):
+    """
+    Example of querying over a time range since last update
+
+    Parameters
+    ----------
+    last: pd.Timestamp
+        Time of previous call to this function.
+    now: pd.Timestamp
+        Current time.
+    freq: pd.Timedelta, optional
+        The time interval between individual records to be returned.
+        For good throughput, should be much smaller than the
+        interval at which this function is called.
+
+    Returns a pd.DataFrame with random values where:
+
+    The x column is uniformly distributed.
+    The y column is Poisson distributed.
+    The z column is normally distributed.
+    """
+    freq = kwargs.get("freq", pd.Timedelta("100ms"))
+    index = pd.date_range(start=last + freq, end=now, freq=freq)
 
     df = pd.DataFrame({'x': np.random.random(len(index)),
                        'y': np.random.poisson(size=len(index)),
@@ -814,47 +838,50 @@ def _random_df(tup):
     return df
 
 
-class Random(DataFrame):
-    """ A streaming dataframe of random data
-
-    The x column is uniformly distributed.
-    The y column is poisson distributed.
-    The z column is normally distributed.
-
-    This class is experimental and will likely be removed in the future
+class PeriodicDataFrame(DataFrame):
+    """A streaming dataframe using the asyncio ioloop to poll a callback fn
 
     Parameters
     ----------
-    freq: timedelta
-        The time interval between records
+    datafn: callable
+        Callback function accepting **kwargs and returning a
+        pd.DataFrame. kwargs will include at least
+        'last' (time.time() datafn was last invoked), and
+        'now' (current time.time()).
     interval: timedelta
-        The time interval between new dataframes, should be significantly
-        larger than freq
+        The time interval between new dataframes.
+    dask: boolean
+        If true, uses a DaskStream instead of a regular Source.
+    **kwargs:
+        Optional keyword arguments to be passed into the callback function.
+
+    By default, returns a three-column random pd.DataFrame generated
+    by the 'random_datablock' function.
 
     Example
     -------
-    >>> source = Random(freq='100ms', interval='1s')  # doctest: +SKIP
+    >>> df = PeriodicDataFrame(interval='1s', datafn=random_datapoint)  # doctest: +SKIP
     """
 
-    def __init__(self, freq='100ms', interval='500ms', dask=False):
+    def __init__(self, datafn=random_datablock, interval='500ms', dask=False, **kwargs):
         if dask:
             from streamz.dask import DaskStream
             source = DaskStream()
             loop = source.loop
         else:
             source = Source()
             loop = IOLoop.current()
-        self.freq = pd.Timedelta(freq)
         self.interval = pd.Timedelta(interval).total_seconds()
         self.source = source
         self.continue_ = [True]
+        self.kwargs = kwargs
 
-        stream = self.source.map(_random_df)
-        example = _random_df((time(), time(), self.freq))
+        stream = self.source.map(lambda x: datafn(**x, **kwargs))
+        example = datafn(last=pd.Timestamp.now(), now=pd.Timestamp.now(), **kwargs)
 
-        super(Random, self).__init__(stream, example)
+        super(PeriodicDataFrame, self).__init__(stream, example)
 
-        loop.add_callback(self._cb, self.interval, self.freq, self.source,
+        loop.add_callback(self._cb, self.interval, self.source,
                           self.continue_)
 
     def __del__(self):
@@ -865,15 +892,34 @@ def stop(self):
 
     @staticmethod
    @gen.coroutine
-    def _cb(interval, freq, source, continue_):
-        last = time()
+    def _cb(interval, source, continue_):
+        last = pd.Timestamp.now()
         while continue_[0]:
             yield gen.sleep(interval)
-            now = time()
-            yield source._emit((last, now, freq))
+            now = pd.Timestamp.now()
+            yield source._emit(dict(last=last, now=now))
             last = now
 
 
+class Random(PeriodicDataFrame):
+    """PeriodicDataFrame providing random values by default
+
+    Accepts same parameters as PeriodicDataFrame, plus
+    `freq`, a string that will be converted to a pd.Timedelta
+    and passed to the 'datafn'.
+
+    Useful mainly for examples and docs.
+
+    Example
+    -------
+    >>> source = Random(freq='100ms', interval='1s')  # doctest: +SKIP
+    """
+
+    def __init__(self, freq='100ms', interval='500ms', dask=False,
+                 datafn=random_datablock):
+        super(Random, self).__init__(datafn, interval, dask, freq=pd.Timedelta(freq))
+
+
 _stream_types['streaming'].append((is_dataframe_like, DataFrame))
 _stream_types['streaming'].append((is_index_like, Index))
 _stream_types['streaming'].append((is_series_like, Series))
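To make the new control flow in ``core.py`` concrete: each tick of ``_cb`` packages ``last`` and ``now`` into a dict, and ``source.map(lambda x: datafn(**x, **kwargs))`` splats that dict into the callback. A synchronous simulation of two ticks (no Tornado loop, pandas/numpy only, with hypothetical fixed timestamps standing in for ``gen.sleep``):

```python
import numpy as np
import pandas as pd

def random_datapoint(now, **kwargs):
    """One-row frame stamped with the current tick time (as in the diff)."""
    return pd.DataFrame({'a': np.random.random(1)}, index=[now])

emitted = []
last = pd.Timestamp("2024-01-01 00:00:00")
for tick in range(2):
    now = last + pd.Timedelta("500ms")         # stand-in for gen.sleep(interval)
    event = dict(last=last, now=now)           # what _cb passes to source._emit
    emitted.append(random_datapoint(**event))  # what the mapped datafn receives
    last = now

combined = pd.concat(emitted)
print(len(combined))  # 2 (one row per tick)
```

Note that ``random_datapoint`` absorbs the ``last`` key via ``**kwargs``, which is why the docstring in the diff requires callbacks to accept ``**kwargs``: the emitted dict always carries both timestamps.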

streamz/tests/test_batch.py

Lines changed: 15 additions & 0 deletions
@@ -37,6 +37,21 @@ def test_dataframes():
     assert result.z.tolist() == [3 * i for i in range(10)]
 
 
+def test_periodic_dataframes():
+    pd = pytest.importorskip('pandas')
+    from streamz.dataframe import PeriodicDataFrame
+    from streamz.dataframe.core import random_datapoint
+    df = random_datapoint(now=pd.Timestamp.now())
+    assert len(df) == 1
+
+    def callback(now, **kwargs):
+        return pd.DataFrame(dict(x=50, index=[now]))
+
+    df = PeriodicDataFrame(callback, interval='20ms')
+    assert df.tail(0).x == 50
+    df.stop()
+
+
 def test_filter():
     a = Batch()
     f = a.filter(lambda x: x % 2 == 0)
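One pandas detail worth noting in the new test's ``callback``: ``pd.DataFrame(dict(x=50, index=[now]))`` broadcasts the scalar 50 against the one-element list, producing a one-row frame with two ordinary columns, ``x`` and ``index`` (the timestamp lands in a column, not in the frame's index). A standalone check of that construction:

```python
import pandas as pd

now = pd.Timestamp("2024-01-01")
# Scalar 50 is broadcast against the one-element 'index' list,
# yielding columns ['x', 'index'] over a default RangeIndex.
df = pd.DataFrame(dict(x=50, index=[now]))

print(df.shape)         # (1, 2)
print(df['x'].iloc[0])  # 50
```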
