Merge pull request #412 from mroeschke/feature/ewm

martindurant · web-flow · commit ae8498268405 · 2021-04-05T12:02:43.000-04:00
Add Expanding and EWM.mean
diff --git a/streamz/dataframe/aggregations.py b/streamz/dataframe/aggregations.py
@@ -148,6 +148,60 @@ def initial(self, new):
         return new.iloc[:0]
 
 
+class EWMean(Aggregation):
+    """ Exponentially weighted mean over an ever-growing window
+
+    Rows are folded in one at a time.  After each row the running mean
+    weights every row seen so far by ``(1 - alpha) ** age``, where ``age``
+    is the number of rows that arrived after it.  Only the running mean and
+    the accumulated weight are carried between batches.
+
+    Parameters
+    ----------
+    com : float
+        Center of mass of the decay, ``alpha = 1 / (1 + com)``.
+    """
+
+    def __init__(self, com):
+        self.com = com
+        alpha = 1. / (1. + self.com)
+        # Weight of all previous rows shrinks by this factor per new row.
+        self.old_wt_factor = 1. - alpha
+        self.new_wt = 1.
+
+    def on_new(self, acc, new):
+        """ Fold the rows of ``new`` into the running mean
+
+        ``acc`` is the ``(result, old_wt, is_first)`` tuple produced by
+        ``initial`` or by a previous call.  Returns ``(acc, result)``.
+        """
+        result, old_wt, is_first = acc
+        if len(new) == 0:
+            # Nothing to fold in.  Leave ``is_first`` alone so that an empty
+            # leading batch does not mark the mean as already seeded.
+            return acc, result
+        first = 0
+        if is_first:
+            if len(result) == 0:
+                # ``initial`` only saw an empty batch; seed from this one.
+                result = new.iloc[:1]
+            # The seed row already is the mean; do not fold it in again.
+            first = 1
+        for i in range(first, len(new)):
+            old_wt *= self.old_wt_factor
+            result = ((old_wt * result) + (self.new_wt * new.iloc[i])) / (old_wt + self.new_wt)
+            old_wt += self.new_wt
+        return (result, old_wt, False), result
+
+    def on_old(self, acc, old):
+        """ Nothing is removed from the mean: return the state unchanged """
+        return acc, acc[0]
+
+    def initial(self, new):
+        """ Initial state, seeded with the first row of ``new`` if any """
+        return new.iloc[:1], 1, True
+
+
 def diff_iloc(dfs, new, window=None):
     """ Emit new list of dfs and decayed data
 
@@ -223,6 +245,21 @@ def diff_loc(dfs, new, window=None):
     return dfs, old
 
 
+def diff_expanding(dfs, new, window=None):
+    """ Emit the grown collection of dfs and an empty list of decayed data
+
+    Every non-empty ``new`` is appended and nothing is ever dropped, so the
+    second return value is always ``[]``.  ``window`` is ignored; it is
+    accepted so the signature lines up with ``diff_iloc`` and ``diff_loc``.
+    The input ``dfs`` is copied into a fresh ``deque`` and not mutated.
+    """
+    retained = deque(dfs)
+    if len(new):
+        retained.append(new)
+    expired = []
+    return retained, expired
+
+
 def diff_align(dfs, groupers):
     """ Align groupers to newly-diffed dataframes
 
diff --git a/streamz/dataframe/core.py b/streamz/dataframe/core.py
@@ -170,6 +170,36 @@ def window(self, n=None, value=None, with_state=False, start=None):
         """
         return Window(self, n=n, value=value, with_state=with_state, start=start)
 
+    def expanding(self, with_state=False, start=None):
+        """ Expanding window over this dataframe
+
+        Returns an ``Expanding`` object built with ``n=1``; ``with_state``
+        and ``start`` are forwarded unchanged.
+        """
+        return Expanding(
+            self,
+            n=1,
+            with_state=with_state,
+            start=start
+        )
+
+    def ewm(self, com=None, span=None, halflife=None, alpha=None, with_state=False, start=None):
+        """ Exponentially weighted window over this dataframe
+
+        Returns an ``EWM`` object built with ``n=1``; every other argument
+        is forwarded unchanged.
+        """
+        return EWM(
+            self,
+            n=1,
+            com=com,
+            span=span,
+            halflife=halflife,
+            alpha=alpha,
+            with_state=with_state,
+            start=start
+        )
+
     def _cumulative_aggregation(self, op):
         return self.accumulate_partitions(_cumulative_accumulator,
                                           returns_state=True,
@@ -531,18 +537,35 @@ def __init__(self, sdf, n=None, value=None, with_state=False, start=None):
 
     def __getitem__(self, key):
+        """ Window over ``self.root[key]`` with the same settings """
         sdf = self.root[key]
-        return Window(sdf, n=self.n, value=self.value, with_state=self.with_state, start=self.start)
+        # type(self) rather than Window so that subclasses keep their type.
+        return type(self)(
+            sdf,
+            n=self.n,
+            value=self.value,
+            with_state=self.with_state,
+            start=self.start
+        )
 
     def __getattr__(self, key):
         if key in self.root.columns or not len(self.root.columns):
             return self[key]
         else:
-            raise AttributeError("Window has no attribute %r" % key)
+            # Bare class name and repr of the key, so the message reads
+            # e.g. "Window has no attribute 'x'" for any subclass too.
+            raise AttributeError(f"{type(self).__name__} has no attribute {key!r}")
 
     def map_partitions(self, func, *args, **kwargs):
+        # Every Window argument (subclasses included) is unwrapped to its
+        # underlying dataframe, whatever the concrete type of ``self`` is.
         args2 = [a.root if isinstance(a, Window) else a for a in args]
         root = self.root.map_partitions(func, *args2, **kwargs)
-        return Window(root, n=self.n, value=self.value, with_state=self.with_state, start=self.start)
+        return type(self)(
+            root,
+            n=self.n,
+            value=self.value,
+            with_state=self.with_state,
+            start=self.start
+        )
 
     @property
     def index(self):
@@ -561,7 +579,16 @@ def example(self):
         return self.root.example
 
     def reset_index(self):
-        return Window(self.root.reset_index(), n=self.n, value=self.value)
+        """ Window over the index-reset dataframe with the same settings """
+        # with_state and start are forwarded too, as __getitem__ and
+        # map_partitions do; otherwise they silently revert to the defaults.
+        return type(self)(
+            self.root.reset_index(),
+            n=self.n,
+            value=self.value,
+            with_state=self.with_state,
+            start=self.start
+        )
 
     def aggregate(self, agg):
         if self.n is not None:
@@ -622,6 +640,166 @@ def groupby(self, other):
                                self.with_state, self.start)
 
 
+class Expanding(Window):
+    """ Expanding window over a streaming dataframe
+
+    Batches are accumulated with ``aggregations.diff_expanding``, so each
+    aggregation is computed over everything emitted so far.
+    """
+
+    def aggregate(self, agg):
+        """ Accumulate the aggregation ``agg`` over the expanding window """
+        window = self.n
+        diff = aggregations.diff_expanding
+        return self.root.accumulate_partitions(aggregations.window_accumulator,
+                                               diff=diff,
+                                               window=window,
+                                               agg=agg,
+                                               start=self.start,
+                                               returns_state=True,
+                                               stream_type='updating',
+                                               with_state=self.with_state)
+
+    def groupby(self, other):
+        """ Grouped expanding windows are not supported """
+        raise NotImplementedError
+
+
+class EWM(Expanding):
+    """ Exponentially weighted window over a streaming dataframe
+
+    The decay is given by exactly one of ``com``, ``span``, ``halflife`` or
+    ``alpha`` and is converted to a center of mass kept in ``_com``.  Only
+    ``mean`` is implemented; the other aggregations raise
+    ``NotImplementedError``.
+    """
+
+    def __init__(
+            self,
+            sdf,
+            n=1,
+            value=None,
+            with_state=False,
+            start=None,
+            com=None,
+            span=None,
+            halflife=None,
+            alpha=None
+    ):
+        super().__init__(sdf, n=n, value=value, with_state=with_state, start=start)
+        self._com = self._get_com(com, span, halflife, alpha)
+        # The raw arguments are kept so that derived windows can be rebuilt
+        # with exactly what the caller passed.
+        self.com = com
+        self.span = span
+        self.alpha = alpha
+        self.halflife = halflife
+
+    def _rewrap(self, sdf):
+        """ Build a window of the same type and settings around ``sdf`` """
+        return type(self)(
+            sdf,
+            n=self.n,
+            value=self.value,
+            with_state=self.with_state,
+            start=self.start,
+            com=self.com,
+            span=self.span,
+            halflife=self.halflife,
+            alpha=self.alpha
+        )
+
+    def __getitem__(self, key):
+        """ Window over ``self.root[key]`` with the same decay settings """
+        return self._rewrap(self.root[key])
+
+    def map_partitions(self, func, *args, **kwargs):
+        """ Map ``func`` over the underlying dataframe, keeping the decay
+
+        Overridden because the inherited version rebuilds the window without
+        any decay argument, which ``_get_com`` rejects with ``ValueError``.
+        """
+        args2 = [a.root if isinstance(a, Window) else a for a in args]
+        root = self.root.map_partitions(func, *args2, **kwargs)
+        return self._rewrap(root)
+
+    def reset_index(self):
+        """ Reset the index of the underlying dataframe, keeping the decay
+
+        Overridden because the inherited version rebuilds the window without
+        any decay argument, which ``_get_com`` rejects with ``ValueError``.
+        """
+        return self._rewrap(self.root.reset_index())
+
+    @staticmethod
+    def _get_com(com, span, halflife, alpha):
+        """ Convert whichever decay argument was given to a center of mass
+
+        Returns the center of mass as a ``float``.  Raises ``ValueError`` if
+        none or several arguments are given, or if the value is out of range.
+        """
+        if sum(var is not None for var in (com, span, halflife, alpha)) > 1:
+            raise ValueError("Can only provide one of `com`, `span`, `halflife`, `alpha`.")
+        # Convert to center of mass; domain checks ensure 0 < alpha <= 1
+        if com is not None:
+            if com < 0:
+                raise ValueError("com must satisfy: com >= 0")
+        elif span is not None:
+            if span < 1:
+                raise ValueError("span must satisfy: span >= 1")
+            com = (span - 1) / 2
+        elif halflife is not None:
+            if halflife <= 0:
+                raise ValueError("halflife must satisfy: halflife > 0")
+            decay = 1 - np.exp(np.log(0.5) / halflife)
+            com = 1 / decay - 1
+        elif alpha is not None:
+            if alpha <= 0 or alpha > 1:
+                raise ValueError("alpha must satisfy: 0 < alpha <= 1")
+            com = (1 - alpha) / alpha
+        else:
+            raise ValueError("Must pass one of com, span, halflife, or alpha")
+
+        return float(com)
+
+    def full(self):
+        """ Not implemented for exponentially weighted windows """
+        raise NotImplementedError
+
+    def apply(self, func):
+        """ Not implemented for exponentially weighted windows """
+        raise NotImplementedError
+
+    def sum(self):
+        """ Not implemented for exponentially weighted windows """
+        raise NotImplementedError
+
+    def count(self):
+        """ Not implemented for exponentially weighted windows """
+        raise NotImplementedError
+
+    def mean(self):
+        """ Exponentially weighted average of the elements seen so far """
+        return self.aggregate(aggregations.EWMean(self._com))
+
+    def var(self, ddof=1):
+        """ Not implemented for exponentially weighted windows """
+        raise NotImplementedError
+
+    def std(self, ddof=1):
+        """ Not implemented for exponentially weighted windows """
+        raise NotImplementedError
+
+    @property
+    def size(self):
+        """ Not implemented for exponentially weighted windows """
+        raise NotImplementedError
+
+    def value_counts(self):
+        """ Not implemented for exponentially weighted windows """
+        raise NotImplementedError
+
+
 def rolling_accumulator(acc, new, window=None, op=None,
                         with_state=False, args=(), kwargs={}):
     if len(acc):
diff --git a/streamz/dataframe/tests/test_dataframes.py b/streamz/dataframe/tests/test_dataframes.py
@@ -709,6 +709,81 @@ def test_windowing_n(func, n, getter):
     assert_eq(L[-1], func(getter(df).iloc[len(df) - n:] + 10))
 
 
+@pytest.mark.parametrize('func', [
+    lambda x: x.sum(),
+    lambda x: x.mean(),
+    lambda x: x.count(),
+    lambda x: x.var(ddof=1),
+    lambda x: x.std(ddof=1),
+    lambda x: x.var(ddof=0),
+], ids=["sum", "mean", "count", "var_1", "std", "var_0"])
+def test_expanding(func):
+    """ Streamed expanding aggregations match pandas on the concatenated data """
+    df = pd.DataFrame({'x': [1.], 'y': [2.]})
+    sdf = DataFrame(example=df)
+
+    L = func(sdf.expanding()).stream.gather().sink_to_list()
+
+    for i in range(5):
+        sdf.emit(df)
+
+    result = pd.concat(L, axis=1).T.astype(float)
+    expected = func(pd.concat([df] * 5, ignore_index=True).expanding())
+    assert_eq(result, expected)
+
+
+def test_ewm_mean():
+    """ Streamed ``ewm(1).mean()`` matches pandas when emitting one row at a time """
+    sdf = DataFrame(example=pd.DataFrame(columns=['x', 'y']))
+    L = sdf.ewm(1).mean().stream.gather().sink_to_list()
+    sdf.emit(pd.DataFrame({'x': [1.], 'y': [2.]}))
+    sdf.emit(pd.DataFrame({'x': [2.], 'y': [3.]}))
+    sdf.emit(pd.DataFrame({'x': [3.], 'y': [4.]}))
+    result = pd.concat(L, ignore_index=True)
+
+    df = pd.DataFrame({'x': [1., 2., 3.], 'y': [2., 3., 4.]})
+    expected = df.ewm(1).mean()
+    assert_eq(result, expected)
+
+
+def test_ewm_raise_multiple_arguments():
+    """ Passing more than one decay argument raises ``ValueError`` """
+    sdf = DataFrame(example=pd.DataFrame(columns=['x', 'y']))
+    with pytest.raises(ValueError, match="Can only provide one of"):
+        sdf.ewm(com=1, halflife=1)
+
+
+def test_ewm_raise_no_argument():
+    """ Passing no decay argument raises ``ValueError`` """
+    sdf = DataFrame(example=pd.DataFrame(columns=['x', 'y']))
+    with pytest.raises(ValueError, match="Must pass one of"):
+        sdf.ewm()
+
+
+@pytest.mark.parametrize("arg", ["com", "halflife", "alpha", "span"])
+def test_raise_invalid_argument(arg):
+    """ A value of -1 is rejected for each decay argument """
+    sdf = DataFrame(example=pd.DataFrame(columns=['x', 'y']))
+    param = {arg: -1}
+    with pytest.raises(ValueError):
+        sdf.ewm(**param)
+
+
+@pytest.mark.parametrize('func', [
+    lambda x: x.sum(),
+    lambda x: x.count(),
+    lambda x: x.apply(lambda x: x),
+    lambda x: x.full(),
+    lambda x: x.var(),
+    lambda x: x.std()
+], ids=["sum", "count", "apply", "full", "var", "std"])
+def test_ewm_notimplemented(func):
+    """ The listed aggregations raise ``NotImplementedError`` on an EWM window """
+    sdf = DataFrame(example=pd.DataFrame(columns=['x', 'y']))
+    with pytest.raises(NotImplementedError):
+        func(sdf.ewm(1))
+
+
 @pytest.mark.parametrize('func', [
     lambda x: x.sum(),
     lambda x: x.mean(),