sparse processing

idc9 · idc9 · commit 5fc2c28d2ae3 · 2021-07-16T19:57:34.000-04:00
diff --git a/ya_glm/extmath.py b/ya_glm/extmath.py
@@ -0,0 +1,77 @@
+from sklearn.utils.sparsefuncs import mean_variance_axis
+from scipy.sparse import issparse
+
+import numpy as np
+
+
+def weighted_mean_std(X, sample_weight=None, ddof=0, norm_weights=True):
+    """
+    Computes possible weighted mean and standard deviations of each column of a data matrix. It is safe to call this function on either a sparse or dense matrix.
+
+    Parameters
+    -----------
+    X: array-like, shape (n_samples, n_features)
+        The data matrix.
+
+    sample_weight: None, array-like shape (n_samples)
+        The optional sample weights to use.
+
+    ddof: int
+        The divisor used in calculations
+        is ``TOT_WEIGHT - ddof``, where ``TOT_WEIGHT`` is the total weight.
+        If sample_weight is None or norm_weight=True then TOT_WEIGHT = n_samples.
+        Otherwise, TOT_WEIGHT = sample_weight.sum()
+
+    norm_weights: bool
+        Ensure the TOT_WEIGHT sums to n_samples.
+
+    Output
+    ------
+    mean, std
+
+    mean: array-like, shape (n_features, )
+        The weighted mean for each feature.
+
+    std: array-like, shape (n_features, )
+        The weighted standard deviation for each feature.
+    """
+
+    n_samples = X.shape[0]
+
+    # process sample weights
+    if sample_weight is not None:
+        _sample_weight = np.array(sample_weight).reshape(-1).astype(X.dtype)
+        assert len(_sample_weight) == n_samples
+
+        # possibly normalize the weights
+        if norm_weights:
+            _sample_weight /= _sample_weight.sum()
+            _sample_weight *= n_samples
+
+        TOT_WEIGHT = _sample_weight.sum()
+
+    else:
+        TOT_WEIGHT = n_samples
+        _sample_weight = None
+
+    # sklearn has this built in for sparse matrices
+    # TODO: can we find this somewhere for dense?
+    if issparse(X):
+        # TODO: handle ddof
+        MEAN, VAR, SUM_WEIGHTS = \
+            mean_variance_axis(X=X, axis=0, weights=_sample_weight,
+                               return_sum_weights=True)
+
+        VAR *= SUM_WEIGHTS / (TOT_WEIGHT - ddof)
+        return MEAN, np.sqrt(VAR)
+
+    # unweighted, dense case
+    if sample_weight is None:
+        return X.mean(axis=0), X.std(axis=0, ddof=ddof)
+
+    else:  # weighted, dense case
+        MEAN = X.T @ _sample_weight / TOT_WEIGHT
+        VAR = ((X - MEAN) ** 2).T @ _sample_weight
+        VAR = VAR / (TOT_WEIGHT - ddof)
+
+        return MEAN, np.sqrt(VAR)
diff --git a/ya_glm/sparse_utils.py b/ya_glm/sparse_utils.py
@@ -0,0 +1,146 @@
+from scipy.sparse.linalg import LinearOperator, aslinearoperator
+from scipy.sparse import diags, issparse
+import numpy as np
+
+
+def is_sparse_or_lin_op(a):
+    return issparse(a) or isinstance(a, LinearOperator)
+
+
+def safe_hstack(tup):
+
+    if any(is_sparse_or_lin_op(t) for t in tup):
+        return HStacked(tup)
+    else:
+        return np.hstack(tup)
+
+
+class HStacked(LinearOperator):
+    """
+    Represents np.hstack
+    """
+    def __init__(self, tup):
+
+        n_rows = tup[0].shape[0]
+        self.tup_n_cols = []
+        self.tup = []
+        for t in tup:
+            assert t.shape[0] == n_rows
+            if t.ndim == 0:
+                self.tup.append(t.reshape(-1, 1))
+            else:
+                self.tup.append(t)
+
+            self.tup_n_cols.append(t.shape[1])
+
+        shape = (n_rows, sum(self.tup_n_cols))
+
+        dtype = tup[0].dtype
+        super().__init__(dtype=dtype, shape=shape)
+
+    def _matvec(self, x):
+        out = []
+        left_idx = 0
+        right_idx = 0
+        for idx, n_cols in enumerate(self.tup_n_cols):
+            right_idx += n_cols
+            out.append(self.tup[idx]  @ x[left_idx:right_idx])
+            left_idx += n_cols
+
+        return sum(o for o in out)
+
+    def _rmatvec(self, x):
+        return np.concatenate([mat.T @ x for mat in self.tup])
+
+
+class OnesOuterVec(LinearOperator):
+    """
+    Represents the outer product 1_n vec.T where 1_n is the vector of ones
+    """
+    def __init__(self, n_rows, vec):
+        self.vec = np.asarray(vec).reshape(-1)
+        shape = (n_rows, self.vec.shape[0])
+        dtype = self.vec.dtype
+        super().__init__(dtype=dtype, shape=shape)
+
+    def _matvec(self, x):
+        return np.repeat(self.vec.T.dot(x), self.shape[0])
+
+    def _rmatvec(self, x):
+        return self.vec * x.sum()
+
+
+def centered_operator(X, center):
+    return aslinearoperator(X) - OnesOuterVec(X.shape[0], center)
+
+
+def center_scale_sparse(X, X_offset=None, X_scale=None):
+    """
+    Returns a linear operator representing a centered and scaled matrix
+
+    X_cent_scale = (X - X_offset) @ diags(1 / X_scale)
+
+    Output
+    ------
+    X_cent_scale: LinearOperator
+    """
+    if X_offset is None and X_scale is None:
+        return X
+
+    if X_offset is not None and X_scale is not None:
+        X_offset_scale = X_offset / X_scale
+        X_offset_scale = np.array(X_offset_scale).reshape(-1, 1)
+
+    elif X_offset is not None:
+        X_offset_scale = X_offset
+
+    if X_scale is not None:
+        X_ = X @ diags(1 / X_scale)
+    else:
+        X_ = X
+
+    return centered_operator(X=X_, center=X_offset_scale)
+
+
+def safe_row_scaled(mat, s):
+    if is_sparse_or_lin_op(mat):
+        return RowScaled(mat=mat, s=s)
+    else:
+        return diags(s) @ mat
+
+
+def safe_col_scaled(mat, s):
+    if is_sparse_or_lin_op(mat):
+        return ColScaled(mat=mat, s=s)
+    else:
+        return mat @diags(s)
+
+
+class RowScaled(LinearOperator):
+    def __init__(self, mat, s):
+        self.s = np.array(s).reshape(-1).astype(mat.dtype)
+        assert len(self.s) == mat.shape[0]
+        self.s = diags(self.s)
+        self.mat = mat
+        super().__init__(dtype=mat.dtype, shape=mat.shape)
+
+    def _matvec(self, x):
+        return self.s @ (self.mat @ x)
+
+    def _rmatvec(self, x):
+        return self.mat.T @ (self.s @ x)
+
+
+class ColScaled(LinearOperator):
+    def __init__(self, mat, s):
+        self.s = np.array(s).reshape(-1).astype(mat.dtype)
+        assert len(self.s) == mat.shape[1]
+        self.s = diags(self.s)
+        self.mat = mat
+        super().__init__(dtype=mat.dtype, shape=mat.shape)
+
+    def _matvec(self, x):
+        return self.mat @ (self.s @ x)
+
+    def _rmatvec(self, x):
+        return self.s @ (self.mat.T @ x)