Skip to content

Commit 5fc2c28

Browse files
committed
sparse processing
1 parent e719d2a commit 5fc2c28

2 files changed

Lines changed: 223 additions & 0 deletions

File tree

ya_glm/extmath.py

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
from sklearn.utils.sparsefuncs import mean_variance_axis
2+
from scipy.sparse import issparse
3+
4+
import numpy as np
5+
6+
7+
def weighted_mean_std(X, sample_weight=None, ddof=0, norm_weights=True):
    """
    Computes the (possibly weighted) mean and standard deviation of each
    column of a data matrix. It is safe to call this function on either a
    sparse or dense matrix.

    Parameters
    -----------
    X: array-like, shape (n_samples, n_features)
        The data matrix. Must expose ``shape`` and ``dtype``.

    sample_weight: None, array-like shape (n_samples)
        The optional sample weights to use. The input is copied, never
        modified in place.

    ddof: int
        The divisor used in calculations
        is ``TOT_WEIGHT - ddof``, where ``TOT_WEIGHT`` is the total weight.
        If sample_weight is None or norm_weights=True then
        TOT_WEIGHT = n_samples. Otherwise, TOT_WEIGHT = sample_weight.sum()

    norm_weights: bool
        Rescale the sample weights so that they sum to n_samples.

    Output
    ------
    mean, std

    mean: array-like, shape (n_features, )
        The weighted mean for each feature.

    std: array-like, shape (n_features, )
        The weighted standard deviation for each feature.
    """
    n_samples = X.shape[0]

    # process sample weights
    if sample_weight is not None:
        # Keep X's dtype when it is already floating/complex; otherwise fall
        # back to float64. Casting weights to an integer dtype would truncate
        # fractional weights and make the in-place normalization below fail.
        if np.issubdtype(X.dtype, np.inexact):
            weight_dtype = X.dtype
        else:
            weight_dtype = np.float64

        # np.array copies, so the caller's weights are left untouched
        _sample_weight = np.array(sample_weight, dtype=weight_dtype).reshape(-1)
        assert len(_sample_weight) == n_samples, \
            "sample_weight must have one entry per row of X"

        # possibly normalize the weights so that they sum to n_samples
        if norm_weights:
            _sample_weight /= _sample_weight.sum()
            _sample_weight *= n_samples

        TOT_WEIGHT = _sample_weight.sum()

    else:
        TOT_WEIGHT = n_samples
        _sample_weight = None

    # sklearn has this built in for sparse matrices
    # TODO: can we find this somewhere for dense?
    if issparse(X):
        MEAN, VAR, SUM_WEIGHTS = \
            mean_variance_axis(X=X, axis=0, weights=_sample_weight,
                               return_sum_weights=True)

        # rescale sklearn's variance so that the divisor
        # becomes TOT_WEIGHT - ddof
        VAR *= SUM_WEIGHTS / (TOT_WEIGHT - ddof)
        return MEAN, np.sqrt(VAR)

    # unweighted, dense case
    if _sample_weight is None:
        return X.mean(axis=0), X.std(axis=0, ddof=ddof)

    # weighted, dense case
    MEAN = X.T @ _sample_weight / TOT_WEIGHT
    VAR = ((X - MEAN) ** 2).T @ _sample_weight
    VAR = VAR / (TOT_WEIGHT - ddof)

    return MEAN, np.sqrt(VAR)

ya_glm/sparse_utils.py

Lines changed: 146 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,146 @@
1+
from scipy.sparse.linalg import LinearOperator, aslinearoperator
2+
from scipy.sparse import diags, issparse
3+
import numpy as np
4+
5+
6+
def is_sparse_or_lin_op(a):
    """
    Returns True if ``a`` is a scipy ``LinearOperator`` or is recognised as
    sparse by ``scipy.sparse.issparse``; False otherwise.
    """
    if isinstance(a, LinearOperator):
        return True
    return issparse(a)
8+
9+
10+
def safe_hstack(tup):
    """
    Horizontally stacks the entries of ``tup``.

    If any entry is sparse or a LinearOperator the result is an ``HStacked``
    linear operator; otherwise it is the dense array from ``np.hstack``.
    """
    needs_operator = any(is_sparse_or_lin_op(block) for block in tup)
    return HStacked(tup) if needs_operator else np.hstack(tup)
16+
17+
18+
class HStacked(LinearOperator):
    """
    Represents np.hstack as a linear operator, i.e. the blocks in ``tup``
    placed side by side without forming the stacked matrix.

    Parameters
    ----------
    tup: sequence
        The blocks to stack. Each must expose ``shape``, ``ndim``, ``T`` and
        support ``@``; all must have the same number of rows. 1-D arrays are
        treated as single columns.

    Notes
    -----
    The operator's dtype is taken from the first block. ``_rmatvec`` uses the
    plain transpose of each block (no conjugation), so it equals the adjoint
    only for real-valued blocks.
    """
    def __init__(self, tup):

        n_rows = tup[0].shape[0]
        self.tup_n_cols = []
        self.tup = []
        for t in tup:
            assert t.shape[0] == n_rows

            # a 1-D vector is stacked as a single column; the column count
            # must be read from the reshaped block, not from the 1-D input
            if t.ndim == 1:
                t = t.reshape(-1, 1)

            self.tup.append(t)
            self.tup_n_cols.append(t.shape[1])

        shape = (n_rows, sum(self.tup_n_cols))

        dtype = tup[0].dtype
        super().__init__(dtype=dtype, shape=shape)

    def _matvec(self, x):
        # [A_1, ..., A_k] @ x = sum_i A_i @ x_i, where x_i is the slice of x
        # matching the columns of block A_i
        total = 0
        left_idx = 0
        for mat, n_cols in zip(self.tup, self.tup_n_cols):
            right_idx = left_idx + n_cols
            total = total + mat @ x[left_idx:right_idx]
            left_idx = right_idx

        return total

    def _rmatvec(self, x):
        # [A_1, ..., A_k].T @ x is the concatenation of the A_i.T @ x
        return np.concatenate([mat.T @ x for mat in self.tup])
54+
55+
56+
class OnesOuterVec(LinearOperator):
    """
    Linear operator for the rank-one matrix 1_n vec.T, where 1_n is the
    length ``n_rows`` vector of ones. Every row of the represented matrix
    equals ``vec``.

    Parameters
    ----------
    n_rows: int
        Number of rows of the represented matrix.

    vec: array-like
        Flattened to 1-D; its length is the number of columns.

    Notes
    -----
    ``_rmatvec`` does not conjugate ``vec``, so it equals the adjoint only
    for real-valued ``vec``.
    """
    def __init__(self, n_rows, vec):
        flat = np.asarray(vec).ravel()
        self.vec = flat
        super().__init__(dtype=flat.dtype, shape=(n_rows, flat.shape[0]))

    def _matvec(self, x):
        # every output entry is the same inner product <vec, x>
        inner = self.vec.dot(x)
        n_rows = self.shape[0]
        return np.repeat(inner, n_rows)

    def _rmatvec(self, x):
        # (1_n vec.T).T @ x = vec * (1_n.T @ x) = vec * sum(x)
        total = x.sum()
        return self.vec * total
71+
72+
73+
def centered_operator(X, center):
    """
    Returns a linear operator representing ``X`` with ``center`` subtracted
    from every row, i.e. X - 1_n center.T, without forming the dense matrix.
    """
    X_op = aslinearoperator(X)
    row_shift = OnesOuterVec(X.shape[0], center)
    return X_op - row_shift
75+
76+
77+
def center_scale_sparse(X, X_offset=None, X_scale=None):
    """
    Returns an object representing a centered and scaled matrix

    X_cent_scale = (X - X_offset) @ diags(1 / X_scale)

    without densifying X.

    Parameters
    ----------
    X: sparse matrix, shape (n_samples, n_features)
        The data matrix.

    X_offset: None, array-like shape (n_features, )
        The optional value subtracted from each column.

    X_scale: None, array-like shape (n_features, )
        The optional value each column is divided by.

    Output
    ------
    X_cent_scale:
        ``X`` itself if neither X_offset nor X_scale is given;
        the product ``X @ diags(1 / X_scale)`` if only X_scale is given;
        otherwise a LinearOperator.
    """
    if X_offset is None and X_scale is None:
        return X

    # scale the columns (if requested)
    if X_scale is not None:
        scale = np.asarray(X_scale).reshape(-1)
        X_ = X @ diags(1 / scale)
    else:
        scale = None
        X_ = X

    # nothing to subtract: the column scaled matrix is the answer
    if X_offset is None:
        return X_

    # (X - 1 offset.T) @ diags(1 / scale)
    #    = X @ diags(1 / scale) - 1 (offset / scale).T
    # so the offset has to be scaled as well
    X_offset_scale = np.asarray(X_offset).reshape(-1)
    if scale is not None:
        X_offset_scale = X_offset_scale / scale

    return centered_operator(X=X_, center=X_offset_scale)
103+
104+
105+
def safe_row_scaled(mat, s):
    """
    Scales the rows of ``mat`` by the entries of ``s``, i.e. diags(s) @ mat.

    Sparse matrices and LinearOperators are wrapped in a ``RowScaled``
    operator; anything else is multiplied directly.
    """
    if not is_sparse_or_lin_op(mat):
        return diags(s) @ mat

    return RowScaled(mat=mat, s=s)
110+
111+
112+
def safe_col_scaled(mat, s):
    """
    Scales the columns of ``mat`` by the entries of ``s``, i.e. mat @ diags(s).

    Sparse matrices and LinearOperators are wrapped in a ``ColScaled``
    operator; anything else is multiplied directly.
    """
    if not is_sparse_or_lin_op(mat):
        return mat @ diags(s)

    return ColScaled(mat=mat, s=s)
117+
118+
119+
class RowScaled(LinearOperator):
    """
    Linear operator representing diags(s) @ mat, i.e. ``mat`` with row i
    multiplied by s[i].

    Parameters
    ----------
    mat: sparse matrix or LinearOperator, shape (n_rows, n_cols)
        The matrix whose rows are scaled. Must expose ``dtype``, ``shape``,
        ``T`` and support ``@``.

    s: array-like, shape (n_rows, )
        The row scaling factors.

    Notes
    -----
    ``_rmatvec`` uses the plain transpose of ``mat`` (no conjugation), so it
    equals the adjoint only for real-valued inputs.
    """
    def __init__(self, mat, s):
        scale = np.asarray(s).reshape(-1)
        assert len(scale) == mat.shape[0]

        # Use mat's dtype when it is floating/complex. For integer or bool
        # matrices promote with the scale's dtype instead; casting the scale
        # down to an integer dtype would silently truncate it.
        dtype = mat.dtype
        if dtype is not None and not np.issubdtype(dtype, np.inexact):
            dtype = np.result_type(dtype, scale.dtype)

        self.s = diags(scale.astype(dtype))
        self.mat = mat
        super().__init__(dtype=dtype, shape=mat.shape)

    def _matvec(self, x):
        # (diags(s) @ mat) @ x
        return self.s @ (self.mat @ x)

    def _rmatvec(self, x):
        # (diags(s) @ mat).T @ x = mat.T @ (diags(s) @ x)
        return self.mat.T @ (self.s @ x)
132+
133+
134+
class ColScaled(LinearOperator):
    """
    Linear operator representing mat @ diags(s), i.e. ``mat`` with column j
    multiplied by s[j].

    Parameters
    ----------
    mat: sparse matrix or LinearOperator, shape (n_rows, n_cols)
        The matrix whose columns are scaled. Must expose ``dtype``,
        ``shape``, ``T`` and support ``@``.

    s: array-like, shape (n_cols, )
        The column scaling factors.

    Notes
    -----
    ``_rmatvec`` uses the plain transpose of ``mat`` (no conjugation), so it
    equals the adjoint only for real-valued inputs.
    """
    def __init__(self, mat, s):
        scale = np.asarray(s).reshape(-1)
        assert len(scale) == mat.shape[1]

        # Use mat's dtype when it is floating/complex. For integer or bool
        # matrices promote with the scale's dtype instead; casting the scale
        # down to an integer dtype would silently truncate it.
        dtype = mat.dtype
        if dtype is not None and not np.issubdtype(dtype, np.inexact):
            dtype = np.result_type(dtype, scale.dtype)

        self.s = diags(scale.astype(dtype))
        self.mat = mat
        super().__init__(dtype=dtype, shape=mat.shape)

    def _matvec(self, x):
        # (mat @ diags(s)) @ x = mat @ (diags(s) @ x)
        return self.mat @ (self.s @ x)

    def _rmatvec(self, x):
        # (mat @ diags(s)).T @ x = diags(s) @ (mat.T @ x)
        return self.s @ (self.mat.T @ x)

0 commit comments

Comments
 (0)