Skip to content

Commit e719d2a

Browse files
committed
sample weights
1 parent 49f7579 commit e719d2a

17 files changed

Lines changed: 311 additions & 137 deletions

ya_glm/base/Glm.py

Lines changed: 50 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,16 @@
11
from sklearn.base import BaseEstimator
22
from sklearn.utils.validation import check_is_fitted
33
from sklearn.utils.extmath import safe_sparse_dot
4-
from sklearn.utils.validation import check_array, FLOAT_DTYPES
4+
from sklearn.utils.validation import check_array, _check_sample_weight, \
5+
FLOAT_DTYPES
56
from scipy.linalg import svd
67

78
import numpy as np
89
from textwrap import dedent
910

1011
from ya_glm.autoassign import autoassign
1112
from ya_glm.processing import process_X, deprocess_fit
12-
from ya_glm.opt.GroupLasso import euclid_norm
13+
from ya_glm.opt.utils import euclid_norm
1314

1415

1516
_glm_base_params = dedent("""
@@ -33,7 +34,7 @@ class Glm(BaseEstimator):
3334
def __init__(self, fit_intercept=True, standardize=False, opt_kws={}):
3435
pass
3536

36-
def fit(self, X, y):
37+
def fit(self, X, y, sample_weight=None):
3738
"""
3839
Fits the GLM.
3940
@@ -44,22 +45,32 @@ def fit(self, X, y):
4445
4546
y: array-like, shape (n_samples, )
4647
The training response data.
48+
49+
sample_weight: None or array-like, shape (n_samples,)
50+
Individual weights for each sample.
4751
"""
4852

49-
X, y = self._validate_data(X, y)
53+
X, y, sample_weight = self._validate_data(X, y,
54+
sample_weight=sample_weight)
5055

5156
# TODO: do we want to give the user the option to not copy?
52-
X, y, pre_pro_out = self.preprocess(X, y, copy=True)
57+
X, y, pre_pro_out = self.preprocess(X=X, y=y,
58+
sample_weight=sample_weight,
59+
copy=True)
60+
61+
kws = self._get_solve_kws()
62+
if sample_weight is not None:
63+
kws['sample_weight'] = sample_weight
5364

5465
coef, intercept, out_data = self.solve_glm(X=X, y=y,
55-
**self._get_solve_kws())
66+
**kws)
5667

5768
self._set_fit(fit_out={'coef': coef, 'intercept': intercept,
5869
'opt_data': out_data},
5970
pre_pro_out=pre_pro_out)
6071
return self
6172

62-
def _validate_data(self, X, y, accept_sparse=False):
73+
def _validate_data(self, X, y, sample_weight=None, accept_sparse=True):
6374
"""
6475
Validates the X/y data. This should not change the raw input data, but may reformat the data (e.g. convert pandas to numpy).
6576
@@ -76,18 +87,22 @@ def _validate_data(self, X, y, accept_sparse=False):
7687
X = check_array(X, accept_sparse=accept_sparse,
7788
dtype=FLOAT_DTYPES)
7889

90+
if sample_weight is not None:
91+
sample_weight = _check_sample_weight(sample_weight, X,
92+
dtype=X.dtype)
93+
7994
# make sure y is numpy and of same dtype as X
8095
y = np.asarray(y, dtype=X.dtype)
8196

8297
# make sure X, y have same number of samples
8398
if y.shape[0] != X.shape[0]:
8499
raise ValueError("X and y must have the same number of rows!")
85100

86-
return X, y
101+
return X, y, sample_weight
87102

88-
def preprocess(self, X, y, copy=True):
103+
def preprocess(self, X, y, sample_weight=None, copy=True):
89104
"""
90-
Preprocesses the data for fitting. This method may transform the data e.g. centering and scaling X.
105+
Preprocesses the data for fitting. This method may transform the data e.g. centering and scaling X. If sample weights are provided then these are used for computing weighted means / standard deviations for standardization.
91106
92107
Parameters
93108
----------
@@ -97,6 +112,9 @@ def preprocess(self, X, y, copy=True):
97112
y: array-like, shape (n_samples, ) or (n_samples, n_responses)
98113
The response data.
99114
115+
sample_weight: None or array-like, shape (n_samples,)
116+
Individual weights for each sample.
117+
100118
copy: bool
101119
Whether or not to copy the X/y arrays or modify them in place.
102120
@@ -113,38 +131,22 @@ def preprocess(self, X, y, copy=True):
113131
pre_pro_out: dict
114132
Data from preprocessing e.g. X_center, X_scale.
115133
"""
134+
groups = self.groups if hasattr(self, 'groups') else None
116135

117136
X, out = process_X(X,
118137
standardize=self.standardize,
119-
groups=self._get_groups(),
138+
groups=groups,
139+
sample_weight=sample_weight,
120140
copy=copy,
121141
check_input=False,
122142
accept_sparse=False, # TODO!
123143
allow_const_cols=not self.fit_intercept)
124144

125-
y, y_out = self._process_y(y, copy=copy)
145+
y, y_out = self._process_y(y, sample_weight=sample_weight, copy=copy)
126146
out.update(y_out)
127147

128148
return X, y, out
129149

130-
# TODO: do we want this?
131-
# def _maybe_get(self, param):
132-
# """
133-
# Safely gets an attribute that may not exist (e.g. like self.param). Returns None if the object does not have the attribute.
134-
# """
135-
# if hasattr(self, param):
136-
# return self.__dict__[param]
137-
# else:
138-
# return None
139-
def _get_groups(self):
140-
"""
141-
Safely gets an attribute that may not exist (e.g. like self.param). Returns None if the object does not have the attribute.
142-
"""
143-
if hasattr(self, 'groups'):
144-
return self.groups
145-
else:
146-
return None
147-
148150
def _set_fit(self, fit_out, pre_pro_out):
149151
"""
150152
Sets the fit.
@@ -221,7 +223,7 @@ def decision_function(self, X):
221223
def _more_tags(self):
222224
return {'requires_y': True}
223225

224-
def get_pen_val_max(self, X, y):
226+
def get_pen_val_max(self, X, y, sample_weight=None):
225227
"""
226228
Returns the largest reasonable penalty parameter for the processed data.
227229
@@ -233,13 +235,20 @@ def get_pen_val_max(self, X, y):
233235
y: array-like, shape (n_samples, )
234236
The training response data.
235237
238+
sample_weight: None or array-like, shape (n_samples,)
239+
Individual weights for each sample.
240+
236241
Output
237242
------
238243
pen_val_max: float
239244
Largest reasonable tuning parameter value.
240245
"""
241-
X_pro, y_pro, _ = self.preprocess(X, y, copy=True)
242-
return self._get_pen_val_max_from_pro(X_pro, y_pro)
246+
X_pro, y_pro, _ = self.preprocess(X, y,
247+
sample_weight=sample_weight,
248+
copy=True)
249+
250+
return self._get_pen_val_max_from_pro(X_pro, y_pro,
251+
sample_weight=sample_weight)
243252

244253
def _get_penalty_kind(self):
245254
"""
@@ -295,13 +304,19 @@ def transform(x):
295304

296305
return transform
297306

298-
def _process_y(self, y, copy=True):
307+
def _process_y(self, y, sample_weight=None, copy=True):
299308
"""
300309
Parameters
301310
----------
302311
y: array-like, shape (n_samples, ) or (n_samples, n_responses)
303312
The response data.
304313
314+
sample_weight: None or array-like, shape (n_samples,)
315+
Individual weights for each sample.
316+
317+
copy: bool
318+
Whether or not to copy the X/y arrays or modify them in place.
319+
305320
Output
306321
------
307322
y: array-like
@@ -315,7 +330,7 @@ def _get_solve_kws(self):
315330
"""
316331
raise NotImplementedError
317332

318-
def _get_pen_val_max_from_pro(self, X, y):
333+
def _get_pen_val_max_from_pro(self, X, y, sample_weight=None):
319334
"""
320335
Computes the largest reasonable tuning parameter value.
321336
"""

ya_glm/base/GlmCV.py

Lines changed: 29 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ def __init__(self,
5555
cv_pre_dispatch='2*n_jobs'):
5656
pass
5757

58-
def fit(self, X, y):
58+
def fit(self, X, y, sample_weight=None):
5959
"""
6060
Runs cross-validation then refits the GLM with the selected tuning parameter.
6161
@@ -66,19 +66,32 @@ def fit(self, X, y):
6666
6767
y: array-like, shape (n_samples, )
6868
The training response data.
69+
70+
sample_weight: None or array-like, shape (n_samples,)
71+
Individual weights for each sample.
6972
"""
7073

7174
# check the input data
7275
self._check_base_estimator(self.estimator)
7376
est = clone(self.estimator)
74-
X, y = est._validate_data(X, y)
77+
X, y, sample_weight = est._validate_data(X, y,
78+
sample_weight=sample_weight)
7579

7680
# set up the tuning parameter values using the processed data
77-
self._set_tuning_values(X=X, y=y)
81+
self._set_tuning_values(X=X, y=y, sample_weight=sample_weight)
82+
83+
# maybe add sample weight to fit params
84+
if sample_weight is not None:
85+
fit_params = {'sample_weight': sample_weight}
86+
else:
87+
fit_params = None
7888

7989
# run cross-validation on the raw data
8090
start_time = time()
81-
self.cv_results_ = self._run_cv(estimator=est, X=X, y=y, cv=self.cv)
91+
self.cv_results_ = \
92+
self._run_cv(estimator=est, X=X, y=y, cv=self.cv,
93+
fit_params=fit_params)
94+
8295
self.cv_data_ = {'cv_runtime': time() - start_time}
8396

8497
# select best tuning parameter values
@@ -90,7 +103,7 @@ def fit(self, X, y):
90103

91104
# refit on the raw data
92105
start_time = time()
93-
self.best_estimator_ = est.fit(X, y)
106+
self.best_estimator_ = est.fit(X, y, sample_weight=sample_weight)
94107
self.cv_data_['refit_runtime'] = time() - start_time
95108

96109
return self
@@ -127,7 +140,7 @@ def check_base_estimator(self, estimator):
127140
"""
128141
raise NotImplementedError
129142

130-
def _set_tuning_values(self, X, y):
143+
def _set_tuning_values(self, X, y, **kws):
131144
"""
132145
Sets the tuning parameter sequence from the transformed data.
133146
@@ -138,6 +151,9 @@ def _set_tuning_values(self, X, y):
138151
139152
y: array-like, shape (n_samples, )
140153
The processed training response data.
154+
155+
**kws:
156+
Additional keyword arguments.
141157
"""
142158
# subclass should overwrite
143159
raise NotImplementedError
@@ -182,11 +198,13 @@ def __init__(self,
182198
):
183199
pass
184200

185-
def _set_tuning_values(self, X, y):
201+
def _set_tuning_values(self, X, y, sample_weight=None):
186202
if self.pen_vals is None:
187-
pen_val_max = self.estimator.get_pen_val_max(X, y)
203+
pen_val_max = self.estimator.\
204+
get_pen_val_max(X=X, y=y, sample_weight=sample_weight)
188205
else:
189206
pen_val_max = None
207+
190208
self._set_tune_from_pen_max(pen_val_max=pen_val_max)
191209

192210
def _set_tune_from_pen_max(self, pen_val_max=None):
@@ -283,9 +301,10 @@ def _tune_pen_val(self):
283301
else:
284302
return True
285303

286-
def _set_tuning_values(self, X, y):
304+
def _set_tuning_values(self, X, y, sample_weight=None):
287305
if self.pen_vals is None:
288-
enet_pen_max = self.estimator.get_pen_val_max(X, y)
306+
enet_pen_max = self.estimator.\
307+
get_pen_val_max(X, y, sample_weight=sample_weight)
289308
lasso_pen_max = enet_pen_max * self.estimator.l1_ratio
290309
else:
291310
lasso_pen_max = None

0 commit comments

Comments
 (0)