|
| 1 | +"""Tests for the public preprocessor APIs.""" |
| 2 | +import unittest |
| 3 | + |
| 4 | +import numpy as np |
| 5 | +import pandas as pd |
| 6 | +from sklearn.datasets import load_breast_cancer, load_diabetes |
| 7 | + |
| 8 | +from flaml import AutoML |
| 9 | + |
| 10 | + |
class TestPreprocessAPI(unittest.TestCase):
    """Test cases for the public ``preprocess()`` API methods.

    Covers three layers of the preprocessing API:

    * ``AutoML.preprocess`` — task-level transformation of raw input,
    * ``estimator.preprocess`` — estimator-level transformation,
    * the full pipeline (task-level then estimator-level) feeding the
      underlying model's ``predict``.
    """

    @staticmethod
    def _lgbm_settings(task, metric):
        """Return minimal AutoML fit settings for a fast, lgbm-only search.

        Args:
            task: FLAML task name, e.g. ``"classification"`` or ``"regression"``.
            metric: Optimization metric name for that task.

        Returns:
            dict of keyword arguments for ``AutoML.fit``.
        """
        # max_iter=5 keeps each test fast; a single estimator avoids
        # search-space variance across runs.
        return {
            "max_iter": 5,
            "task": task,
            "metric": metric,
            "estimator_list": ["lgbm"],
            "verbose": 0,
        }

    def test_automl_preprocess_before_fit(self):
        """preprocess() before fit() raises AttributeError mentioning fit()."""
        automl = AutoML()
        X_test = np.array([[1, 2, 3], [4, 5, 6]])

        with self.assertRaises(AttributeError) as context:
            automl.preprocess(X_test)
        # The error message should direct the user to call fit() first.
        self.assertIn("fit()", str(context.exception))

    def test_automl_preprocess_classification(self):
        """Task-level preprocessing works for a classification task."""
        X, y = load_breast_cancer(return_X_y=True)
        X_train, y_train = X[:400], y[:400]
        X_test = X[400:450]

        automl = AutoML()
        automl.fit(X_train, y_train, **self._lgbm_settings("classification", "accuracy"))

        X_preprocessed = automl.preprocess(X_test)

        # Output must exist and preserve the number of rows.
        self.assertIsNotNone(X_preprocessed)
        self.assertEqual(X_preprocessed.shape[0], X_test.shape[0])

    def test_automl_preprocess_regression(self):
        """Task-level preprocessing works for a regression task."""
        X, y = load_diabetes(return_X_y=True)
        X_train, y_train = X[:300], y[:300]
        X_test = X[300:350]

        automl = AutoML()
        automl.fit(X_train, y_train, **self._lgbm_settings("regression", "r2"))

        X_preprocessed = automl.preprocess(X_test)

        # Output must exist and preserve the number of rows.
        self.assertIsNotNone(X_preprocessed)
        self.assertEqual(X_preprocessed.shape[0], X_test.shape[0])

    def test_automl_preprocess_with_dataframe(self):
        """Task-level preprocessing accepts a pandas DataFrame with a categorical column."""
        X_train = pd.DataFrame(
            {
                "feature1": [1, 2, 3, 4, 5] * 20,
                "feature2": [5, 4, 3, 2, 1] * 20,
                "category": ["a", "b", "a", "b", "a"] * 20,
            }
        )
        y_train = pd.Series([0, 1, 0, 1, 0] * 20)

        X_test = pd.DataFrame(
            {
                "feature1": [6, 7, 8],
                "feature2": [1, 2, 3],
                "category": ["a", "b", "a"],
            }
        )

        automl = AutoML()
        automl.fit(X_train, y_train, **self._lgbm_settings("classification", "accuracy"))

        X_preprocessed = automl.preprocess(X_test)

        # The preprocessed result may be a DataFrame or an ndarray, so
        # derive the row count through whichever protocol it supports.
        self.assertIsNotNone(X_preprocessed)
        preprocessed_len = len(X_preprocessed) if hasattr(X_preprocessed, "__len__") else X_preprocessed.shape[0]
        self.assertEqual(preprocessed_len, len(X_test))

    def test_estimator_preprocess(self):
        """Estimator-level preprocessing works on task-level preprocessed data."""
        X, y = load_breast_cancer(return_X_y=True)
        X_train, y_train = X[:400], y[:400]
        X_test = X[400:450]

        automl = AutoML()
        automl.fit(X_train, y_train, **self._lgbm_settings("classification", "accuracy"))

        # A best model must exist after a successful fit.
        estimator = automl.model
        self.assertIsNotNone(estimator)

        # Task-level preprocessing first, then estimator-level — the same
        # order the internal predict path uses.
        X_task_preprocessed = automl.preprocess(X_test)
        X_estimator_preprocessed = estimator.preprocess(X_task_preprocessed)

        self.assertIsNotNone(X_estimator_preprocessed)
        self.assertEqual(X_estimator_preprocessed.shape[0], X_test.shape[0])

    def test_preprocess_pipeline(self):
        """Manual two-stage preprocessing matches the built-in predict path."""
        X, y = load_breast_cancer(return_X_y=True)
        X_train, y_train = X[:400], y[:400]
        X_test = X[400:450]

        automl = AutoML()
        automl.fit(X_train, y_train, **self._lgbm_settings("classification", "accuracy"))

        # Apply the complete preprocessing pipeline by hand.
        X_task_preprocessed = automl.preprocess(X_test)
        X_final = automl.model.preprocess(X_task_preprocessed)

        # automl.predict() performs this preprocessing internally; feeding
        # our manually preprocessed data to the raw underlying model must
        # therefore produce identical predictions.
        y_pred_manual = automl.model._model.predict(X_final)
        y_pred_auto = automl.predict(X_test)

        np.testing.assert_array_equal(y_pred_manual, y_pred_auto)

    def test_preprocess_with_mixed_types(self):
        """Preprocessing handles numeric, categorical, and boolean columns."""
        # Seeded generator so the test data — and thus the test outcome —
        # is reproducible across runs (unseeded np.random made this flaky).
        rng = np.random.default_rng(42)

        X_train = pd.DataFrame(
            {
                "numeric1": rng.random(100),
                "numeric2": rng.integers(0, 100, 100),
                "categorical": rng.choice(["cat", "dog", "bird"], 100),
                "boolean": rng.choice([True, False], 100),
            }
        )
        y_train = pd.Series(rng.integers(0, 2, 100))

        X_test = pd.DataFrame(
            {
                "numeric1": rng.random(10),
                "numeric2": rng.integers(0, 100, 10),
                "categorical": rng.choice(["cat", "dog", "bird"], 10),
                "boolean": rng.choice([True, False], 10),
            }
        )

        automl = AutoML()
        automl.fit(X_train, y_train, **self._lgbm_settings("classification", "accuracy"))

        X_preprocessed = automl.preprocess(X_test)

        self.assertIsNotNone(X_preprocessed)

    def test_estimator_preprocess_without_automl(self):
        """estimator.preprocess() is usable on a standalone estimator."""
        from flaml.automl.model import LGBMEstimator

        # Seeded generator for reproducible fixtures (was unseeded).
        rng = np.random.default_rng(0)
        X_train = rng.random((100, 5))
        y_train = rng.integers(0, 2, 100)

        estimator = LGBMEstimator(task="classification")
        estimator.fit(X_train, y_train)

        X_test = rng.random((10, 5))
        X_preprocessed = estimator.preprocess(X_test)

        # For purely numeric ndarray input the shape should be unchanged.
        self.assertIsNotNone(X_preprocessed)
        self.assertEqual(X_preprocessed.shape, X_test.shape)
| 233 | + |
| 234 | + |
# Allow running this test module directly with `python <file>`.
if __name__ == "__main__":
    unittest.main()