Review summary. Text placed before the first "diff --git" header is skipped
by patch tools, so this preamble does not change what the patch applies.

The patch had been collapsed onto a few physical lines; one line per diff
line has been restored so that every hunk matches its "@@ -a,b +c,d @@"
counts. Leading indentation inside the hunks was re-derived from the Python
block structure and should be confirmed against the base blobs
(989f6c5 for odte/Odte.py, 6ee3f74 for odte/tests/Odte_tests.py).

odte/Odte.py
  * _initialize_random / _initialize_sample_weight: drop the redundant
    "else:" after an early return.
  * fit: reword the n_estimators error message and drop the ":f" format
    spec; store n_features_ instead of n_features; reset subspaces_.
  * _train: record the feature subset chosen for each estimator in
    subspaces_; select bootstrap rows first, then the feature columns.
  * _initialize_max_features: an int max_features goes through abs().
  * _get_subspaces_set is renamed _get_random_subspace; get_subspace is
    removed.
  * predict_proba: raise ValueError when X.shape[1] differs from
    n_features_; each tree predicts on X[:, features] for its own subspace.
  * score: the direct check_is_fitted call and the "# todo" are removed.

odte/tests/Odte_tests.py
  * add test_initialize_max_feature and test_bogus_max_features.
  * the Odte(...) call in the hunk at -79 uses max_features=1.0 instead of
    None.
  * the first two expected values in test_score_splitter_max_features
    change from 0.9866666666666667 to 0.6466666666666666.

NOTE(review): in predict_proba the context lines appear to re-create
"result" inside the per-tree loop, so only the last estimator's votes would
be returned. Confirm against the full file.
NOTE(review): the n_estimators message uses a backslash continuation inside
the f-string, so the continuation line's leading spaces become part of the
message text.

diff --git a/odte/Odte.py b/odte/Odte.py
index 989f6c5..02b6fb7 100644
--- a/odte/Odte.py
+++ b/odte/Odte.py
@@ -53,8 +53,7 @@ class Odte(BaseEnsemble, ClassifierMixin):
     def _initialize_random(self) -> np.random.mtrand.RandomState:
         if self.random_state is None:
             return np.random.mtrand._rand
-        else:
-            return np.random.RandomState(self.random_state)
+        return np.random.RandomState(self.random_state)
 
     @staticmethod
     def _initialize_sample_weight(
@@ -62,8 +61,7 @@ class Odte(BaseEnsemble, ClassifierMixin):
     ) -> np.array:
         if sample_weight is None:
             return np.ones((n_samples,), dtype=np.float64)
-        else:
-            return sample_weight.copy()
+        return sample_weight.copy()
 
     def _validate_estimator(self):
         """Check the estimator and set the base_estimator_ attribute."""
@@ -77,10 +75,9 @@ class Odte(BaseEnsemble, ClassifierMixin):
         # Check parameters are Ok.
         if self.n_estimators < 3:
             raise ValueError(
-                f"n_estimators must be greater than 3... got (n_estimators=\
-                    {self.n_estimators:f})"
+                f"n_estimators must be greater than 2 but got (n_estimators=\
+                    {self.n_estimators})"
             )
-        # the rest of parameters are checked in estimator
         check_classification_targets(y)
         X, y = check_X_y(X, y)
         sample_weight = _check_sample_weight(
@@ -90,12 +87,13 @@ class Odte(BaseEnsemble, ClassifierMixin):
         # Initialize computed parameters
         # Build the estimator
         self.n_features_in_ = X.shape[1]
-        self.n_features = X.shape[1]
+        self.n_features_ = X.shape[1]
         self.max_features_ = self._initialize_max_features()
         self._validate_estimator()
         self.classes_, y = np.unique(y, return_inverse=True)
         self.n_classes_ = self.classes_.shape[0]
         self.estimators_ = []
+        self.subspaces_ = []
         self._train(X, y, sample_weight)
         return self
 
@@ -109,16 +107,19 @@ class Odte(BaseEnsemble, ClassifierMixin):
         for _ in range(self.n_estimators):
             # Build clf
             clf = clone(self.base_estimator_)
-            # clf.set_params(**self.estimator_params)
             self.estimators_.append(clf)
             # bootstrap
             indices = random_box.randint(0, n_samples, boot_samples)
             # update weights with the chosen samples
             weights_update = np.bincount(indices, minlength=n_samples)
-            features = self.get_subspace(X, y)
+            features = self._get_random_subspace(X, y)
+            self.subspaces_.append(features)
             current_weights = weights * weights_update
             # train the classifier
-            clf.fit(X[indices, features], y[indices], current_weights[indices])
+            bootstrap = X[indices, :]
+            clf.fit(
+                bootstrap[:, features], y[indices], current_weights[indices]
+            )
 
     def _get_bootstrap_n_samples(self, n_samples) -> int:
         if self.max_samples is None:
@@ -157,7 +158,7 @@ class Odte(BaseEnsemble, ClassifierMixin):
         elif self.max_features is None:
             max_features = self.n_features_
         elif isinstance(self.max_features, int):
-            max_features = self.max_features
+            max_features = abs(self.max_features)
         else:  # float
             if self.max_features > 0.0:
                 max_features = max(
@@ -171,7 +172,7 @@ class Odte(BaseEnsemble, ClassifierMixin):
                 )
         return max_features
 
-    def _get_subspaces_set(
+    def _get_random_subspace(
         self, dataset: np.array, labels: np.array
     ) -> np.array:
         features = range(dataset.shape[1])
@@ -182,12 +183,6 @@ class Odte(BaseEnsemble, ClassifierMixin):
         else:
             return features_sets[0]
 
-    def get_subspace(self, dataset: np.array, labels: np.array) -> list:
-        """Return the best subspace to build a tree
-        """
-        indices = self._get_subspaces_set(dataset, labels)
-        return dataset[:, indices], indices
-
     def predict(self, X: np.array) -> np.array:
         proba = self.predict_proba(X)
         return self.classes_.take((np.argmax(proba, axis=1)), axis=0)
@@ -196,10 +191,15 @@ class Odte(BaseEnsemble, ClassifierMixin):
         check_is_fitted(self, ["estimators_"])
         # Input validation
         X = check_array(X)
-        for tree in self.estimators_:
+        if self.n_features_ != X.shape[1]:
+            raise ValueError("Number of features of the model must "
+                             "match the input. Model n_features is {0} and "
+                             "input n_features is {1}."
+                             "".format(self.n_features_, X.shape[1]))
+        for tree, features in zip(self.estimators_, self.subspaces_):
             n_samples = X.shape[0]
             result = np.zeros((n_samples, self.n_classes_))
-            predictions = tree.predict(X)
+            predictions = tree.predict(X[:, features])
             for i in range(n_samples):
                 result[i, predictions[i]] += 1
         return result
@@ -207,8 +207,6 @@ class Odte(BaseEnsemble, ClassifierMixin):
     def score(
         self, X: np.array, y: np.array, sample_weight: np.array = None
     ) -> float:
-        # todo
-        check_is_fitted(self, ["estimators_"])
         check_classification_targets(y)
         X, y = check_X_y(X, y)
         y_pred = self.predict(X).reshape(y.shape)
diff --git a/odte/tests/Odte_tests.py b/odte/tests/Odte_tests.py
index 6ee3f74..210a457 100644
--- a/odte/tests/Odte_tests.py
+++ b/odte/tests/Odte_tests.py
@@ -37,6 +37,28 @@ class Odte_test(unittest.TestCase):
             computed = tclf._initialize_sample_weight(value, m)
             self.assertListEqual(expected.tolist(), computed.tolist())
 
+    def test_initialize_max_feature(self):
+        expected_values = [
+            [0, 4, 10, 11],
+            [0, 2, 3, 5, 14, 15],
+            [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
+            [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
+            [0, 4, 10, 11],
+            [0, 4, 10, 11],
+            [0, 4, 10, 11],
+        ]
+        X, y = load_dataset(
+            random_state=self._random_state, n_features=16, n_samples=10
+        )
+        for max_features in [4, 0.4, 1.0, None, "auto", "sqrt", "log2"]:
+            tclf = Odte(
+                random_state=self._random_state, max_features=max_features
+            )
+            tclf.fit(X, y)
+            computed = tclf._get_random_subspace(X, y)
+            expected = expected_values.pop(0)
+            self.assertListEqual(expected, list(computed))
+
     def test_initialize_random(self):
         expected = [37, 235, 908]
         tclf = Odte(random_state=self._random_state)
@@ -51,6 +73,13 @@ class Odte_test(unittest.TestCase):
             self.assertGreaterEqual(value, 101)
             self.assertLessEqual(value, 1000)
 
+    def test_bogus_max_features(self):
+        values = ["duck", -0.1, 0.0]
+        for max_features in values:
+            with self.assertRaises(ValueError):
+                tclf = Odte(max_features=max_features)
+                tclf.fit(*load_dataset(self._random_state))
+
     def test_bogus_n_estimator(self):
         values = [0, -1, 2]
         for n_estimators in values:
@@ -79,9 +108,7 @@ class Odte_test(unittest.TestCase):
         X, y = load_dataset(self._random_state)
         expected = y
         tclf = Odte(
-            random_state=self._random_state,
-            max_features=None,
-            max_samples=0.1,
+            random_state=self._random_state, max_features=1.0, max_samples=0.1,
         )
         tclf.set_params(**dict(base_estimator__kernel="linear",))
         computed = tclf.fit(X, y).predict(X)
@@ -101,8 +128,8 @@ class Odte_test(unittest.TestCase):
     def test_score_splitter_max_features(self):
         X, y = load_dataset(self._random_state, n_features=12, n_samples=150)
         results = [
-            0.9866666666666667,
-            0.9866666666666667,
+            0.6466666666666666,
+            0.6466666666666666,
             0.9866666666666667,
             0.9866666666666667,
         ]