Make max_features work

2025-07-11 08:12:06 +00:00 · 2020-06-30 17:13:07 +02:00 · 2020-06-30 17:13:07 +02:00 · 19543b48fa
commit 19543b48fa
parent 98a28cd271
2 changed files with 53 additions and 28 deletions
--- a/odte/Odte.py
+++ b/odte/Odte.py
@ -53,8 +53,7 @@ class Odte(BaseEnsemble, ClassifierMixin):
    def _initialize_random(self) -> np.random.mtrand.RandomState:
        if self.random_state is None:
            return np.random.mtrand._rand
-        else:
-            return np.random.RandomState(self.random_state)
+        return np.random.RandomState(self.random_state)

    @staticmethod
    def _initialize_sample_weight(
@ -62,8 +61,7 @@ class Odte(BaseEnsemble, ClassifierMixin):
    ) -> np.array:
        if sample_weight is None:
            return np.ones((n_samples,), dtype=np.float64)
-        else:
-            return sample_weight.copy()
+        return sample_weight.copy()

    def _validate_estimator(self):
        """Check the estimator and set the base_estimator_ attribute."""
@ -77,10 +75,9 @@ class Odte(BaseEnsemble, ClassifierMixin):
        # Check parameters are Ok.
        if self.n_estimators < 3:
            raise ValueError(
-                f"n_estimators must be greater than 3... got (n_estimators=\
-                    {self.n_estimators:f})"
+                f"n_estimators must be greater than 2 but got (n_estimators=\
+                    {self.n_estimators})"
            )
-        # the rest of parameters are checked in estimator
        check_classification_targets(y)
        X, y = check_X_y(X, y)
        sample_weight = _check_sample_weight(
@ -90,12 +87,13 @@ class Odte(BaseEnsemble, ClassifierMixin):
        # Initialize computed parameters
        #  Build the estimator
        self.n_features_in_ = X.shape[1]
-        self.n_features = X.shape[1]
+        self.n_features_ = X.shape[1]
        self.max_features_ = self._initialize_max_features()
        self._validate_estimator()
        self.classes_, y = np.unique(y, return_inverse=True)
        self.n_classes_ = self.classes_.shape[0]
        self.estimators_ = []
+        self.subspaces_ = []
        self._train(X, y, sample_weight)
        return self

@ -109,16 +107,19 @@ class Odte(BaseEnsemble, ClassifierMixin):
        for _ in range(self.n_estimators):
            # Build clf
            clf = clone(self.base_estimator_)
-            # clf.set_params(**self.estimator_params)
            self.estimators_.append(clf)
            # bootstrap
            indices = random_box.randint(0, n_samples, boot_samples)
            # update weights with the chosen samples
            weights_update = np.bincount(indices, minlength=n_samples)
-            features = self.get_subspace(X, y)
+            features = self._get_random_subspace(X, y)
+            self.subspaces_.append(features)
            current_weights = weights * weights_update
            # train the classifier
-            clf.fit(X[indices, features], y[indices], current_weights[indices])
+            bootstrap = X[indices, :]
+            clf.fit(
+                bootstrap[:, features], y[indices], current_weights[indices]
+            )

    def _get_bootstrap_n_samples(self, n_samples) -> int:
        if self.max_samples is None:
@ -157,7 +158,7 @@ class Odte(BaseEnsemble, ClassifierMixin):
        elif self.max_features is None:
            max_features = self.n_features_
        elif isinstance(self.max_features, int):
-            max_features = self.max_features
+            max_features = abs(self.max_features)
        else:  # float
            if self.max_features > 0.0:
                max_features = max(
@ -171,7 +172,7 @@ class Odte(BaseEnsemble, ClassifierMixin):
                )
        return max_features

-    def _get_subspaces_set(
+    def _get_random_subspace(
        self, dataset: np.array, labels: np.array
    ) -> np.array:
        features = range(dataset.shape[1])
@ -182,12 +183,6 @@ class Odte(BaseEnsemble, ClassifierMixin):
        else:
            return features_sets[0]

-    def get_subspace(self, dataset: np.array, labels: np.array) -> list:
-        """Return the best subspace to build a tree
-        """
-        indices = self._get_subspaces_set(dataset, labels)
-        return dataset[:, indices], indices
-
    def predict(self, X: np.array) -> np.array:
        proba = self.predict_proba(X)
        return self.classes_.take((np.argmax(proba, axis=1)), axis=0)
@ -196,10 +191,15 @@ class Odte(BaseEnsemble, ClassifierMixin):
        check_is_fitted(self, ["estimators_"])
        # Input validation
        X = check_array(X)
-        for tree in self.estimators_:
+        if self.n_features_ != X.shape[1]:
+            raise ValueError("Number of features of the model must "
+                             "match the input. Model n_features is {0} and "
+                             "input n_features is {1}."
+                             "".format(self.n_features_, X.shape[1]))
+        for tree, features in zip(self.estimators_, self.subspaces_):
            n_samples = X.shape[0]
            result = np.zeros((n_samples, self.n_classes_))
-            predictions = tree.predict(X)
+            predictions = tree.predict(X[:, features])
            for i in range(n_samples):
                result[i, predictions[i]] += 1
        return result
@ -207,8 +207,6 @@ class Odte(BaseEnsemble, ClassifierMixin):
    def score(
        self, X: np.array, y: np.array, sample_weight: np.array = None
    ) -> float:
-        # todo
-        check_is_fitted(self, ["estimators_"])
        check_classification_targets(y)
        X, y = check_X_y(X, y)
        y_pred = self.predict(X).reshape(y.shape)
--- a/odte/tests/Odte_tests.py
+++ b/odte/tests/Odte_tests.py
@ -37,6 +37,28 @@ class Odte_test(unittest.TestCase):
            computed = tclf._initialize_sample_weight(value, m)
            self.assertListEqual(expected.tolist(), computed.tolist())

+    def test_initialize_max_feature(self):
+        expected_values = [
+            [0, 4, 10, 11],
+            [0, 2, 3, 5, 14, 15],
+            [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
+            [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
+            [0, 4, 10, 11],
+            [0, 4, 10, 11],
+            [0, 4, 10, 11],
+        ]
+        X, y = load_dataset(
+            random_state=self._random_state, n_features=16, n_samples=10
+        )
+        for max_features in [4, 0.4, 1.0, None, "auto", "sqrt", "log2"]:
+            tclf = Odte(
+                random_state=self._random_state, max_features=max_features
+            )
+            tclf.fit(X, y)
+            computed = tclf._get_random_subspace(X, y)
+            expected = expected_values.pop(0)
+            self.assertListEqual(expected, list(computed))
+
    def test_initialize_random(self):
        expected = [37, 235, 908]
        tclf = Odte(random_state=self._random_state)
@ -51,6 +73,13 @@ class Odte_test(unittest.TestCase):
            self.assertGreaterEqual(value, 101)
            self.assertLessEqual(value, 1000)

+    def test_bogus_max_features(self):
+        values = ["duck", -0.1, 0.0]
+        for max_features in values:
+            with self.assertRaises(ValueError):
+                tclf = Odte(max_features=max_features)
+                tclf.fit(*load_dataset(self._random_state))
+
    def test_bogus_n_estimator(self):
        values = [0, -1, 2]
        for n_estimators in values:
@ -79,9 +108,7 @@ class Odte_test(unittest.TestCase):
        X, y = load_dataset(self._random_state)
        expected = y
        tclf = Odte(
-            random_state=self._random_state,
-            max_features=None,
-            max_samples=0.1,
+            random_state=self._random_state, max_features=1.0, max_samples=0.1,
        )
        tclf.set_params(**dict(base_estimator__kernel="linear",))
        computed = tclf.fit(X, y).predict(X)
@ -101,8 +128,8 @@ class Odte_test(unittest.TestCase):
    def test_score_splitter_max_features(self):
        X, y = load_dataset(self._random_state, n_features=12, n_samples=150)
        results = [
-            0.9866666666666667,
-            0.9866666666666667,
+            0.6466666666666666,
+            0.6466666666666666,
            0.9866666666666667,
            0.9866666666666667,
        ]