add max_features working

This commit is contained in:
Ricardo Montañana Gómez 2020-06-30 17:13:07 +02:00
parent 98a28cd271
commit 19543b48fa
Signed by: rmontanana
GPG Key ID: 46064262FD9A7ADE
2 changed files with 53 additions and 28 deletions

View File

@ -53,8 +53,7 @@ class Odte(BaseEnsemble, ClassifierMixin):
def _initialize_random(self) -> np.random.mtrand.RandomState:
if self.random_state is None:
return np.random.mtrand._rand
else:
return np.random.RandomState(self.random_state)
return np.random.RandomState(self.random_state)
@staticmethod
def _initialize_sample_weight(
@ -62,8 +61,7 @@ class Odte(BaseEnsemble, ClassifierMixin):
) -> np.array:
if sample_weight is None:
return np.ones((n_samples,), dtype=np.float64)
else:
return sample_weight.copy()
return sample_weight.copy()
def _validate_estimator(self):
"""Check the estimator and set the base_estimator_ attribute."""
@ -77,10 +75,9 @@ class Odte(BaseEnsemble, ClassifierMixin):
# Check parameters are Ok.
if self.n_estimators < 3:
raise ValueError(
f"n_estimators must be greater than 3... got (n_estimators=\
{self.n_estimators:f})"
f"n_estimators must be greater than 2 but got (n_estimators=\
{self.n_estimators})"
)
# the rest of parameters are checked in estimator
check_classification_targets(y)
X, y = check_X_y(X, y)
sample_weight = _check_sample_weight(
@ -90,12 +87,13 @@ class Odte(BaseEnsemble, ClassifierMixin):
# Initialize computed parameters
# Build the estimator
self.n_features_in_ = X.shape[1]
self.n_features = X.shape[1]
self.n_features_ = X.shape[1]
self.max_features_ = self._initialize_max_features()
self._validate_estimator()
self.classes_, y = np.unique(y, return_inverse=True)
self.n_classes_ = self.classes_.shape[0]
self.estimators_ = []
self.subspaces_ = []
self._train(X, y, sample_weight)
return self
@ -109,16 +107,19 @@ class Odte(BaseEnsemble, ClassifierMixin):
for _ in range(self.n_estimators):
# Build clf
clf = clone(self.base_estimator_)
# clf.set_params(**self.estimator_params)
self.estimators_.append(clf)
# bootstrap
indices = random_box.randint(0, n_samples, boot_samples)
# update weights with the chosen samples
weights_update = np.bincount(indices, minlength=n_samples)
features = self.get_subspace(X, y)
features = self._get_random_subspace(X, y)
self.subspaces_.append(features)
current_weights = weights * weights_update
# train the classifier
clf.fit(X[indices, features], y[indices], current_weights[indices])
bootstrap = X[indices, :]
clf.fit(
bootstrap[:, features], y[indices], current_weights[indices]
)
def _get_bootstrap_n_samples(self, n_samples) -> int:
if self.max_samples is None:
@ -157,7 +158,7 @@ class Odte(BaseEnsemble, ClassifierMixin):
elif self.max_features is None:
max_features = self.n_features_
elif isinstance(self.max_features, int):
max_features = self.max_features
max_features = abs(self.max_features)
else: # float
if self.max_features > 0.0:
max_features = max(
@ -171,7 +172,7 @@ class Odte(BaseEnsemble, ClassifierMixin):
)
return max_features
def _get_subspaces_set(
def _get_random_subspace(
self, dataset: np.array, labels: np.array
) -> np.array:
features = range(dataset.shape[1])
@ -182,12 +183,6 @@ class Odte(BaseEnsemble, ClassifierMixin):
else:
return features_sets[0]
def get_subspace(self, dataset: np.array, labels: np.array) -> list:
"""Return the best subspace to build a tree
"""
indices = self._get_subspaces_set(dataset, labels)
return dataset[:, indices], indices
def predict(self, X: np.array) -> np.array:
    """Return the predicted class label for every row of X.

    The per-class scores come from ``self.predict_proba(X)``; for each
    row the column with the highest score is selected and translated
    back into a label through ``self.classes_``.
    """
    # Column index of the best-scoring class for each sample.
    winners = np.argmax(self.predict_proba(X), axis=1)
    # Translate column indices into the stored class labels.
    return self.classes_.take(winners, axis=0)
@ -196,10 +191,15 @@ class Odte(BaseEnsemble, ClassifierMixin):
check_is_fitted(self, ["estimators_"])
# Input validation
X = check_array(X)
for tree in self.estimators_:
if self.n_features_ != X.shape[1]:
raise ValueError("Number of features of the model must "
"match the input. Model n_features is {0} and "
"input n_features is {1}."
"".format(self.n_features_, X.shape[1]))
for tree, features in zip(self.estimators_, self.subspaces_):
n_samples = X.shape[0]
result = np.zeros((n_samples, self.n_classes_))
predictions = tree.predict(X)
predictions = tree.predict(X[:, features])
for i in range(n_samples):
result[i, predictions[i]] += 1
return result
@ -207,8 +207,6 @@ class Odte(BaseEnsemble, ClassifierMixin):
def score(
self, X: np.array, y: np.array, sample_weight: np.array = None
) -> float:
# todo
check_is_fitted(self, ["estimators_"])
check_classification_targets(y)
X, y = check_X_y(X, y)
y_pred = self.predict(X).reshape(y.shape)

View File

@ -37,6 +37,28 @@ class Odte_test(unittest.TestCase):
computed = tclf._initialize_sample_weight(value, m)
self.assertListEqual(expected.tolist(), computed.tolist())
def test_initialize_max_feature(self):
    """Check the subspace chosen for each supported max_features value.

    For every setting an Odte is fitted on a 16-feature, 10-sample
    dataset and the feature indices returned by _get_random_subspace
    are compared with the expected list.
    """
    X, y = load_dataset(
        random_state=self._random_state, n_features=16, n_samples=10
    )
    every_feature = list(range(16))
    four_features = [0, 4, 10, 11]
    # (max_features setting, expected feature indices)
    cases = [
        (4, four_features),
        (0.4, [0, 2, 3, 5, 14, 15]),
        (1.0, every_feature),
        (None, every_feature),
        ("auto", four_features),
        ("sqrt", four_features),
        ("log2", four_features),
    ]
    for max_features, expected in cases:
        tclf = Odte(
            random_state=self._random_state, max_features=max_features
        )
        tclf.fit(X, y)
        computed = tclf._get_random_subspace(X, y)
        self.assertListEqual(expected, list(computed))
def test_initialize_random(self):
expected = [37, 235, 908]
tclf = Odte(random_state=self._random_state)
@ -51,6 +73,13 @@ class Odte_test(unittest.TestCase):
self.assertGreaterEqual(value, 101)
self.assertLessEqual(value, 1000)
def test_bogus_max_features(self):
    """Fitting with an invalid max_features value must raise ValueError."""
    for bogus in ("duck", -0.1, 0.0):
        with self.assertRaises(ValueError):
            # The estimator is built inside the context so an error raised
            # at construction or at fit time both count.
            Odte(max_features=bogus).fit(*load_dataset(self._random_state))
def test_bogus_n_estimator(self):
values = [0, -1, 2]
for n_estimators in values:
@ -79,9 +108,7 @@ class Odte_test(unittest.TestCase):
X, y = load_dataset(self._random_state)
expected = y
tclf = Odte(
random_state=self._random_state,
max_features=None,
max_samples=0.1,
random_state=self._random_state, max_features=1.0, max_samples=0.1,
)
tclf.set_params(**dict(base_estimator__kernel="linear",))
computed = tclf.fit(X, y).predict(X)
@ -101,8 +128,8 @@ class Odte_test(unittest.TestCase):
def test_score_splitter_max_features(self):
X, y = load_dataset(self._random_state, n_features=12, n_samples=150)
results = [
0.9866666666666667,
0.9866666666666667,
0.6466666666666666,
0.6466666666666666,
0.9866666666666667,
0.9866666666666667,
]