Compare commits


3 Commits

ddc0fe15b8  Fix mistake in computing multiclass node belief
    Set default criterion for split to entropy instead of gini
    Set default max_iter to 1e5 instead of 1e3
    Change up-down criterion to match SVC multiclass
    Fix impurity method of splitting nodes
    Update Jupyter notebooks
    2020-11-01 17:37:17 +01:00

c593b55bec  Complete implementation of splitter_type = impurity with tests
    Remove max_distance & min_distance splitter types
    2020-10-17 16:56:15 +02:00

044918f834  #15 First approach
    Create impurity function in Stree (consistent name, same criteria as other splitter parameter)
    Create test for the new function
    Update init test
    Update test splitter parameters
    Rename old impurity function to partition_impurity
    2020-10-15 17:51:20 +02:00
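
The first commit above changes two estimator defaults. A minimal sketch of what that implies when constructing the classifier, assuming the parameter names criterion and max_iter taken from the commit messages (hypothetical usage, not part of this diff):

from stree import Stree

# Assumption based on the commit messages, not on code shown in this compare:
# criterion now defaults to "entropy" and max_iter to 1e5, so these two
# instances should be configured identically after the change.
clf_default = Stree(random_state=0)
clf_explicit = Stree(random_state=0, criterion="entropy", max_iter=int(1e5))
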
3 changed files with 22 additions and 42 deletions

View File

@@ -10,8 +10,8 @@ import os
import numbers
import random
import warnings
from math import log, factorial
from typing import Optional
from math import log
from itertools import combinations
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.svm import SVC, LinearSVC
@@ -253,32 +253,19 @@ class Splitter:
        selected = feature_set
        return selected if selected is not None else feature_set

    @staticmethod
    def _generate_spaces(features: int, max_features: int) -> list:
        comb = set()
        # Generate at most 5 combinations
        if max_features == features:
            set_length = 1
        else:
            number = factorial(features) / (
                factorial(max_features) * factorial(features - max_features)
            )
            set_length = min(5, number)
        while len(comb) < set_length:
            comb.add(
                tuple(sorted(random.sample(range(features), max_features)))
            )
        return list(comb)

    def _get_subspaces_set(
        self, dataset: np.array, labels: np.array, max_features: int
    ) -> np.array:
        features_sets = self._generate_spaces(dataset.shape[1], max_features)
        features = range(dataset.shape[1])
        features_sets = list(combinations(features, max_features))
        if len(features_sets) > 1:
            if self._splitter_type == "random":
                index = random.randint(0, len(features_sets) - 1)
                return features_sets[index]
            else:
                # get only 3 sets at most
                if len(features_sets) > 3:
                    features_sets = random.sample(features_sets, 3)
                return self._select_best_set(dataset, labels, features_sets)
        else:
            return features_sets[0]
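
The two approaches shown in this hunk differ in how many candidate subspaces they materialise: _generate_spaces builds at most five random combinations, while list(combinations(...)) enumerates all C(n, k) subsets before sampling at most three of them. The factorial expression computes that same C(n, k) = n! / (k! * (n - k)!). A quick illustrative check, not repository code:

from itertools import combinations
from math import comb  # Python 3.8+

# C(n, k) grows quickly with the number of features; 250 features appears in
# the removed test below, 13 is the number of features in the wine dataset.
for n, k in [(4, 2), (13, 3), (250, 2)]:
    print(n, k, comb(n, k), len(list(combinations(range(n), k))))
# 4 2 6 6
# 13 3 286 286
# 250 2 31125 31125
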
@@ -297,8 +284,9 @@ class Splitter:
        :type data: np.array (m, n_classes)
        :param y: vector of labels (classes)
        :type y: np.array (m,)
        :return: column of dataset to be taken into account to split dataset
        :rtype: int
        :return: vector with the class assigned to each sample (values can be
            0, 1, ...); -1 if none produces information gain
        :rtype: np.array shape (m,)
        """
        max_gain = 0
        selected = -1
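
The information gain mentioned in the updated docstring is the usual entropy-based gain. A self-contained sketch of that computation with hypothetical helper names (the repository's own criterion functions are not shown in this hunk):

import numpy as np

def entropy(y: np.ndarray) -> float:
    # Shannon entropy (base 2) of a label vector
    _, counts = np.unique(y, return_counts=True)
    p = counts / counts.sum()
    return float(-(p * np.log2(p)).sum())

def information_gain(y: np.ndarray, mask: np.ndarray) -> float:
    # Gain of partitioning y into y[mask] ("up") and y[~mask] ("down");
    # a gain of 0 means the split adds no information (cf. selected = -1).
    m = len(y)
    up, down = y[mask], y[~mask]
    weighted = 0.0
    if len(up):
        weighted += len(up) / m * entropy(up)
    if len(down):
        weighted += len(down) / m * entropy(down)
    return entropy(y) - weighted
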
@@ -319,8 +307,8 @@ class Splitter:
        :type data: np.array (m, n_classes)
        :param y: vector of labels (classes)
        :type y: np.array (m,)
        :return: column of dataset to be taken into account to split dataset
        :rtype: int
        :return: vector with distances to hyperplane (can be positive or neg.)
        :rtype: np.array shape (m,)
        """
        # select the class with max number of samples
        _, samples = np.unique(y, return_counts=True)
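
Tiny illustration of the first step of the max_samples criterion shown above, with made-up values:

import numpy as np

y = np.array([0, 1, 1, 2, 1, 0])
classes, samples = np.unique(y, return_counts=True)
majority = classes[np.argmax(samples)]  # -> 1, the class with most samples
# Given data of shape (m, n_classes), the returned vector would plausibly be
# that class's column of distances, e.g. data[:, majority] (assumption based
# on the docstring above, not on code shown in this hunk).
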
@@ -501,7 +489,7 @@ class Stree(BaseEstimator, ClassifierMixin):
        sample_weight: np.ndarray,
        depth: int,
        title: str,
    ) -> Optional[Snode]:
    ) -> Snode:
        """Recursive function to split the original dataset into predictor
        nodes (leaves)

View File

@@ -166,14 +166,6 @@ class Splitter_test(unittest.TestCase):
        self.assertEqual((6,), computed_data.shape)
        self.assertListEqual(expected.tolist(), computed_data.tolist())

    def test_generate_subspaces(self):
        features = 250
        for max_features in range(2, features):
            num = len(Splitter._generate_spaces(features, max_features))
            self.assertEqual(5, num)
        self.assertEqual(3, len(Splitter._generate_spaces(3, 2)))
        self.assertEqual(4, len(Splitter._generate_spaces(4, 3)))

    def test_best_splitter_few_sets(self):
        X, y = load_iris(return_X_y=True)
        X = np.delete(X, 3, 1)
@@ -184,14 +176,14 @@ class Splitter_test(unittest.TestCase):
    def test_splitter_parameter(self):
        expected_values = [
            [1, 4, 9, 12],  # best entropy max_samples
            [1, 3, 6, 10],  # best entropy impurity
            [6, 8, 10, 12],  # best gini max_samples
            [7, 8, 10, 11],  # best gini impurity
            [0, 3, 8, 12],  # random entropy max_samples
            [0, 3, 9, 11],  # random entropy impurity
            [0, 4, 7, 12],  # random gini max_samples
            [0, 2, 5, 6],  # random gini impurity
            [0, 1, 7, 9],  # best entropy max_samples
            [3, 8, 10, 11],  # best entropy impurity
            [0, 2, 8, 12],  # best gini max_samples
            [1, 2, 5, 12],  # best gini impurity
            [1, 2, 5, 10],  # random entropy max_samples
            [4, 8, 9, 12],  # random entropy impurity
            [3, 9, 11, 12],  # random gini max_samples
            [1, 5, 6, 9],  # random gini impurity
        ]
        X, y = load_wine(return_X_y=True)
        rn = 0
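
Each set of eight expected rows covers every combination of the settings named in the row comments: two splitter types, two impurity criteria, and two split criteria. A sketch of that grid, with the parameter names inferred from the row comments rather than from code shown here:

from itertools import product

# 2 x 2 x 2 = 8 combinations, in the same order as the rows above.
for splitter, criterion, criteria in product(
    ("best", "random"), ("entropy", "gini"), ("max_samples", "impurity")
):
    print(splitter, criterion, criteria)
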

View File

@@ -313,7 +313,7 @@ class Stree_test(unittest.TestCase):
        X, y = load_dataset(self._random_state)
        clf = Stree(random_state=self._random_state, max_features=2)
        clf.fit(X, y)
        self.assertAlmostEqual(0.9246666666666666, clf.score(X, y))
        self.assertAlmostEqual(0.944, clf.score(X, y))

    def test_bogus_splitter_parameter(self):
        clf = Stree(splitter="duck")