New predict proba (#53)

* Add complete classes counts to node and tests

* Implement optimized predict and new predict_proba

* Add predict_proba test

* Add python 3.10 to CI
Authored by Ricardo Montañana Gómez on 2022-10-21 12:26:46 +02:00; committed by GitHub
parent 93be8a89a8
commit 2f6ae648a1
5 changed files with 120 additions and 79 deletions
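Note: a minimal usage sketch of the predict_proba API introduced here, assuming the Stree estimator is importable from the stree package; the make_classification data and the default hyperparameters are illustrative, not part of the commit:

import numpy as np
from sklearn.datasets import make_classification
from stree import Stree

# illustrative dataset; the commit's own tests use their load_dataset helper
X, y = make_classification(n_samples=200, n_features=4, random_state=0)
clf = Stree(random_state=0).fit(X, y)
proba = clf.predict_proba(X)  # shape (n_samples, n_classes)
# each row holds the class frequencies of the leaf the sample falls into
assert np.allclose(proba.sum(axis=1), 1.0)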


@@ -13,7 +13,7 @@ jobs:
     strategy:
       matrix:
         os: [macos-latest, ubuntu-latest, windows-latest]
-        python: [3.8]
+        python: [3.8, "3.10"]
     steps:
       - uses: actions/checkout@v2
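Note: the new interpreter version is quoted ("3.10") because YAML reads an unquoted 3.10 as the float 3.1, which would make setup-python resolve the wrong release.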


@@ -68,6 +68,7 @@ class Snode:
         self._impurity = impurity
         self._partition_column: int = -1
         self._scaler = scaler
+        self._proba = None

     @classmethod
     def copy(cls, node: "Snode") -> "Snode":
@@ -127,23 +128,22 @@ class Snode:
     def get_up(self) -> "Snode":
         return self._up

-    def make_predictor(self):
+    def make_predictor(self, num_classes: int) -> None:
         """Compute the class of the predictor and its belief based on the
         subdataset of the node only if it is a leaf
         """
         if not self.is_leaf():
             return
         classes, card = np.unique(self._y, return_counts=True)
-        if len(classes) > 1:
+        self._proba = np.zeros((num_classes,), dtype=np.int64)
+        for c, n in zip(classes, card):
+            self._proba[c] = n
+        try:
             max_card = max(card)
             self._class = classes[card == max_card][0]
             self._belief = max_card / np.sum(card)
-        else:
-            self._belief = 1
-            try:
-                self._class = classes[0]
-            except IndexError:
-                self._class = None
+        except ValueError:
+            self._class = None

     def graph(self):
         """
@@ -155,7 +155,7 @@ class Snode:
             output += (
                 f'N{id(self)} [shape=box style=filled label="'
                 f"class={self._class} impurity={self._impurity:.3f} "
-                f'classes={count_values[0]} samples={count_values[1]}"];\n'
+                f'counts={self._proba}"];\n'
             )
         else:
             output += (


@@ -314,7 +314,7 @@ class Stree(BaseEstimator, ClassifierMixin):
         if np.unique(y).shape[0] == 1:
             # only 1 class => pure dataset
             node.set_title(title + ", <pure>")
-            node.make_predictor()
+            node.make_predictor(self.n_classes_)
             return node
         # Train the model
         clf = self._build_clf()
@@ -333,7 +333,7 @@ class Stree(BaseEstimator, ClassifierMixin):
         if X_U is None or X_D is None:
             # didn't part anything
             node.set_title(title + ", <cgaf>")
-            node.make_predictor()
+            node.make_predictor(self.n_classes_)
             return node
         node.set_up(
             self._train(X_U, y_u, sw_u, depth + 1, title + f" - Up({depth+1})")
@@ -367,28 +367,66 @@ class Stree(BaseEstimator, ClassifierMixin):
             )
         )

-    @staticmethod
-    def _reorder_results(y: np.array, indices: np.array) -> np.array:
-        """Reorder an array based on the array of indices passed
-
-        Parameters
-        ----------
-        y : np.array
-            data untidy
-        indices : np.array
-            indices used to set order
-
-        Returns
-        -------
-        np.array
-            array y ordered
-        """
-        # return array of same type given in y
-        y_ordered = y.copy()
-        indices = indices.astype(int)
-        for i, index in enumerate(indices):
-            y_ordered[index] = y[i]
-        return y_ordered
+    def __predict_class(self, X: np.array) -> np.array:
+        def compute_prediction(xp, indices, node):
+            if xp is None:
+                return
+            if node.is_leaf():
+                # set a class for indices
+                result[indices] = node._proba
+                return
+            self.splitter_.partition(xp, node, train=False)
+            x_u, x_d = self.splitter_.part(xp)
+            i_u, i_d = self.splitter_.part(indices)
+            compute_prediction(x_u, i_u, node.get_up())
+            compute_prediction(x_d, i_d, node.get_down())
+
+        # setup prediction & make it happen
+        result = np.zeros((X.shape[0], self.n_classes_))
+        indices = np.arange(X.shape[0])
+        compute_prediction(X, indices, self.tree_)
+        return result
+
+    def check_predict(self, X) -> np.array:
+        check_is_fitted(self, ["tree_"])
+        # Input validation
+        X = check_array(X)
+        if X.shape[1] != self.n_features_:
+            raise ValueError(
+                f"Expected {self.n_features_} features but got "
+                f"({X.shape[1]})"
+            )
+        return X
+
+    def predict_proba(self, X: np.array) -> np.array:
+        """Predict class probabilities of the input samples X.
+
+        The predicted class probability is the fraction of samples of the same
+        class in a leaf.
+
+        Parameters
+        ----------
+        X : dataset of samples.
+
+        Returns
+        -------
+        proba : array of shape (n_samples, n_classes)
+            The class probabilities of the input samples.
+
+        Raises
+        ------
+        ValueError
+            if dataset with inconsistent number of features
+        NotFittedError
+            if model is not fitted
+        """
+        X = self.check_predict(X)
+        # return # of samples of each class in leaf node
+        values = self.__predict_class(X)
+        normalizer = values.sum(axis=1)[:, np.newaxis]
+        normalizer[normalizer == 0.0] = 1.0
+        return values / normalizer

     def predict(self, X: np.array) -> np.array:
         """Predict labels for each sample in dataset passed
@@ -410,40 +448,8 @@ class Stree(BaseEstimator, ClassifierMixin):
         NotFittedError
             if model is not fitted
         """
-
-        def predict_class(
-            xp: np.array, indices: np.array, node: Snode
-        ) -> np.array:
-            if xp is None:
-                return [], []
-            if node.is_leaf():
-                # set a class for every sample in dataset
-                prediction = np.full((xp.shape[0], 1), node._class)
-                return prediction, indices
-            self.splitter_.partition(xp, node, train=False)
-            x_u, x_d = self.splitter_.part(xp)
-            i_u, i_d = self.splitter_.part(indices)
-            prx_u, prin_u = predict_class(x_u, i_u, node.get_up())
-            prx_d, prin_d = predict_class(x_d, i_d, node.get_down())
-            return np.append(prx_u, prx_d), np.append(prin_u, prin_d)
-
-        # sklearn check
-        check_is_fitted(self, ["tree_"])
-        # Input validation
-        X = check_array(X)
-        if X.shape[1] != self.n_features_:
-            raise ValueError(
-                f"Expected {self.n_features_} features but got "
-                f"({X.shape[1]})"
-            )
-        # setup prediction & make it happen
-        indices = np.arange(X.shape[0])
-        result = (
-            self._reorder_results(*predict_class(X, indices, self.tree_))
-            .astype(int)
-            .ravel()
-        )
-        return self.classes_[result]
+        X = self.check_predict(X)
+        return self.classes_[np.argmax(self.__predict_class(X), axis=1)]

     def nodes_leaves(self) -> tuple:
         """Compute the number of nodes and leaves in the built tree


@@ -67,10 +67,28 @@ class Snode_test(unittest.TestCase):
     def test_make_predictor_on_leaf(self):
         test = Snode(None, [1, 2, 3, 4], [1, 0, 1, 1], [], 0.0, "test")
-        test.make_predictor()
+        test.make_predictor(2)
         self.assertEqual(1, test._class)
         self.assertEqual(0.75, test._belief)
         self.assertEqual(-1, test._partition_column)
+        self.assertListEqual([1, 3], test._proba.tolist())
+
+    def test_make_predictor_on_not_leaf(self):
+        test = Snode(None, [1, 2, 3, 4], [1, 0, 1, 1], [], 0.0, "test")
+        test.set_up(Snode(None, [1], [1], [], 0.0, "another_test"))
+        test.make_predictor(2)
+        self.assertIsNone(test._class)
+        self.assertEqual(0, test._belief)
+        self.assertEqual(-1, test._partition_column)
+        self.assertEqual(-1, test.get_up()._partition_column)
+        self.assertIsNone(test._proba)
+
+    def test_make_predictor_on_leaf_bogus_data(self):
+        test = Snode(None, [1, 2, 3, 4], [], [], 0.0, "test")
+        test.make_predictor(2)
+        self.assertIsNone(test._class)
+        self.assertEqual(-1, test._partition_column)
+        self.assertListEqual([0, 0], test._proba.tolist())

     def test_set_title(self):
         test = Snode(None, [1, 2, 3, 4], [1, 0, 1, 1], [], 0.0, "test")

@@ -97,21 +115,6 @@ class Snode_test(unittest.TestCase):
         test.set_features([1, 2])
         self.assertListEqual([1, 2], test.get_features())

-    def test_make_predictor_on_not_leaf(self):
-        test = Snode(None, [1, 2, 3, 4], [1, 0, 1, 1], [], 0.0, "test")
-        test.set_up(Snode(None, [1], [1], [], 0.0, "another_test"))
-        test.make_predictor()
-        self.assertIsNone(test._class)
-        self.assertEqual(0, test._belief)
-        self.assertEqual(-1, test._partition_column)
-        self.assertEqual(-1, test.get_up()._partition_column)
-
-    def test_make_predictor_on_leaf_bogus_data(self):
-        test = Snode(None, [1, 2, 3, 4], [], [], 0.0, "test")
-        test.make_predictor()
-        self.assertIsNone(test._class)
-        self.assertEqual(-1, test._partition_column)
-
     def test_copy_node(self):
         px = [1, 2, 3, 4]
         py = [1]


@@ -115,6 +115,38 @@ class Stree_test(unittest.TestCase):
             yp = clf.fit(X, y).predict(X[:num, :])
             self.assertListEqual(y[:num].tolist(), yp.tolist())

+    def test_multiple_predict_proba(self):
+        expected = {
+            "liblinear": {
+                0: [0.02401129943502825, 0.9759887005649718],
+                17: [0.9282970550576184, 0.07170294494238157],
+            },
+            "linear": {
+                0: [0.029329608938547486, 0.9706703910614525],
+                17: [0.9298469387755102, 0.07015306122448979],
+            },
+            "rbf": {
+                0: [0.023448275862068966, 0.976551724137931],
+                17: [0.9458064516129032, 0.05419354838709677],
+            },
+            "poly": {
+                0: [0.01601164483260553, 0.9839883551673945],
+                17: [0.9089790897908979, 0.0910209102091021],
+            },
+        }
+        indices = [0, 17]
+        X, y = load_dataset(self._random_state)
+        for kernel in ["liblinear", "linear", "rbf", "poly"]:
+            clf = Stree(
+                kernel=kernel,
+                multiclass_strategy="ovr" if kernel == "liblinear" else "ovo",
+                random_state=self._random_state,
+            )
+            yp = clf.fit(X, y).predict_proba(X)
+            for index in indices:
+                for exp, comp in zip(expected[kernel][index], yp[index]):
+                    self.assertAlmostEqual(exp, comp)

     def test_single_vs_multiple_prediction(self):
         """Check if predicting sample by sample gives the same result as
         predicting all samples at once
@@ -695,7 +727,7 @@ class Stree_test(unittest.TestCase):
         )
         expected_tail = (
             ' [shape=box style=filled label="class=1 impurity=0.000 '
-            'classes=[1] samples=[1]"];\n}\n'
+            'counts=[0 1 0]"];\n}\n'
         )
         self.assertEqual(clf.graph(), expected_head + "}\n")
         clf.fit(X, y)
@@ -715,7 +747,7 @@ class Stree_test(unittest.TestCase):
         )
         expected_tail = (
             ' [shape=box style=filled label="class=1 impurity=0.000 '
-            'classes=[1] samples=[1]"];\n}\n'
+            'counts=[0 1 0]"];\n}\n'
         )
         self.assertEqual(clf.graph("Sample title"), expected_head + "}\n")
         clf.fit(X, y)