Document & lint code

2020-05-30 23:10:10 +02:00
parent 724a4855fb
commit 5e5fea9c6a
4 changed files with 242 additions and 105 deletions

View File

@@ -48,7 +48,7 @@
{
"output_type": "stream",
"name": "stdout",
"text": "Fraud: 0.244% 196\nValid: 99.755% 80234\nX.shape (1196, 28) y.shape (1196,)\nFraud: 16.722% 200\nValid: 83.278% 996\n"
"text": "Fraud: 0.244% 196\nValid: 99.755% 80234\nX.shape (1196, 28) y.shape (1196,)\nFraud: 16.472% 197\nValid: 83.528% 999\n"
}
],
"source": [
@@ -103,7 +103,7 @@
{
"output_type": "stream",
"name": "stdout",
"text": "************** C=0.001 ****************************\nClassifier's accuracy (train): 0.9797\nClassifier's accuracy (test) : 0.9749\nroot\nroot - Down\nroot - Down - Down, <cgaf> - Leaf class=1.0 belief=0.984127 counts=(array([0., 1.]), array([ 2, 124]))\nroot - Down - Up, <pure> - Leaf class=0.0 belief=1.000000 counts=(array([0.]), array([5]))\nroot - Up\nroot - Up - Down, <cgaf> - Leaf class=0.0 belief=0.750000 counts=(array([0., 1.]), array([3, 1]))\nroot - Up - Up\nroot - Up - Up - Down, <pure> - Leaf class=1.0 belief=1.000000 counts=(array([1.]), array([1]))\nroot - Up - Up - Up, <cgaf> - Leaf class=0.0 belief=0.980029 counts=(array([0., 1.]), array([687, 14]))\n\n**************************************************\n************** C=0.01 ****************************\nClassifier's accuracy (train): 0.9809\nClassifier's accuracy (test) : 0.9749\nroot\nroot - Down, <pure> - Leaf class=1.0 belief=1.000000 counts=(array([1.]), array([124]))\nroot - Up, <cgaf> - Leaf class=0.0 belief=0.977560 counts=(array([0., 1.]), array([697, 16]))\n\n**************************************************\n************** C=1 ****************************\nClassifier's accuracy (train): 0.9869\nClassifier's accuracy (test) : 0.9749\nroot\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1.0 belief=1.000000 counts=(array([1.]), array([129]))\nroot - Down - Up, <pure> - Leaf class=0.0 belief=1.000000 counts=(array([0.]), array([2]))\nroot - Up, <cgaf> - Leaf class=0.0 belief=0.984419 counts=(array([0., 1.]), array([695, 11]))\n\n**************************************************\n************** C=5 ****************************\nClassifier's accuracy (train): 0.9869\nClassifier's accuracy (test) : 0.9777\nroot\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1.0 belief=1.000000 counts=(array([1.]), array([129]))\nroot - Down - Up, <pure> - Leaf class=0.0 belief=1.000000 counts=(array([0.]), array([2]))\nroot - Up, <cgaf> - Leaf class=0.0 belief=0.984419 counts=(array([0., 1.]), array([695, 11]))\n\n**************************************************\n************** C=17 ****************************\nClassifier's accuracy (train): 0.9916\nClassifier's accuracy (test) : 0.9833\nroot\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1.0 belief=1.000000 counts=(array([1.]), array([131]))\nroot - Down - Up, <pure> - Leaf class=0.0 belief=1.000000 counts=(array([0.]), array([8]))\nroot - Up\nroot - Up - Down, <pure> - Leaf class=0.0 belief=1.000000 counts=(array([0.]), array([1]))\nroot - Up - Up\nroot - Up - Up - Down\nroot - Up - Up - Down - Down, <pure> - Leaf class=1.0 belief=1.000000 counts=(array([1.]), array([1]))\nroot - Up - Up - Down - Up, <pure> - Leaf class=0.0 belief=1.000000 counts=(array([0.]), array([5]))\nroot - Up - Up - Up\nroot - Up - Up - Up - Down, <pure> - Leaf class=1.0 belief=1.000000 counts=(array([1.]), array([1]))\nroot - Up - Up - Up - Up, <cgaf> - Leaf class=0.0 belief=0.989855 counts=(array([0., 1.]), array([683, 7]))\n\n**************************************************\n0.2235 secs\n"
"text": "************** C=0.001 ****************************\nClassifier's accuracy (train): 0.9737\nClassifier's accuracy (test) : 0.9805\nroot\nroot - Down, <cgaf> - Leaf class=1 belief= 0.945736 counts=(array([0, 1]), array([ 7, 122]))\nroot - Up\nroot - Up - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([1]))\nroot - Up - Up, <cgaf> - Leaf class=0 belief= 0.978784 counts=(array([0, 1]), array([692, 15]))\n\n**************************************************\n************** C=0.01 ****************************\nClassifier's accuracy (train): 0.9809\nClassifier's accuracy (test) : 0.9805\nroot\nroot - Down, <cgaf> - Leaf class=1 belief= 0.983871 counts=(array([0, 1]), array([ 2, 122]))\nroot - Up\nroot - Up - Down, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([1]))\nroot - Up - Up\nroot - Up - Up - Down, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([2]))\nroot - Up - Up - Up\nroot - Up - Up - Up - Down\nroot - Up - Up - Up - Down - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([2]))\nroot - Up - Up - Up - Down - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([2]))\nroot - Up - Up - Up - Up, <cgaf> - Leaf class=0 belief= 0.980170 counts=(array([0, 1]), array([692, 14]))\n\n**************************************************\n************** C=1 ****************************\nClassifier's accuracy (train): 0.9904\nClassifier's accuracy (test) : 0.9777\nroot\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([122]))\nroot - Down - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([1]))\nroot - Up\nroot - Up - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([8]))\nroot - Up - Up, <cgaf> - Leaf class=0 belief= 0.988669 counts=(array([0, 1]), array([698, 8]))\n\n**************************************************\n************** C=5 ****************************\nClassifier's accuracy (train): 0.9916\nClassifier's accuracy (test) : 0.9721\nroot\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([125]))\nroot - Down - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([1]))\nroot - Up\nroot - Up - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([5]))\nroot - Up - Up\nroot - Up - Up - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Up, <cgaf> - Leaf class=0 belief= 0.990071 counts=(array([0, 1]), array([698, 7]))\n\n**************************************************\n************** C=17 ****************************\nClassifier's accuracy (train): 0.9940\nClassifier's accuracy (test) : 0.9749\nroot\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([128]))\nroot - Down - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([2]))\nroot - Up\nroot - Up - Down\nroot - Up - Down - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([4]))\nroot - Up - Down - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([1]))\nroot - Up - Up\nroot - Up - Up - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Up, <cgaf> - Leaf class=0 belief= 0.992867 counts=(array([0, 1]), array([696, 5]))\n\n**************************************************\n0.2412 secs\n"
}
],
"source": [
@@ -123,7 +123,13 @@
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "[[0.97223657 0.02776343]\n [0.96965421 0.03034579]\n [0.96918057 0.03081943]\n [0.94009975 0.05990025]]\n"
}
],
"source": [
"import numpy as np\n",
"from sklearn.preprocessing import StandardScaler\n",
@@ -133,7 +139,7 @@
"cclf = CalibratedClassifierCV(base_estimator=LinearSVC(), cv=5)\n",
"cclf.fit(Xtrain, ytrain)\n",
"res = cclf.predict_proba(Xtest)\n",
"#an array containing probabilities of belonging to the 1st class"
"print(res[:4, :])"
]
},
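Since LinearSVC has no predict_proba of its own, the cell above wraps it in CalibratedClassifierCV. Below is a self-contained sketch of the same pattern using synthetic data instead of the notebook's fraud dataset; the base_estimator argument matches the 2020-era sklearn API used here (newer releases rename it to estimator):

```python
# Sketch: obtain predict_proba from LinearSVC via cross-validated calibration.
# Synthetic data stands in for the notebook's credit-card fraud dataset.
from sklearn.calibration import CalibratedClassifierCV
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

X, y = make_classification(n_samples=500, random_state=0)
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, random_state=0)

# 5-fold calibration: each fold fits a LinearSVC plus a sigmoid on held-out data
cclf = CalibratedClassifierCV(base_estimator=LinearSVC(), cv=5)
cclf.fit(Xtrain, ytrain)

res = cclf.predict_proba(Xtest)  # shape (n_samples, 2); each row sums to 1
print(res[:4, :])
```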
{
@@ -144,7 +150,7 @@
{
"output_type": "stream",
"name": "stdout",
"text": "root\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1.0 belief=1.000000 counts=(array([1.]), array([131]))\nroot - Down - Up, <pure> - Leaf class=0.0 belief=1.000000 counts=(array([0.]), array([8]))\nroot - Up\nroot - Up - Down, <pure> - Leaf class=0.0 belief=1.000000 counts=(array([0.]), array([1]))\nroot - Up - Up\nroot - Up - Up - Down\nroot - Up - Up - Down - Down, <pure> - Leaf class=1.0 belief=1.000000 counts=(array([1.]), array([1]))\nroot - Up - Up - Down - Up, <pure> - Leaf class=0.0 belief=1.000000 counts=(array([0.]), array([5]))\nroot - Up - Up - Up\nroot - Up - Up - Up - Down, <pure> - Leaf class=1.0 belief=1.000000 counts=(array([1.]), array([1]))\nroot - Up - Up - Up - Up, <cgaf> - Leaf class=0.0 belief=0.989855 counts=(array([0., 1.]), array([683, 7]))\n"
"text": "root\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([128]))\nroot - Down - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([2]))\nroot - Up\nroot - Up - Down\nroot - Up - Down - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([4]))\nroot - Up - Down - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([1]))\nroot - Up - Up\nroot - Up - Up - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Up, <cgaf> - Leaf class=0 belief= 0.992867 counts=(array([0, 1]), array([696, 5]))\n"
}
],
"source": [
@@ -161,7 +167,7 @@
{
"output_type": "stream",
"name": "stdout",
"text": "root\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1.0 belief=1.000000 counts=(array([1.]), array([131]))\nroot - Down - Up, <pure> - Leaf class=0.0 belief=1.000000 counts=(array([0.]), array([8]))\nroot - Up\nroot - Up - Down, <pure> - Leaf class=0.0 belief=1.000000 counts=(array([0.]), array([1]))\nroot - Up - Up\nroot - Up - Up - Down\nroot - Up - Up - Down - Down, <pure> - Leaf class=1.0 belief=1.000000 counts=(array([1.]), array([1]))\nroot - Up - Up - Down - Up, <pure> - Leaf class=0.0 belief=1.000000 counts=(array([0.]), array([5]))\nroot - Up - Up - Up\nroot - Up - Up - Up - Down, <pure> - Leaf class=1.0 belief=1.000000 counts=(array([1.]), array([1]))\nroot - Up - Up - Up - Up, <cgaf> - Leaf class=0.0 belief=0.989855 counts=(array([0., 1.]), array([683, 7]))\n"
"text": "root\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([128]))\nroot - Down - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([2]))\nroot - Up\nroot - Up - Down\nroot - Up - Down - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([4]))\nroot - Up - Down - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([1]))\nroot - Up - Up\nroot - Up - Up - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Up, <cgaf> - Leaf class=0 belief= 0.992867 counts=(array([0, 1]), array([696, 5]))\n"
}
],
"source": [
@@ -189,7 +195,7 @@
{
"output_type": "stream",
"name": "stdout",
"text": "1 functools.partial(<function check_no_attributes_set_in_init at 0x12aabb320>, 'Stree')\n2 functools.partial(<function check_estimators_dtypes at 0x12aab0440>, 'Stree')\n3 functools.partial(<function check_fit_score_takes_y at 0x12aab0320>, 'Stree')\n4 functools.partial(<function check_sample_weights_pandas_series at 0x12aaaac20>, 'Stree')\n5 functools.partial(<function check_sample_weights_not_an_array at 0x12aaaad40>, 'Stree')\n6 functools.partial(<function check_sample_weights_list at 0x12aaaae60>, 'Stree')\n7 functools.partial(<function check_sample_weights_shape at 0x12aaaaf80>, 'Stree')\n8 functools.partial(<function check_sample_weights_invariance at 0x12aaac0e0>, 'Stree')\n9 functools.partial(<function check_estimators_fit_returns_self at 0x12aab6440>, 'Stree')\n10 functools.partial(<function check_estimators_fit_returns_self at 0x12aab6440>, 'Stree', readonly_memmap=True)\n11 functools.partial(<function check_complex_data at 0x12aaac290>, 'Stree')\n12 functools.partial(<function check_dtype_object at 0x12aaac200>, 'Stree')\n13 functools.partial(<function check_estimators_empty_data_messages at 0x12aab0560>, 'Stree')\n14 functools.partial(<function check_pipeline_consistency at 0x12aab0200>, 'Stree')\n15 functools.partial(<function check_estimators_nan_inf at 0x12aab0680>, 'Stree')\n16 functools.partial(<function check_estimators_overwrite_params at 0x12aabb200>, 'Stree')\n17 functools.partial(<function check_estimator_sparse_data at 0x12aaaab00>, 'Stree')\n18 functools.partial(<function check_estimators_pickle at 0x12aab08c0>, 'Stree')\n19 functools.partial(<function check_classifier_data_not_an_array at 0x12aabb560>, 'Stree')\n20 functools.partial(<function check_classifiers_one_label at 0x12aab0f80>, 'Stree')\n21 functools.partial(<function check_classifiers_classes at 0x12aab69e0>, 'Stree')\n22 functools.partial(<function check_estimators_partial_fit_n_features at 0x12aab09e0>, 'Stree')\n23 functools.partial(<function check_classifiers_train at 0x12aab60e0>, 'Stree')\n24 functools.partial(<function check_classifiers_train at 0x12aab60e0>, 'Stree', readonly_memmap=True)\n25 functools.partial(<function check_classifiers_train at 0x12aab60e0>, 'Stree', readonly_memmap=True, X_dtype='float32')\n26 functools.partial(<function check_classifiers_regression_target at 0x12aabf050>, 'Stree')\n27 functools.partial(<function check_supervised_y_no_nan at 0x12aaa0c20>, 'Stree')\n28 functools.partial(<function check_supervised_y_2d at 0x12aab6680>, 'Stree')\n29 functools.partial(<function check_estimators_unfitted at 0x12aab6560>, 'Stree')\n30 functools.partial(<function check_non_transformer_estimators_n_iter at 0x12aabbb90>, 'Stree')\n31 functools.partial(<function check_decision_proba_consistency at 0x12aabf170>, 'Stree')\n32 functools.partial(<function check_fit2d_predict1d at 0x12aaac7a0>, 'Stree')\n33 functools.partial(<function check_methods_subset_invariance at 0x12aaac950>, 'Stree')\n34 functools.partial(<function check_fit2d_1sample at 0x12aaaca70>, 'Stree')\n35 functools.partial(<function check_fit2d_1feature at 0x12aaacb90>, 'Stree')\n36 functools.partial(<function check_fit1d at 0x12aaaccb0>, 'Stree')\n37 functools.partial(<function check_get_params_invariance at 0x12aabbdd0>, 'Stree')\n38 functools.partial(<function check_set_params at 0x12aabbef0>, 'Stree')\n39 functools.partial(<function check_dict_unchanged at 0x12aaac3b0>, 'Stree')\n40 functools.partial(<function check_dont_overwrite_parameters at 0x12aaac680>, 'Stree')\n41 functools.partial(<function 
check_fit_idempotent at 0x12aabf320>, 'Stree')\n42 functools.partial(<function check_n_features_in at 0x12aabf3b0>, 'Stree')\n"
"text": "1 functools.partial(<function check_no_attributes_set_in_init at 0x12a2f1200>, 'Stree')\n2 functools.partial(<function check_estimators_dtypes at 0x12a2e7320>, 'Stree')\n3 functools.partial(<function check_fit_score_takes_y at 0x12a2e7200>, 'Stree')\n4 functools.partial(<function check_sample_weights_pandas_series at 0x12a2d7b00>, 'Stree')\n5 functools.partial(<function check_sample_weights_not_an_array at 0x12a2d7c20>, 'Stree')\n6 functools.partial(<function check_sample_weights_list at 0x12a2d7d40>, 'Stree')\n7 functools.partial(<function check_sample_weights_shape at 0x12a2d7e60>, 'Stree')\n8 functools.partial(<function check_sample_weights_invariance at 0x12a2d7f80>, 'Stree')\n9 functools.partial(<function check_estimators_fit_returns_self at 0x12a2ec320>, 'Stree')\n10 functools.partial(<function check_estimators_fit_returns_self at 0x12a2ec320>, 'Stree', readonly_memmap=True)\n11 functools.partial(<function check_complex_data at 0x12a2e2170>, 'Stree')\n12 functools.partial(<function check_dtype_object at 0x12a2e20e0>, 'Stree')\n13 functools.partial(<function check_estimators_empty_data_messages at 0x12a2e7440>, 'Stree')\n14 functools.partial(<function check_pipeline_consistency at 0x12a2e70e0>, 'Stree')\n15 functools.partial(<function check_estimators_nan_inf at 0x12a2e7560>, 'Stree')\n16 functools.partial(<function check_estimators_overwrite_params at 0x12a2f10e0>, 'Stree')\n17 functools.partial(<function check_estimator_sparse_data at 0x12a2d79e0>, 'Stree')\n18 functools.partial(<function check_estimators_pickle at 0x12a2e77a0>, 'Stree')\n19 functools.partial(<function check_classifier_data_not_an_array at 0x12a2f1440>, 'Stree')\n20 functools.partial(<function check_classifiers_one_label at 0x12a2e7e60>, 'Stree')\n21 functools.partial(<function check_classifiers_classes at 0x12a2ec8c0>, 'Stree')\n22 functools.partial(<function check_estimators_partial_fit_n_features at 0x12a2e78c0>, 'Stree')\n23 functools.partial(<function check_classifiers_train at 0x12a2e7f80>, 'Stree')\n24 functools.partial(<function check_classifiers_train at 0x12a2e7f80>, 'Stree', readonly_memmap=True)\n25 functools.partial(<function check_classifiers_train at 0x12a2e7f80>, 'Stree', readonly_memmap=True, X_dtype='float32')\n26 functools.partial(<function check_classifiers_regression_target at 0x12a2f1ef0>, 'Stree')\n27 functools.partial(<function check_supervised_y_no_nan at 0x12a2d8b00>, 'Stree')\n28 functools.partial(<function check_supervised_y_2d at 0x12a2ec560>, 'Stree')\n29 functools.partial(<function check_estimators_unfitted at 0x12a2ec440>, 'Stree')\n30 functools.partial(<function check_non_transformer_estimators_n_iter at 0x12a2f1a70>, 'Stree')\n31 functools.partial(<function check_decision_proba_consistency at 0x12a2f6050>, 'Stree')\n32 functools.partial(<function check_fit2d_predict1d at 0x12a2e2680>, 'Stree')\n33 functools.partial(<function check_methods_subset_invariance at 0x12a2e2830>, 'Stree')\n34 functools.partial(<function check_fit2d_1sample at 0x12a2e2950>, 'Stree')\n35 functools.partial(<function check_fit2d_1feature at 0x12a2e2a70>, 'Stree')\n36 functools.partial(<function check_fit1d at 0x12a2e2b90>, 'Stree')\n37 functools.partial(<function check_get_params_invariance at 0x12a2f1cb0>, 'Stree')\n38 functools.partial(<function check_set_params at 0x12a2f1dd0>, 'Stree')\n39 functools.partial(<function check_dict_unchanged at 0x12a2e2290>, 'Stree')\n40 functools.partial(<function check_dont_overwrite_parameters at 0x12a2e2560>, 'Stree')\n41 functools.partial(<function 
check_fit_idempotent at 0x12a2f6200>, 'Stree')\n42 functools.partial(<function check_n_features_in at 0x12a2f6290>, 'Stree')\n43 functools.partial(<function check_requires_y_none at 0x12a2f6320>, 'Stree')\n"
}
],
"source": [

View File

@@ -3,6 +3,7 @@ import setuptools
__version__ = "0.9rc3"
__author__ = "Ricardo Montañana Gómez"
def readme():
with open('README.md') as f:
return f.read()
@@ -19,7 +20,8 @@ setuptools.setup(
url='https://github.com/doctorado-ml/stree',
author=__author__,
author_email='ricardo.montanana@alu.uclm.es',
keywords='scikit-learn oblique-classifier oblique-decision-tree decision-tree svm svc',
keywords='scikit-learn oblique-classifier oblique-decision-tree decision-\
tree svm svc',
classifiers=[
'Development Status :: 4 - Beta',
'License :: OSI Approved :: MIT License',
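A side note on the wrapped keywords string above: a backslash at the end of a line continues the string literal, but only the newline is removed, so any indentation on the continuation line would leak into the string. A quick illustration:

```python
# Backslash continuation inside a string literal drops only the newline.
flat = 'scikit-learn oblique-classifier oblique-decision-tree decision-\
tree svm svc'
print(flat)  # ... decision-tree svm svc

# If the continuation line were indented, the spaces would become part of
# the string:
indented = 'decision-\
    tree'
print(repr(indented))  # 'decision-    tree'
```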

View File

@@ -7,23 +7,28 @@ Build an oblique tree classifier based on SVM Trees
Uses LinearSVC
'''
import typing
import os
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.svm import LinearSVC
from sklearn.utils.multiclass import check_classification_targets
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted, _check_sample_weight, check_random_state
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted, \
_check_sample_weight
class Snode:
def __init__(self, clf: LinearSVC, X: np.ndarray, y: np.ndarray, title: str):
"""Nodes of the tree that keeps the svm classifier and if testing the
dataset assigned to it
"""
def __init__(self, clf: LinearSVC, X: np.ndarray, y: np.ndarray,
title: str):
self._clf = clf
self._vector = None if clf is None else clf.coef_
self._interceptor = 0. if clf is None else clf.intercept_
self._title = title
self._belief = 0. # belief of the prediction in a leaf node based on samples
self._belief = 0.
# Only store dataset in Testing
self._X = X if os.environ.get('TESTING', 'NS') != 'NS' else None
self._y = y
@@ -51,8 +56,8 @@ class Snode:
return self._up
def make_predictor(self):
"""Compute the class of the predictor and its belief based on the subdataset of the node
only if it is a leaf
"""Compute the class of the predictor and its belief based on the
subdataset of the node only if it is a leaf
"""
if not self.is_leaf():
return
@@ -62,7 +67,7 @@ class Snode:
min_card = min(card)
try:
self._belief = max_card / (max_card + min_card)
except:
except ZeroDivisionError:
self._belief = 0.
self._class = classes[card == max_card][0]
else:
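For reference, the belief computed in the try block above is simply the share of the majority class among the samples that reached the leaf (Stree is binary-only, so there are at most two counts). A standalone sketch with illustrative data:

```python
import numpy as np

y_leaf = np.array([0, 1, 1, 1, 0, 1])  # labels of the samples in a leaf
classes, card = np.unique(y_leaf, return_counts=True)
max_card, min_card = card.max(), card.min()
belief = max_card / (max_card + min_card)  # 4 / (4 + 2) = 0.666...
predicted = classes[card == max_card][0]   # majority class -> 1
print(predicted, belief)
```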
@@ -71,7 +76,10 @@ class Snode:
def __str__(self) -> str:
if self.is_leaf():
return f"{self._title} - Leaf class={self._class} belief={self._belief:.6f} counts={np.unique(self._y, return_counts=True)}"
count_values = np.unique(self._y, return_counts=True)
result = f"{self._title} - Leaf class={self._class} belief="\
f"{self._belief: .6f} counts={count_values}"
return result
else:
return f"{self._title}"
@@ -101,11 +109,16 @@ class Siterator:
class Stree(BaseEstimator, ClassifierMixin):
"""
"""Estimator that is based on binary trees of svm nodes
can deal with sample_weights in predict, used in boosting sklearn methods
inheriting from BaseEstimator implements get_params and set_params methods
inheriting from ClassifierMixin implement the attribute _estimator_type
with "classifier" as value
"""
def __init__(self, C: float = 1.0, max_iter: int = 1000, random_state: int = None,
max_depth: int=None, tol: float=1e-4, use_predictions: bool = False):
def __init__(self, C: float = 1.0, max_iter: int = 1000,
random_state: int = None, max_depth: int = None,
tol: float = 1e-4, use_predictions: bool = False):
self.max_iter = max_iter
self.C = C
self.random_state = random_state
@@ -113,65 +126,100 @@ class Stree(BaseEstimator, ClassifierMixin):
self.max_depth = max_depth
self.tol = tol
def get_params(self, deep: bool=True) -> dict:
"""Get dict with hyperparameters and its values to accomplish sklearn rules
"""
return {
'C': self.C,
'random_state': self.random_state,
'max_iter': self.max_iter,
'use_predictions': self.use_predictions,
'max_depth': self.max_depth,
'tol': self.tol
}
def set_params(self, **parameters: dict):
"""Set hyperparmeters as specified by sklearn, needed in Gridsearchs
"""
for parameter, value in parameters.items():
setattr(self, parameter, value)
return self
# Added binary_only tag as required by sklearn check_estimator
def _more_tags(self) -> dict:
return {'binary_only': True}
"""Required by sklearn to tell that this estimator is a binary classifier
:return: the tag required
:rtype: dict
"""
return {'binary_only': True, 'requires_y': True}
def _linear_function(self, data: np.array, node: Snode) -> np.array:
"""Compute the distance of set of samples to a hyperplane, in
multiclass classification it should compute the distance to a
hyperplane of each class
:param data: dataset of samples
:type data: np.array
:param node: the node that contains the hyperplane coefficients
:type node: Snode
:return: array of distances of each sample to the hyperplane
:rtype: np.array
"""
coef = node._vector[0, :].reshape(-1, data.shape[1])
return data.dot(coef.T) + node._interceptor[0]
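In the binary case, the value `_linear_function` computes should coincide with LinearSVC's own decision_function, since both evaluate w·x + b; a small sketch to verify that (toy data, not from the source):

```python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.svm import LinearSVC

X, y = make_classification(n_samples=200, random_state=0)
clf = LinearSVC(random_state=0, max_iter=10000).fit(X, y)

coef = clf.coef_[0, :].reshape(-1, X.shape[1])     # what the node stores
manual = X.dot(coef.T) + clf.intercept_[0]         # _linear_function's formula
builtin = clf.decision_function(X).reshape(-1, 1)  # the use_predictions path

print(np.allclose(manual, builtin))  # True: both evaluate w.x + b
```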
def _split_array(self, origin: np.array, down: np.array) -> list:
"""Split an array in two based on indices passed as down and its complement
:param origin: dataset to split
:type origin: np.array
:param down: indices to use to split array
:type down: np.array
:return: list with two splits of the array
:rtype: list
"""
up = ~down
return origin[up[:, 0]] if any(up) else None, \
origin[down[:, 0]] if any(down) else None
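`_split_array` uses a boolean column vector to partition any array (samples, indices, or distances) into its up and down parts; a minimal sketch of that indexing with illustrative data:

```python
import numpy as np

origin = np.array([[10.], [20.], [30.], [40.]])
down = np.array([[True], [False], [True], [False]])  # from _split_criteria
up = ~down

up_part = origin[up[:, 0]] if up.any() else None        # rows 1 and 3
down_part = origin[down[:, 0]] if down.any() else None  # rows 0 and 2
print(up_part.ravel(), down_part.ravel())  # [20. 40.] [10. 30.]
```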
def _distances(self, node: Snode, data: np.ndarray) -> np.array:
"""Compute distances of the samples to the hyperplane of the node
:param node: node containing the svm classifier
:type node: Snode
:param data: samples to find out distance to hyperplane
:type data: np.ndarray
:return: array of shape (m, 1) with the distances of every sample to
the hyperplane of the node
:rtype: np.array
"""
if self.use_predictions:
res = np.expand_dims(node._clf.decision_function(data), 1)
else:
# doesn't work with multiclass as each sample has to do inner product with its own coeficients
# computes positition of every sample is w.r.t. the hyperplane
"""doesn't work with multiclass as each sample has to do inner
product with its own coefficients computes positition of every
sample is w.r.t. the hyperplane
"""
res = self._linear_function(data, node)
return res
def _split_criteria(self, data: np.array) -> np.array:
"""Set the criteria to split arrays
:param data: distances of every sample to the hyperplane
:type data: np.array
:return: boolean mask, True for the samples that go to the down branch
:rtype: np.array
"""
return data > 0
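Putting `_distances` and `_split_criteria` together: samples with a positive signed distance fall on one side of the node's hyperplane and are sent down, the rest go up. In isolation:

```python
import numpy as np

distances = np.array([[0.7], [-1.2], [0.1], [-0.3]])  # from _distances
down = distances > 0  # _split_criteria: boolean mask of shape (m, 1)
print(down.ravel())   # [ True False  True False]
```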
def fit(self, X: np.ndarray, y: np.ndarray, sample_weight: np.array = None) -> 'Stree':
def fit(self, X: np.ndarray, y: np.ndarray,
sample_weight: np.array = None) -> 'Stree':
"""Build the tree based on the dataset of samples and its labels
:raises ValueError: if parameters C or max_depth are out of bounds
:return: itself to be able to chain actions: fit().predict() ...
:rtype: Stree
"""
# Check parameters are Ok.
if type(y).__name__ == 'np.ndarray':
y = y.ravel()
if self.C < 0:
raise ValueError(f"Penalty term must be positive... got (C={self.C:f})")
self.__max_depth = np.iinfo(np.int32).max if self.max_depth is None else self.max_depth
raise ValueError(
f"Penalty term must be positive... got (C={self.C:f})")
self.__max_depth = np.iinfo(
np.int32).max if self.max_depth is None else self.max_depth
if self.__max_depth < 1:
raise ValueError(f"Maximum depth has to be greater than 1... got (max_depth={self.max_depth})")
raise ValueError(
f"Maximum depth has to be greater than 1... got (max_depth=\
{self.max_depth})")
check_classification_targets(y)
X, y = check_X_y(X, y)
sample_weight = _check_sample_weight(sample_weight, X)
check_classification_targets(y)
# Initialize computed parameters
self.classes_ = np.unique(y)
self.classes_, y = np.unique(y, return_inverse=True)
self.n_iter_ = self.max_iter
self.depth_ = 0
self.n_features_in_ = X.shape[1]
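The `return_inverse` call above does two jobs at once: it stores the original class labels in classes_ and re-encodes y as integers 0..n_classes-1, which is what lets predict map its integer results back through `self.classes_[result]`. A quick illustration:

```python
import numpy as np

y = np.array(['spam', 'ham', 'ham', 'spam'])
classes_, y_encoded = np.unique(y, return_inverse=True)
print(classes_)             # ['ham' 'spam']
print(y_encoded)            # [1 0 0 1] -- labels re-encoded as 0..n-1
print(classes_[y_encoded])  # back to the original labels, as predict() does
```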
@@ -182,7 +230,6 @@ class Stree(BaseEstimator, ClassifierMixin):
def _build_predictor(self):
"""Process the leaves to make them predictors
"""
def run_tree(node: Snode):
if node.is_leaf():
node.make_predictor()
@@ -192,16 +239,32 @@ class Stree(BaseEstimator, ClassifierMixin):
run_tree(self.tree_)
def train(self, X: np.ndarray, y: np.ndarray, sample_weight: np.ndarray, depth: int, title: str) -> Snode:
def train(self, X: np.ndarray, y: np.ndarray, sample_weight: np.ndarray,
depth: int, title: str) -> Snode:
"""Recursive function to split the original dataset into predictor
nodes (leaves)
:param X: samples dataset
:type X: np.ndarray
:param y: samples labels
:type y: np.ndarray
:param sample_weight: weight of samples (used in boosting)
:type sample_weight: np.ndarray
:param depth: current depth in the tree
:type depth: int
:param title: description of the node
:type title: str
:return: binary tree
:rtype: Snode
"""
if depth > self.__max_depth:
return None
if np.unique(y).shape[0] == 1 :
if np.unique(y).shape[0] == 1:
# only 1 class => pure dataset
return Snode(None, X, y, title + ', <pure>')
# Train the model
clf = LinearSVC(max_iter=self.max_iter, random_state=self.random_state,
C=self.C) #, sample_weight=sample_weight)
C=self.C) # , sample_weight=sample_weight)
clf.fit(X, y, sample_weight=sample_weight)
tree = Snode(clf, X, y, title)
self.depth_ = max(depth, self.depth_)
@@ -217,6 +280,15 @@ class Stree(BaseEstimator, ClassifierMixin):
return tree
def _reorder_results(self, y: np.array, indices: np.array) -> np.array:
"""Reorder an array based on the array of indices passed
:param y: unordered data
:type y: np.array
:param indices: indices used to set order
:type indices: np.array
:return: array y ordered
:rtype: np.array
"""
if y.ndim > 1 and y.shape[1] > 1:
# if predict_proba return np.array of floats
y_ordered = np.zeros(y.shape, dtype=float)
@@ -229,7 +301,15 @@ class Stree(BaseEstimator, ClassifierMixin):
return y_ordered
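The core of `_reorder_results` is a scatter assignment: predictions arrive in tree-traversal order paired with each sample's original row index, and are written back into dataset order. Essentially:

```python
import numpy as np

y = np.array([1, 0, 0, 1])        # predictions in tree-traversal order
indices = np.array([2, 0, 3, 1])  # original row of each prediction

y_ordered = np.zeros(y.shape, dtype=y.dtype)
y_ordered[indices] = y            # scatter back into dataset order
print(y_ordered)                  # [0 1 1 0]
```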
def predict(self, X: np.array) -> np.array:
def predict_class(xp: np.array, indices: np.array, node: Snode) -> np.array:
"""Predict labels for each sample in dataset passed
:param X: dataset of samples
:type X: np.array
:return: array of labels
:rtype: np.array
"""
def predict_class(xp: np.array, indices: np.array,
node: Snode) -> np.array:
if xp is None:
return [], []
if node.is_leaf():
@@ -242,29 +322,36 @@ class Stree(BaseEstimator, ClassifierMixin):
prx_u, prin_u = predict_class(X_U, i_u, node.get_up())
prx_d, prin_d = predict_class(X_D, i_d, node.get_down())
return np.append(prx_u, prx_d), np.append(prin_u, prin_d)
# sklearn check
check_is_fitted(self, ['tree_'])
# Input validation
X = check_array(X)
# setup prediction & make it happen
indices = np.arange(X.shape[0])
return self._reorder_results(*predict_class(X, indices, self.tree_)).ravel()
result = self._reorder_results(
*predict_class(X, indices, self.tree_)).astype(int).ravel()
return self.classes_[result]
def predict_proba(self, X: np.array) -> np.array:
"""Computes an approximation of the probability of samples belonging to class 0 and 1
"""Computes an approximation of the probability of samples belonging to
class 0 and 1
:param X: dataset
:type X: np.array
:return: array of shape (m, num_classes), probability of belonging to
each class
:rtype: np.array
"""
def predict_class(xp: np.array, indices: np.array, dist: np.array, node: Snode) -> np.array:
def predict_class(xp: np.array, indices: np.array, dist: np.array,
node: Snode) -> np.array:
"""Run the tree to compute predictions
:param xp: subdataset of samples
:type xp: np.array
:param indices: indices of subdataset samples to rebuild original order
:param indices: indices of subdataset samples to rebuild original
order
:type indices: np.array
:param dist: distances of every sample to the hyperplane or the father node
:param dist: distances of every sample to the hyperplane or the
father node
:type dist: np.array
:param node: node of the leaf with the class
:type node: Snode
@@ -280,7 +367,6 @@ class Stree(BaseEstimator, ClassifierMixin):
return np.append(prediction, prediction_proba, axis=1), indices
distances = self._distances(node, xp)
down = self._split_criteria(distances)
X_U, X_D = self._split_array(xp, down)
i_u, i_d = self._split_array(indices, down)
di_u, di_d = self._split_array(distances, down)
@@ -297,15 +383,24 @@ class Stree(BaseEstimator, ClassifierMixin):
empty_dist = np.empty((X.shape[0], 1), dtype=float)
result, indices = predict_class(X, indices, empty_dist, self.tree_)
result = result.reshape(X.shape[0], 2)
# Turn distances to hyperplane into probabilities based on fitting distances
# of samples to its hyperplane that classified them, to the sigmoid function
# Turn distances to the hyperplane into probabilities by applying the
# sigmoid function to the distance of every sample to the hyperplane
# that classified it
# Probability of being 1
result[:, 1] = 1 / (1 + np.exp(-result[:, 1]))
result[:, 0] = 1 - result[:, 1] # Probability of being 0
# Probability of being 0
result[:, 0] = 1 - result[:, 1]
return self._reorder_results(result, indices)
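The probability conversion above is the logistic sigmoid applied to the signed distance of each sample to the hyperplane that classified it; in isolation:

```python
import numpy as np

distances = np.array([-2.0, 0.0, 1.5])  # signed distances to the hyperplane
p1 = 1.0 / (1.0 + np.exp(-distances))   # probability of class 1
p0 = 1.0 - p1                           # probability of class 0
print(np.column_stack((p0, p1)))        # one (p0, p1) row per sample
```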
def score(self, X: np.array, y: np.array) -> float:
"""Return accuracy
"""Compute accuracy of the prediction
:param X: dataset of samples to make predictions
:type X: np.array
:param y: samples labels
:type y: np.array
:return: accuracy of the prediction
:rtype: float
"""
# sklearn check
check_is_fitted(self)
@@ -313,15 +408,25 @@ class Stree(BaseEstimator, ClassifierMixin):
return np.mean(yp == y)
def __iter__(self) -> Siterator:
"""Create an iterator to be able to visit the nodes of the tree in preorder,
can make a list with all the nodes in preorder
:return: an iterator, can for i in... and list(...)
:rtype: Siterator
"""
try:
tree = self.tree_
except:
except AttributeError:
tree = None
return Siterator(tree)
def __str__(self) -> str:
"""String representation of the tree
:return: description of nodes in the tree in preorder
:rtype: str
"""
output = ''
for i in self:
output += str(i) + '\n'
return output
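With `__iter__` and `__str__` in place, a fitted tree can be traversed in preorder, listed, or printed in one go. A hedged usage sketch (the stree import path and the toy data are assumptions):

```python
from sklearn.datasets import make_classification

from stree import Stree  # assumed import path

X, y = make_classification(random_state=0)
clf = Stree(random_state=0).fit(X, y)

for node in clf:    # Siterator yields Snode objects in preorder
    print(node)     # uses Snode.__str__

print(clf)          # same traversal rendered as one string
nodes = list(clf)   # the iterator also supports list(...)
```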

View File

@@ -1,4 +1,3 @@
import csv
import os
import unittest
@@ -22,18 +21,22 @@ class Stree_test(unittest.TestCase):
def tearDownClass(cls):
try:
os.environ.pop('TESTING')
except:
except KeyError:
pass
def _get_Xy(self):
X, y = make_classification(n_samples=1500, n_features=3, n_informative=3,
n_redundant=0, n_repeated=0, n_classes=2, n_clusters_per_class=2,
class_sep=1.5, flip_y=0, weights=[0.5, 0.5], random_state=self._random_state)
X, y = make_classification(n_samples=1500, n_features=3,
n_informative=3, n_redundant=0,
n_repeated=0, n_classes=2,
n_clusters_per_class=2, class_sep=1.5,
flip_y=0, weights=[0.5, 0.5],
random_state=self._random_state)
return X, y
def _check_tree(self, node: Snode):
"""Check recursively that the nodes that are not leaves have the correct
number of labels and its sons have the right number of elements in their dataset
"""Check recursively that the nodes that are not leaves have the
correct number of labels and its sons have the right number of elements
in their dataset
Arguments:
node {Snode} -- node to check
@@ -53,11 +56,11 @@ class Stree_test(unittest.TestCase):
for i in unique_y:
try:
number_down = count_d[i]
except:
except IndexError:
number_down = 0
try:
number_up = count_u[i]
except:
except IndexError:
number_up = 0
self.assertEqual(count_y[i], number_down + number_up)
# Is the partition made the same as the prediction?
@@ -89,7 +92,8 @@ class Stree_test(unittest.TestCase):
fx = np.delete(data, column_y, axis=1)
return fx, fy
def _find_out(self, px: np.array, x_original: np.array, y_original) -> list:
def _find_out(self, px: np.array, x_original: np.array,
y_original) -> list:
"""Find the original values of y for a given array of samples
Arguments:
@@ -128,16 +132,18 @@ class Stree_test(unittest.TestCase):
self.assertGreater(accuracy_score, 0.9)
def test_single_predict_proba(self):
"""Check that element 28 has a prediction different that the current label
"""Check that element 28 has a prediction different that the current
label
"""
# Element 28 has a different prediction than the truth
decimals = 5
prob = 0.29026400766
X, y = self._get_Xy()
yp = self._clf.predict_proba(X[28, :].reshape(-1, X.shape[1]))
self.assertEqual(np.round(1 - prob, decimals), np.round(yp[0:, 0], decimals))
self.assertEqual(np.round(1 - prob, decimals),
np.round(yp[0:, 0], decimals))
self.assertEqual(1, y[28])
self.assertAlmostEqual(
round(prob, decimals),
round(yp[0, 1], decimals),
@@ -150,11 +156,16 @@ class Stree_test(unittest.TestCase):
decimals = 5
X, y = self._get_Xy()
yp = self._clf.predict_proba(X[:num, :])
self.assertListEqual(y[:num].tolist(), np.argmax(yp[:num], axis=1).tolist())
expected_proba = [0.88395641, 0.36746962, 0.84158767, 0.34106833, 0.14269291, 0.85193236,
0.29876058, 0.7282164, 0.85958616, 0.89517877, 0.99745224, 0.18860349,
0.30756427, 0.8318412, 0.18981198, 0.15564624, 0.25740655, 0.22923355,
0.87365959, 0.49928689, 0.95574351, 0.28761257, 0.28906333, 0.32643692,
self.assertListEqual(
y[:num].tolist(), np.argmax(yp[:num], axis=1).tolist())
expected_proba = [0.88395641, 0.36746962, 0.84158767, 0.34106833,
0.14269291, 0.85193236,
0.29876058, 0.7282164, 0.85958616, 0.89517877,
0.99745224, 0.18860349,
0.30756427, 0.8318412, 0.18981198, 0.15564624,
0.25740655, 0.22923355,
0.87365959, 0.49928689, 0.95574351, 0.28761257,
0.28906333, 0.32643692,
0.29788483, 0.01657364, 0.81149083]
expected = np.round(expected_proba, decimals=decimals).tolist()
computed = np.round(yp[:, 1], decimals=decimals).tolist()
@@ -162,9 +173,10 @@ class Stree_test(unittest.TestCase):
self.assertAlmostEqual(expected[i], computed[i], decimals)
def build_models(self):
"""Build and train two models, model_clf will use the sklearn classifier to
compute predictions and split data. model_computed will use vector of
coefficients to compute both predictions and splitted data
"""Build and train two models, model_clf will use the sklearn
classifier to compute predictions and split data. model_computed will
use vector of coefficients to compute both predictions and splitted
data
"""
model_clf = Stree(random_state=self._random_state,
use_predictions=True)
@@ -176,8 +188,9 @@ class Stree_test(unittest.TestCase):
return model_clf, model_computed, X, y
def test_use_model_predict(self):
"""Check that we get the same results wether we use the estimator in nodes
to compute labels or we use the hyperplane and the position of samples wrt to it
"""Check that we get the same results wether we use the estimator in
nodes to compute labels or we use the hyperplane and the position of
samples wrt to it
"""
use_clf, use_math, X, _ = self.build_models()
self.assertListEqual(
@@ -202,14 +215,15 @@ class Stree_test(unittest.TestCase):
)
def test_single_vs_multiple_prediction(self):
"""Check if predicting sample by sample gives the same result as predicting
all samples at once
"""Check if predicting sample by sample gives the same result as
predicting all samples at once
"""
X, _ = self._get_Xy()
# Compute prediction line by line
yp_line = np.array([], dtype=int)
for xp in X:
yp_line = np.append(yp_line, self._clf.predict(xp.reshape(-1, X.shape[1])))
yp_line = np.append(yp_line, self._clf.predict(
xp.reshape(-1, X.shape[1])))
# Compute prediction at once
yp_once = self._clf.predict(X)
#
@@ -221,11 +235,15 @@ class Stree_test(unittest.TestCase):
expected = [
'root',
'root - Down',
'root - Down - Down, <cgaf> - Leaf class=1 belief=0.975989 counts=(array([0, 1]), array([ 17, 691]))',
'root - Down - Down, <cgaf> - Leaf class=1 belief= 0.975989 counts'
'=(array([0, 1]), array([ 17, 691]))',
'root - Down - Up',
'root - Down - Up - Down, <cgaf> - Leaf class=1 belief=0.750000 counts=(array([0, 1]), array([1, 3]))',
'root - Down - Up - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([7]))',
'root - Up, <cgaf> - Leaf class=0 belief=0.928297 counts=(array([0, 1]), array([725, 56]))',
'root - Down - Up - Down, <cgaf> - Leaf class=1 belief= 0.750000 '
'counts=(array([0, 1]), array([1, 3]))',
'root - Down - Up - Up, <pure> - Leaf class=0 belief= 1.000000 '
'counts=(array([0]), array([7]))',
'root - Up, <cgaf> - Leaf class=0 belief= 0.928297 counts=(array('
'[0, 1]), array([725, 56]))',
]
computed = []
for node in self._clf:
@@ -253,10 +271,10 @@ class Stree_test(unittest.TestCase):
with self.assertRaises(ValueError):
tcl = Stree(max_depth=-1)
tcl.fit(*self._get_Xy())
def test_check_max_depth(self):
depth = 3
tcl = Stree(random_state=self._random_state, max_depth=depth)
tcl.fit(*self._get_Xy())
self.assertEqual(depth, tcl.depth_)
@@ -264,6 +282,7 @@ class Stree_test(unittest.TestCase):
tcl = Stree()
self.assertEqual(0, len(list(tcl)))
class Snode_test(unittest.TestCase):
def __init__(self, *args, **kwargs):
@@ -276,19 +295,24 @@ class Snode_test(unittest.TestCase):
@classmethod
def tearDownClass(cls):
"""[summary]
"""
try:
os.environ.pop('TESTING')
except:
except KeyError:
pass
def _get_Xy(self):
X, y = make_classification(n_samples=1500, n_features=3, n_informative=3,
n_redundant=0, n_repeated=0, n_classes=2, n_clusters_per_class=2,
class_sep=1.5, flip_y=0, weights=[0.5, 0.5], random_state=self._random_state)
X, y = make_classification(n_samples=1500, n_features=3,
n_informative=3, n_redundant=0, n_classes=2,
n_repeated=0, n_clusters_per_class=2,
class_sep=1.5, flip_y=0, weights=[0.5, 0.5],
random_state=self._random_state)
return X, y
def test_attributes_in_leaves(self):
"""Check if the attributes in leaves have correct values so they form a predictor
"""Check if the attributes in leaves have correct values so they form a
predictor
"""
def check_leave(node: Snode):
@@ -303,7 +327,7 @@ class Snode_test(unittest.TestCase):
if len(classes) > 1:
try:
belief = max_card / (max_card + min_card)
except:
except ZeroDivisionError:
belief = 0.
else:
belief = 1