Compare commits

...

10 Commits

Author SHA1 Message Date
5956cd0cd2 Update google colab setup in notebooks
Update save_all in grapher to make dest. folder if it doesn't exist
2020-05-24 20:13:27 +02:00
27b278860d Fix install from scratch 2020-05-24 18:47:55 +02:00
d5d723c67f update setup.py to include tests suite 2020-05-23 23:59:03 +02:00
77f10281c1 Make project python package friendly
- Add setup.py
- Move classes to module files
- Move tests folder inside module folder
2020-05-23 23:40:33 +02:00
ac1483ae1d update requirements to allow matplotlib widget 2020-05-23 00:05:58 +02:00
e51690ed95 Implement grapher and notebook to test it 2020-05-22 19:42:13 +02:00
a4595f5815 Update notebooks and readme with cosmetic changes 2020-05-20 18:11:57 +02:00
316f84cc63 Fix precision issues in tests executed in Travis 2020-05-20 15:02:31 +02:00
6e35628c85 Grapher working 2020-05-20 14:26:55 +02:00
c0ef71f139 first approx to grapher 2020-05-20 12:32:17 +02:00
21 changed files with 980 additions and 319 deletions


@@ -10,4 +10,4 @@ notifications:
on_success: never # default: change
on_failure: always # default: always
# command to run tests
script: python -m unittest tests.Stree_test tests.Snode_test
script: python -m unittest stree.tests


@@ -4,20 +4,38 @@
Oblique Tree classifier based on SVM nodes
## Example
![Stree](https://raw.github.com/doctorado-ml/stree/master/example.png)
### Jupyter
## Installation
[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/Doctorado-ML/STree/master?urlpath=lab/tree/test.ipynb)
```bash
pip install git+https://github.com/doctorado-ml/stree
```
## Examples
### Jupyter notebooks
##### Slow launch but better integration
* [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/Doctorado-ML/STree/master?urlpath=lab/tree/test.ipynb) Test notebook
##### Fast launch, but you have to run the first commented-out cell for setup
* [![Test](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/test.ipynb) Test notebook
* [![Test2](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/test2.ipynb) Another Test notebook
* [![Test Graphics](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/test_graphs.ipynb) Test Graphics notebook
### Command line
```python
```bash
python main.py
```
## Tests
```python
python -m unittest -v tests.Stree_test tests.Snode_test
```bash
python -m unittest -v stree.tests
```
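
A minimal usage sketch, assuming the fit / score / predict_proba interface exercised in main.py and the test suite:

```python
# Minimal usage sketch: train the packaged Stree on a toy dataset.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from stree import Stree

X, y = make_classification(n_samples=1500, n_features=3, n_informative=3,
                           n_redundant=0, random_state=1)
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, random_state=1)

clf = Stree(C=1.0, random_state=1)
clf.fit(Xtrain, ytrain)
print(f"Accuracy (test): {clf.score(Xtest, ytest):.4f}")
# predict_proba: column 0 is the predicted class, column 1 the probability of class 1
print(clf.predict_proba(Xtest[:5]))
```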

199
crcard_graphs.ipynb Normal file

File diff suppressed because one or more lines are too long

3
data/.gitignore vendored

@@ -1,2 +1 @@
*.csv
*.txt
*

BIN
example.png Normal file

Binary file not shown (new image, 3.1 MiB).

11
main.py

@@ -1,6 +1,6 @@
import time
from sklearn.model_selection import train_test_split
from trees.Stree import Stree
from stree import Stree
random_state=1
@@ -50,9 +50,8 @@ print(f"Classifier's accuracy (test) : {clf.score(Xtest, ytest):.4f}")
proba = clf.predict_proba(Xtest)
print("Checking that we have correct probabilities, these are probabilities of sample belonging to class 1")
res0 = proba[proba[:, 0] == 0]
res1 = proba[proba[:, 0] == 0]
print("++++++++++res0++++++++++++")
res1 = proba[proba[:, 0] == 1]
print("++++++++++res0 > .8++++++++++++")
print(res0[res0[:, 1] > .8])
print("**********res1************")
print(res1[res1[:, 1] < .4])
print(clf.predict_proba(Xtest))
print("**********res1 < .4************")
print(res1[res1[:, 1] < .4])


@@ -1,3 +1,5 @@
numpy==1.18.2
scikit-learn==0.22.2
pandas==1.0.3
numpy
scikit-learn
pandas
matplotlib
ipympl
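
The now-unpinned dependencies can be installed straight from this file; matplotlib plus ipympl back the `%matplotlib widget` cells in the notebooks. A minimal sketch:

```bash
pip install -r requirements.txt
```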

39
setup.py Normal file

@@ -0,0 +1,39 @@
import setuptools
__version__ = "0.9rc2"
__author__ = "Ricardo Montañana Gómez"
def readme():
with open('README.md') as f:
return f.read()
setuptools.setup(
name='STree',
version=__version__,
license='MIT License',
description='Oblique decision tree with svm nodes',
long_description=readme(),
long_description_content_type='text/markdown',
packages=setuptools.find_packages(),
url='https://github.com/doctorado-ml/stree',
author=__author__,
author_email='ricardo.montanana@alu.uclm.es',
keywords='scikit-learn oblique-classifier oblique-decision-tree decision-tree svm svc',
classifiers=[
'Development Status :: 4 - Beta',
'License :: OSI Approved :: MIT License',
'Programming Language :: Python :: 3.7',
'Natural Language :: English',
'Topic :: Scientific/Engineering :: Artificial Intelligence',
'Intended Audience :: Science/Research'
],
install_requires=[
'scikit-learn>=0.23.0',
'numpy',
'matplotlib',
'ipympl'
],
test_suite="stree.tests",
zip_safe=False
)
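
A sketch of exercising the new packaging from a checkout, using the repository URL and the test suite declared above:

```bash
git clone https://github.com/doctorado-ml/stree
cd stree
pip install -e .                    # editable install of the new package
python -m unittest -v stree.tests   # the suite declared as test_suite
```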


@@ -8,21 +8,100 @@ Uses LinearSVC
'''
import typing
import os
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.svm import LinearSVC
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from trees.Snode import Snode
from trees.Siterator import Siterator
class Snode:
def __init__(self, clf: LinearSVC, X: np.ndarray, y: np.ndarray, title: str):
self._clf = clf
self._vector = None if clf is None else clf.coef_
self._interceptor = 0. if clf is None else clf.intercept_
self._title = title
self._belief = 0. # belief of the prediction in a leaf node based on samples
# Only store dataset in Testing
self._X = X if os.environ.get('TESTING', 'NS') != 'NS' else None
self._y = y
self._down = None
self._up = None
self._class = None
@classmethod
def copy(cls, node: 'Snode') -> 'Snode':
return cls(node._clf, node._X, node._y, node._title)
def set_down(self, son):
self._down = son
def set_up(self, son):
self._up = son
def is_leaf(self) -> bool:
return self._up is None and self._down is None
def get_down(self) -> 'Snode':
return self._down
def get_up(self) -> 'Snode':
return self._up
def make_predictor(self):
"""Compute the class of the predictor and its belief based on the subdataset of the node
only if it is a leaf
"""
if not self.is_leaf():
return
classes, card = np.unique(self._y, return_counts=True)
if len(classes) > 1:
max_card = max(card)
min_card = min(card)
try:
self._belief = max_card / (max_card + min_card)
except:
self._belief = 0.
self._class = classes[card == max_card][0]
else:
self._belief = 1
self._class = classes[0]
def __str__(self) -> str:
if self.is_leaf():
return f"{self._title} - Leaf class={self._class} belief={self._belief:.6f} counts={np.unique(self._y, return_counts=True)}"
else:
return f"{self._title}"
class Siterator:
"""Stree preorder iterator
"""
def __init__(self, tree: Snode):
self._stack = []
self._push(tree)
def __iter__(self):
return self
def _push(self, node: Snode):
if node is not None:
self._stack.append(node)
def __next__(self) -> Snode:
if len(self._stack) == 0:
raise StopIteration()
node = self._stack.pop()
self._push(node.get_up())
self._push(node.get_down())
return node
class Stree(BaseEstimator, ClassifierMixin):
"""
"""
def __init__(self, C=1.0, max_iter: int = 1000, random_state: int = 0, use_predictions: bool = False):
def __init__(self, C: float = 1.0, max_iter: int = 1000, random_state: int = 0, use_predictions: bool = False):
self._max_iter = max_iter
self._C = C
self._random_state = random_state
@@ -78,12 +157,14 @@ class Stree(BaseEstimator, ClassifierMixin):
def _build_predictor(self):
"""Process the leaves to make them predictors
"""
def run_tree(node: Snode):
if node.is_leaf():
node.make_predictor()
return
run_tree(node.get_down())
run_tree(node.get_up())
run_tree(self._tree)
def train(self, X: np.ndarray, y: np.ndarray, title: str = 'root') -> Snode:
@@ -122,6 +203,7 @@ class Stree(BaseEstimator, ClassifierMixin):
k, l = predict_class(d, i_d, node.get_down())
m, n = predict_class(u, i_u, node.get_up())
return np.append(k, m), np.append(l, n)
# sklearn check
check_is_fitted(self)
# Input validation
@@ -137,6 +219,7 @@ class Stree(BaseEstimator, ClassifierMixin):
:param X: dataset
:type X: np.array
"""
def predict_class(xp: np.array, indices: np.array, dist: np.array, node: Snode) -> np.array:
"""Run the tree to compute predictions
@@ -162,6 +245,7 @@ class Stree(BaseEstimator, ClassifierMixin):
k, l = predict_class(d, i_d, r_d, node.get_down())
m, n = predict_class(u, i_u, r_u, node.get_up())
return np.append(k, m), np.append(l, n)
# sklearn check
check_is_fitted(self)
# Input validation
@@ -218,5 +302,10 @@ class Stree(BaseEstimator, ClassifierMixin):
def save_sub_datasets(self):
"""Save the every dataset stored in the tree to check with manual classifier
"""
if not os.path.isdir(self.__folder):
os.mkdir(self.__folder)
with open(self.get_catalog_name(), 'w', encoding='utf-8') as catalog:
self._save_datasets(self._tree, catalog, 1)
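
A minimal sketch of the new Snode / Siterator behaviour, assuming only the constructors and accessors shown above and the exports in stree/__init__.py: a hand check of a leaf's belief and a preorder walk of a tiny tree.

```python
# Sketch: check a leaf's belief by hand and walk a tiny tree in preorder.
import numpy as np
from stree import Snode, Siterator

# Belief of a two-class leaf: majority count over the total,
# e.g. the (17, 691) leaf that appears later in the test suite.
y_leaf = np.array([0] * 17 + [1] * 691)
classes, card = np.unique(y_leaf, return_counts=True)
belief = card.max() / (card.max() + card.min())
print(classes[card == card.max()][0], round(belief, 6))  # 1 0.975989

# Preorder: a node is yielded before its "down" subtree, then its "up" subtree.
root = Snode(None, None, None, 'root')
root.set_down(Snode(None, None, None, 'root - Down'))
root.set_up(Snode(None, None, None, 'root - Up'))
print([node._title for node in Siterator(root)])
# ['root', 'root - Down', 'root - Up']
```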

184
stree/Strees_grapher.py Normal file

@@ -0,0 +1,184 @@
'''
__author__ = "Ricardo Montañana Gómez"
__copyright__ = "Copyright 2020, Ricardo Montañana Gómez"
__license__ = "MIT"
__version__ = "0.9"
Plot 3D views of nodes in Stree
'''
import os
import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
from .Strees import Stree, Snode, Siterator
class Snode_graph(Snode):
def __init__(self, node: Stree):
self._plot_size = (8, 8)
self._xlimits = (None, None)
self._ylimits = (None, None)
self._zlimits = (None, None)
n = Snode.copy(node)
super().__init__(n._clf, n._X, n._y, n._title)
def set_plot_size(self, size: tuple):
self._plot_size = size
def _is_pure(self) -> bool:
"""is considered pure a leaf node with one label
"""
if self.is_leaf():
return self._belief == 1.
return False
def set_axis_limits(self, limits: tuple):
self._xlimits = limits[0]
self._ylimits = limits[1]
self._zlimits = limits[2]
def _set_graphics_axis(self, ax: Axes3D):
ax.set_xlim(self._xlimits)
ax.set_ylim(self._ylimits)
ax.set_zlim(self._zlimits)
def save_hyperplane(self, save_folder: str = './', save_prefix: str = '', save_seq: int = 1):
_, fig = self.plot_hyperplane()
name = f"{save_folder}{save_prefix}STnode{save_seq}.png"
fig.savefig(name, bbox_inches='tight')
plt.close(fig)
def _get_cmap(self):
cmap = 'jet'
if self._is_pure():
if self._class == 1:
cmap = 'jet_r'
return cmap
def _graph_title(self):
n_class, card = np.unique(self._y, return_counts=True)
return f"{self._title} {n_class} {card}"
def plot_hyperplane(self, plot_distribution: bool = True):
fig = plt.figure(figsize=self._plot_size)
ax = fig.add_subplot(1, 1, 1, projection='3d')
if not self._is_pure():
# Can't plot the hyperplane of a single-label (pure) leaf because it has no classifier
# get the splitting hyperplane
def hyperplane(x, y): return (-self._interceptor - self._vector[0][0] * x
- self._vector[0][1] * y) / self._vector[0][2]
tmpx = np.linspace(self._X[:, 0].min(), self._X[:, 0].max())
tmpy = np.linspace(self._X[:, 1].min(), self._X[:, 1].max())
xx, yy = np.meshgrid(tmpx, tmpy)
ax.plot_surface(xx, yy, hyperplane(xx, yy), alpha=.5, antialiased=True,
rstride=1, cstride=1, cmap='seismic')
self._set_graphics_axis(ax)
if plot_distribution:
self.plot_distribution(ax)
else:
plt.title(self._graph_title())
plt.show()
return ax, fig
def plot_distribution(self, ax: Axes3D = None):
if ax is None:
fig = plt.figure(figsize=self._plot_size)
ax = fig.add_subplot(1, 1, 1, projection='3d')
plt.title(self._graph_title())
cmap = self._get_cmap()
ax.scatter(self._X[:, 0], self._X[:, 1],
self._X[:, 2], c=self._y, cmap=cmap)
ax.set_xlabel('X0')
ax.set_ylabel('X1')
ax.set_zlabel('X2')
plt.show()
class Stree_grapher(Stree):
"""Build 3d graphs of any dataset, if it's more than 3 features PCA shall
make its magic
"""
def __init__(self, params: dict):
self._plot_size = (8, 8)
self._tree_gr = None
# make Snode store X's
os.environ['TESTING'] = '1'
self._fitted = False
self._pca = None
super().__init__(**params)
def __del__(self):
try:
os.environ.pop('TESTING')
except:
pass
plt.close('all')
def _copy_tree(self, node: Snode) -> Snode_graph:
mirror = Snode_graph(node)
# clone node
mirror._class = node._class
mirror._belief = node._belief
if node.get_down() is not None:
mirror.set_down(self._copy_tree(node.get_down()))
if node.get_up() is not None:
mirror.set_up(self._copy_tree(node.get_up()))
return mirror
def fit(self, X: np.array, y: np.array) -> Stree:
"""Fit the Stree and copy the tree in a Snode_graph tree
:param X: Dataset
:type X: np.array
:param y: Labels
:type y: np.array
:return: Stree model
:rtype: Stree
"""
if X.shape[1] != 3:
self._pca = PCA(n_components=3)
X = self._pca.fit_transform(X)
res = super().fit(X, y)
self._tree_gr = self._copy_tree(self._tree)
self._fitted = True
return res
def score(self, X: np.array, y: np.array) -> float:
self._check_fitted()
if X.shape[1] != 3:
X = self._pca.transform(X)
return super().score(X, y)
def _check_fitted(self):
if not self._fitted:
raise Exception('Have to fit the grapher first!')
def save_all(self, save_folder: str = './', save_prefix: str = ''):
"""Save all the node plots in png format, each with a sequence number
:param save_folder: folder where the plots are saved, defaults to './'
:type save_folder: str, optional
"""
self._check_fitted()
if not os.path.isdir(save_folder):
os.mkdir(save_folder)
seq = 1
for node in self:
node.save_hyperplane(save_folder=save_folder,
save_prefix=save_prefix, save_seq=seq)
seq += 1
def plot_all(self):
"""Plots all the nodes
"""
self._check_fitted()
for node in self:
node.plot_hyperplane()
def __iter__(self):
return Siterator(self._tree_gr)
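
A sketch of the grapher workflow, assuming the Stree_grapher API above: with more than three features the data is PCA-reduced to three components, and save_all now creates the destination folder.

```python
# Sketch: fit the grapher and save one PNG per tree node.
from sklearn.datasets import make_classification
from stree import Stree_grapher

X, y = make_classification(n_samples=500, n_features=5, n_informative=3,
                           random_state=1)
gr = Stree_grapher(dict(C=1.0, random_state=1))
gr.fit(X, y)                           # more than 3 features: PCA reduces to 3
print(f"Score: {gr.score(X, y):.4f}")
gr.save_all(save_folder='graphs/', save_prefix='node_')  # creates graphs/ if missing
```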

2
stree/__init__.py Normal file

@@ -0,0 +1,2 @@
from .Strees import Stree, Snode, Siterator
from .Strees_grapher import Stree_grapher, Snode_graph


@@ -5,7 +5,7 @@ import unittest
import numpy as np
from sklearn.datasets import make_classification
from trees.Stree import Stree, Snode
from stree import Stree, Snode
class Stree_test(unittest.TestCase):
@@ -14,9 +14,9 @@ class Stree_test(unittest.TestCase):
os.environ['TESTING'] = '1'
self._random_state = 1
self._clf = Stree(random_state=self._random_state,
use_predictions=False)
use_predictions=False)
self._clf.fit(*self._get_Xy())
super(Stree_test, self).__init__(*args, **kwargs)
super().__init__(*args, **kwargs)
@classmethod
def tearDownClass(cls):
@@ -24,7 +24,7 @@ class Stree_test(unittest.TestCase):
os.environ.pop('TESTING')
except:
pass
def _get_Xy(self):
X, y = make_classification(n_samples=1500, n_features=3, n_informative=3,
n_redundant=0, n_repeated=0, n_classes=2, n_clusters_per_class=2,
@@ -118,7 +118,7 @@ class Stree_test(unittest.TestCase):
x_file, y_file = self._get_file_data(row[0])
y_original = np.array(self._find_out(x_file, X, y), dtype=int)
self.assertTrue(np.array_equal(y_file, y_original))
def test_single_prediction(self):
X, y = self._get_Xy()
yp = self._clf.predict((X[0, :].reshape(-1, X.shape[1])))
@@ -139,29 +139,38 @@ class Stree_test(unittest.TestCase):
accuracy_computed = sum(right) / len(y)
self.assertEqual(accuracy_score, accuracy_computed)
self.assertGreater(accuracy_score, 0.8)
def test_single_predict_proba(self):
"""Check that element 28 has a prediction different that the current label
"""
# Element 28 has a different prediction than the truth
decimals = 5
X, y = self._get_Xy()
yp = self._clf.predict_proba(X[28, :].reshape(-1, X.shape[1]))
self.assertEqual(0, yp[0:, 0])
self.assertEqual(1, y[28])
self.assertEqual(0.29026400766, round(yp[0, 1], 11))
self.assertAlmostEqual(
round(0.29026400766, decimals),
round(yp[0, 1], decimals),
decimals
)
def test_multiple_predict_proba(self):
# First 27 elements the predictions are the same as the truth
num = 27
decimals = 5
X, y = self._get_Xy()
yp = self._clf.predict_proba(X[:num, :])
self.assertListEqual(y[:num].tolist(), yp[:, 0].tolist())
expected_proba = [0.88395641, 0.36746962, 0.84158767, 0.34106833, 0.14269291, 0.85193236,
0.29876058, 0.7282164, 0.85958616, 0.89517877, 0.99745224, 0.18860349,
0.30756427, 0.8318412, 0.18981198, 0.15564624, 0.25740655, 0.22923355,
0.87365959, 0.49928689, 0.95574351, 0.28761257, 0.28906333, 0.32643692,
0.29788483, 0.01657364, 0.81149083]
self.assertListEqual(expected_proba, np.round(yp[:, 1], decimals=8).tolist())
0.29876058, 0.7282164, 0.85958616, 0.89517877, 0.99745224, 0.18860349,
0.30756427, 0.8318412, 0.18981198, 0.15564624, 0.25740655, 0.22923355,
0.87365959, 0.49928689, 0.95574351, 0.28761257, 0.28906333, 0.32643692,
0.29788483, 0.01657364, 0.81149083]
expected = np.round(expected_proba, decimals=decimals).tolist()
computed = np.round(yp[:, 1], decimals=decimals).tolist()
for i in range(len(expected)):
self.assertAlmostEqual(expected[i], computed[i], decimals)
def build_models(self):
"""Build and train two models, model_clf will use the sklearn classifier to
@@ -169,9 +178,9 @@ class Stree_test(unittest.TestCase):
coefficients to compute both predictions and split data
"""
model_clf = Stree(random_state=self._random_state,
use_predictions=True)
use_predictions=True)
model_computed = Stree(random_state=self._random_state,
use_predictions=False)
use_predictions=False)
X, y = self._get_Xy()
model_clf.fit(X, y)
model_computed.fit(X, y)
@@ -186,13 +195,13 @@ class Stree_test(unittest.TestCase):
use_clf.predict(X).tolist(),
use_math.predict(X).tolist()
)
def test_use_model_score(self):
use_clf, use_math, X, y = self.build_models()
b = use_math.score(X, y)
self.assertEqual(
use_clf.score(X, y),
b
b
)
self.assertGreater(b, .95)
@@ -217,7 +226,88 @@ class Stree_test(unittest.TestCase):
#
self.assertListEqual(yp_line.tolist(), yp_once.tolist())
def test_iterator(self):
"""Check preorder iterator
"""
expected = [
'root',
'root - Down',
'root - Down - Down, <cgaf> - Leaf class=1 belief=0.975989 counts=(array([0, 1]), array([ 17, 691]))',
'root - Down - Up',
'root - Down - Up - Down, <cgaf> - Leaf class=1 belief=0.750000 counts=(array([0, 1]), array([1, 3]))',
'root - Down - Up - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([7]))',
'root - Up, <cgaf> - Leaf class=0 belief=0.928297 counts=(array([0, 1]), array([725, 56]))',
]
computed = []
for node in self._clf:
computed.append(str(node))
self.assertListEqual(expected, computed)
class Snode_test(unittest.TestCase):
def __init__(self, *args, **kwargs):
os.environ['TESTING'] = '1'
self._random_state = 1
self._clf = Stree(random_state=self._random_state,
use_predictions=True)
self._clf.fit(*self._get_Xy())
super().__init__(*args, **kwargs)
@classmethod
def tearDownClass(cls):
try:
os.environ.pop('TESTING')
except:
pass
def _get_Xy(self):
X, y = make_classification(n_samples=1500, n_features=3, n_informative=3,
n_redundant=0, n_repeated=0, n_classes=2, n_clusters_per_class=2,
class_sep=1.5, flip_y=0, weights=[0.5, 0.5], random_state=self._random_state)
return X, y
def test_attributes_in_leaves(self):
"""Check if the attributes in leaves have correct values so they form a predictor
"""
def check_leave(node: Snode):
if not node.is_leaf():
check_leave(node.get_down())
check_leave(node.get_up())
return
# Check belief in the leaf
classes, card = np.unique(node._y, return_counts=True)
max_card = max(card)
min_card = min(card)
if len(classes) > 1:
try:
belief = max_card / (max_card + min_card)
except:
belief = 0.
else:
belief = 1
self.assertEqual(belief, node._belief)
# Check Class
class_computed = classes[card == max_card]
self.assertEqual(class_computed, node._class)
check_leave(self._clf._tree)
def test_nodes_coefs(self):
"""Check if the nodes of the tree have the right attributes filled
"""
def run_tree(node: Snode):
if node._belief < 1:
# only exclude pure leaves
self.assertIsNotNone(node._clf)
self.assertIsNotNone(node._clf.coef_)
self.assertIsNotNone(node._vector)
self.assertIsNotNone(node._interceptor)
if node.is_leaf():
return
run_tree(node.get_down())
run_tree(node.get_up())
run_tree(self._clf._tree)
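
A sketch for running the relocated tests locally; the dotted path to a single test assumes unittest's default name resolution over the new stree/tests package:

```bash
python -m unittest -v stree.tests
python -m unittest stree.tests.Strees_test.Stree_test.test_iterator
```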

1
stree/tests/__init__.py Normal file

@@ -0,0 +1 @@
from .Strees_test import Stree_test, Snode_test

File diff suppressed because one or more lines are too long


@@ -6,13 +6,10 @@
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.svm import LinearSVC\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"from sklearn.datasets import make_classification, load_iris, load_wine\n",
"from trees.Stree import Stree\n",
"import time"
"#\n",
"# Google Colab setup\n",
"#\n",
"#!pip install git+https://github.com/doctorado-ml/stree"
]
},
{
@@ -20,6 +17,22 @@
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.svm import LinearSVC\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"from sklearn.datasets import make_classification, load_iris, load_wine\n",
"from sklearn.model_selection import train_test_split\n",
"from stree import Stree\n",
"import time"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"if not os.path.isfile('data/creditcard.csv'):\n",
@@ -29,20 +42,16 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 4,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "Fraud: 0.173% 492\nValid: 99.827% 284315\nX.shape (1492, 28) y.shape (1492,)\nFraud: 32.976% 492\nValid: 67.024% 1000\n"
"text": "Fraud: 0.173% 492\nValid: 99.827% 284315\nX.shape (1492, 28) y.shape (1492,)\nFraud: 33.110% 494\nValid: 66.890% 998\n"
}
],
"source": [
"import time\n",
"from sklearn.model_selection import train_test_split\n",
"from trees.Stree import Stree\n",
"\n",
"random_state=1\n",
"\n",
"def load_creditcard(n_examples=0):\n",
@@ -84,20 +93,13 @@
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 5,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "************** C=0.001 ****************************\nClassifier's accuracy (train): 0.9550\nClassifier's accuracy (test) : 0.9487\nroot\nroot - Down\nroot - Down - Down, <cgaf> - Leaf class=1 belief=0.977346 counts=(array([0, 1]), array([ 7, 302]))\nroot - Up\nroot - Up - Down, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([1]))\nroot - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([1]))\nroot - Up - Up\nroot - Up - Up - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([2]))\nroot - Up - Up - Up, <cgaf> - Leaf class=0 belief=0.945280 counts=(array([0, 1]), array([691, 40]))\n\n**************************************************\n************** C=0.01 ****************************\nClassifier's accuracy (train): 0.9569\nClassifier's accuracy (test) : 0.9576\nroot\nroot - Down, <cgaf> - Leaf class=1 belief=0.986971 counts=(array([0, 1]), array([ 4, 303]))\nroot - Up, <cgaf> - Leaf class=0 belief=0.944369 counts=(array([0, 1]), array([696, 41]))\n\n**************************************************\n************** C=1 ****************************\nClassifier's accuracy (train): 0.9674\nClassifier's accuracy (test) : 0.9554\nroot\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([310]))\nroot - Up, <cgaf> - Leaf class=0 belief=0.953232 counts=(array([0, 1]), array([693, 34]))\nroot - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([7]))\n\n**************************************************\n************** C=5 ****************************\nClassifier's accuracy (train): 0.9693\nClassifier's accuracy (test) : 0.9487\nroot\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([310]))\nroot - Up\nroot - Up - Down, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([1]))\nroot - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([7]))\nroot - Up - Up\nroot - Up - Up - Down, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([2]))\nroot - Up - Up - Up\nroot - Up - Up - Up - Down, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([2]))\nroot - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Down\nroot - Up - Up - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([2]))\nroot - Up - Up - Up - Up - Up, <cgaf> - Leaf class=0 belief=0.955494 counts=(array([0, 1]), array([687, 32]))\nroot - Up - Up - Up - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([1]))\n\n**************************************************\n************** C=17 ****************************\nClassifier's accuracy (train): 0.9780\nClassifier's accuracy (test) : 0.9487\nroot\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([301]))\nroot - Up\nroot - Up - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([2]))\nroot - Down - Up\nroot - Down - Up - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([15]))\nroot - Up - Up\nroot - Up - Up - Down\nroot - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([3]))\nroot - Down - Up - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([15]))\nroot - Up - Up - Up, <cgaf> - Leaf class=0 belief=0.967468 counts=(array([0, 1]), array([684, 23]))\nroot - Up - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), 
array([1]))\n\n**************************************************\n0.7277 secs\n"
"text": "************** C=0.001 ****************************\nClassifier's accuracy (train): 0.9521\nClassifier's accuracy (test) : 0.9598\nroot\nroot - Down, <cgaf> - Leaf class=1 belief=0.980519 counts=(array([0, 1]), array([ 6, 302]))\nroot - Up, <cgaf> - Leaf class=0 belief=0.940217 counts=(array([0, 1]), array([692, 44]))\n\n**************************************************\n************** C=0.01 ****************************\nClassifier's accuracy (train): 0.9521\nClassifier's accuracy (test) : 0.9643\nroot\nroot - Down\nroot - Down - Down, <cgaf> - Leaf class=1 belief=0.986842 counts=(array([0, 1]), array([ 4, 300]))\nroot - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([1]))\nroot - Up, <cgaf> - Leaf class=0 belief=0.937754 counts=(array([0, 1]), array([693, 46]))\n\n**************************************************\n************** C=1 ****************************\nClassifier's accuracy (train): 0.9636\nClassifier's accuracy (test) : 0.9688\nroot\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([308]))\nroot - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([8]))\nroot - Up, <cgaf> - Leaf class=0 belief=0.947802 counts=(array([0, 1]), array([690, 38]))\n\n**************************************************\n************** C=5 ****************************\nClassifier's accuracy (train): 0.9665\nClassifier's accuracy (test) : 0.9621\nroot\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([308]))\nroot - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([11]))\nroot - Up\nroot - Up - Down\nroot - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nroot - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([1]))\nroot - Up - Up\nroot - Up - Up - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Up\nroot - Up - Up - Up - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Up - Up, <cgaf> - Leaf class=0 belief=0.951456 counts=(array([0, 1]), array([686, 35]))\n\n**************************************************\n************** C=17 ****************************\nClassifier's accuracy (train): 0.9741\nClassifier's accuracy (test) : 0.9576\nroot\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([306]))\nroot - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([10]))\nroot - Up\nroot - Up - Down\nroot - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([3]))\nroot - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([2]))\nroot - Up - Up\nroot - Up - Up - Down\nroot - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([3]))\nroot - Up - Up - Up\nroot - Up - Up - Up - Down\nroot - Up - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([2]))\nroot - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Down\nroot - Up - Up - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Up - Up - Down - Up, <pure> - Leaf class=0 
belief=1.000000 counts=(array([0]), array([2]))\nroot - Up - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Up - Down\nroot - Up - Up - Up - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([7]))\nroot - Up - Up - Up - Up - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([4]))\nroot - Up - Up - Up - Up - Up - Up, <cgaf> - Leaf class=0 belief=0.961538 counts=(array([0, 1]), array([675, 27]))\n\n**************************************************\n0.7816 secs\n"
}
],
"source": [
@@ -115,7 +117,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
@@ -138,10 +140,11 @@
{
"output_type": "stream",
"name": "stdout",
"text": "root\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([301]))\nroot - Up\nroot - Up - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([2]))\nroot - Down - Up\nroot - Down - Up - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([15]))\nroot - Up - Up\nroot - Up - Up - Down\nroot - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([3]))\nroot - Down - Up - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([15]))\nroot - Up - Up - Up, <cgaf> - Leaf class=0 belief=0.967468 counts=(array([0, 1]), array([684, 23]))\nroot - Up - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([1]))\n"
"text": "root\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([306]))\nroot - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([10]))\nroot - Up\nroot - Up - Down\nroot - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([3]))\nroot - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([2]))\nroot - Up - Up\nroot - Up - Up - Down\nroot - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([3]))\nroot - Up - Up - Up\nroot - Up - Up - Up - Down\nroot - Up - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([2]))\nroot - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Down\nroot - Up - Up - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Up - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([2]))\nroot - Up - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Up - Down\nroot - Up - Up - Up - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([7]))\nroot - Up - Up - Up - Up - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([4]))\nroot - Up - Up - Up - Up - Up - Up, <cgaf> - Leaf class=0 belief=0.961538 counts=(array([0, 1]), array([675, 27]))\n"
}
],
"source": [
"#check iterator\n",
"for i in list(clf):\n",
" print(i)"
]
@@ -154,75 +157,14 @@
{
"output_type": "stream",
"name": "stdout",
"text": "root\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([301]))\nroot - Up\nroot - Up - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([2]))\nroot - Down - Up\nroot - Down - Up - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([15]))\nroot - Up - Up\nroot - Up - Up - Down\nroot - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([3]))\nroot - Down - Up - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([15]))\nroot - Up - Up - Up, <cgaf> - Leaf class=0 belief=0.967468 counts=(array([0, 1]), array([684, 23]))\nroot - Up - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([1]))\n"
"text": "root\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([306]))\nroot - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([10]))\nroot - Up\nroot - Up - Down\nroot - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([3]))\nroot - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([2]))\nroot - Up - Up\nroot - Up - Up - Down\nroot - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([3]))\nroot - Up - Up - Up\nroot - Up - Up - Up - Down\nroot - Up - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([2]))\nroot - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Down\nroot - Up - Up - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Up - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([2]))\nroot - Up - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Up - Down\nroot - Up - Up - Up - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([7]))\nroot - Up - Up - Up - Up - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([4]))\nroot - Up - Up - Up - Up - Up - Up, <cgaf> - Leaf class=0 belief=0.961538 counts=(array([0, 1]), array([675, 27]))\n"
}
],
"source": [
"#check iterator again\n",
"for i in clf:\n",
" print(i)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": "Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …",
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "0025f832c1734afc944021e5990c2d11"
}
},
"metadata": {}
}
],
"source": [
"%matplotlib widget\n",
"from mpl_toolkits.mplot3d import Axes3D\n",
"import matplotlib.pyplot as plt\n",
"from matplotlib import cm\n",
"from matplotlib.ticker import LinearLocator, FormatStrFormatter\n",
"import numpy as np\n",
"\n",
"fig = plt.figure()\n",
"ax = fig.gca(projection='3d')\n",
"\n",
"scale = 8\n",
"# Make data.\n",
"X = np.arange(-scale, scale, 0.25)\n",
"Y = np.arange(-scale, scale, 0.25)\n",
"X, Y = np.meshgrid(X, Y)\n",
"Z = X**2 + Y**2\n",
"\n",
"# Plot the surface.\n",
"surf = ax.plot_surface(X, Y, Z, cmap=cm.coolwarm,\n",
" linewidth=0, antialiased=False)\n",
"\n",
"# Customize the z axis.\n",
"ax.set_zlim(0, 100)\n",
"ax.zaxis.set_major_locator(LinearLocator(10))\n",
"ax.zaxis.set_major_formatter(FormatStrFormatter('%.02f'))\n",
"\n",
"# rotate the axes and update\n",
"#for angle in range(0, 360):\n",
"# ax.view_init(30, 40)\n",
"\n",
"# Add a color bar which maps values to colors.\n",
"fig.colorbar(surf, shrink=0.5, aspect=5)\n",
"\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {

261
test_graphs.ipynb Normal file

File diff suppressed because one or more lines are too long


@@ -1,72 +0,0 @@
import os
import unittest
import numpy as np
from sklearn.datasets import make_classification
from trees.Stree import Stree, Snode
class Snode_test(unittest.TestCase):
def __init__(self, *args, **kwargs):
os.environ['TESTING'] = '1'
self._random_state = 1
self._clf = Stree(random_state=self._random_state,
use_predictions=True)
self._clf.fit(*self._get_Xy())
super(Snode_test, self).__init__(*args, **kwargs)
@classmethod
def tearDownClass(cls):
try:
os.environ.pop('TESTING')
except:
pass
def _get_Xy(self):
X, y = make_classification(n_samples=1500, n_features=3, n_informative=3,
n_redundant=0, n_repeated=0, n_classes=2, n_clusters_per_class=2,
class_sep=1.5, flip_y=0, weights=[0.5, 0.5], random_state=self._random_state)
return X, y
def test_attributes_in_leaves(self):
"""Check if the attributes in leaves have correct values so they form a predictor
"""
def check_leave(node: Snode):
if not node.is_leaf():
check_leave(node.get_down())
check_leave(node.get_up())
return
# Check Belief in leave
classes, card = np.unique(node._y, return_counts=True)
max_card = max(card)
min_card = min(card)
if len(classes) > 1:
try:
belief = max_card / (max_card + min_card)
except:
belief = 0.
else:
belief = 1
self.assertEqual(belief, node._belief)
# Check Class
class_computed = classes[card == max_card]
self.assertEqual(class_computed, node._class)
check_leave(self._clf._tree)
def test_nodes_coefs(self):
"""Check if the nodes of the tree have the right attributes filled
"""
def run_tree(node: Snode):
if node._belief < 1:
# only exclude pure leaves
self.assertIsNotNone(node._clf)
self.assertIsNotNone(node._clf.coef_)
self.assertIsNotNone(node._vector)
self.assertIsNotNone(node._interceptor)
if node.is_leaf():
return
run_tree(node.get_down())
run_tree(node.get_up())
run_tree(self._clf._tree)


@@ -1,34 +0,0 @@
'''
__author__ = "Ricardo Montañana Gómez"
__copyright__ = "Copyright 2020, Ricardo Montañana Gómez"
__license__ = "MIT"
__version__ = "0.9"
Inorder iterator for the binary tree of Snodes
Uses LinearSVC
'''
from trees.Snode import Snode
class Siterator:
"""Inorder iterator
"""
def __init__(self, tree: Snode):
self._stack = []
self._push(tree)
def __iter__(self):
return self
def _push(self, node: Snode):
while (node is not None):
self._stack.insert(0, node)
node = node.get_down()
def __next__(self) -> Snode:
if len(self._stack) == 0:
raise StopIteration()
node = self._stack.pop()
self._push(node.get_up())
return node


@@ -1,70 +0,0 @@
'''
__author__ = "Ricardo Montañana Gómez"
__copyright__ = "Copyright 2020, Ricardo Montañana Gómez"
__license__ = "MIT"
__version__ = "0.9"
Node of the Stree (binary tree)
'''
import os
import numpy as np
from sklearn.svm import LinearSVC
class Snode:
def __init__(self, clf: LinearSVC, X: np.ndarray, y: np.ndarray, title: str):
self._clf = clf
self._vector = None if clf is None else clf.coef_
self._interceptor = 0. if clf is None else clf.intercept_
self._title = title
self._belief = 0. # belief of the prediction in a leaf node based on samples
# Only store dataset in Testing
self._X = X if os.environ.get('TESTING', 'NS') != 'NS' else None
self._y = y
self._down = None
self._up = None
self._class = None
def set_down(self, son):
self._down = son
def set_up(self, son):
self._up = son
def is_leaf(self,) -> bool:
return self._up is None and self._down is None
def get_down(self) -> 'Snode':
return self._down
def get_up(self) -> 'Snode':
return self._up
def make_predictor(self):
"""Compute the class of the predictor and its belief based on the subdataset of the node
only if it is a leaf
"""
# Clean memory
#self._X = None
#self._y = None
if not self.is_leaf():
return
classes, card = np.unique(self._y, return_counts=True)
if len(classes) > 1:
max_card = max(card)
min_card = min(card)
try:
self._belief = max_card / (max_card + min_card)
except:
self._belief = 0.
self._class = classes[card == max_card][0]
else:
self._belief = 1
self._class = classes[0]
def __str__(self) -> str:
if self.is_leaf():
return f"{self._title} - Leaf class={self._class} belief={self._belief:.6f} counts={np.unique(self._y, return_counts=True)}"
else:
return f"{self._title}"
