compute predictor and store model in node

2020-05-13 00:12:05 +02:00
parent 371257c121
commit c4de782a3f
8 changed files with 263 additions and 68 deletions
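
In short: Stree.fit() now ends with a _predictor() pass that turns every leaf into a predictor (a majority class in _class plus a belief value in _belief), each Snode stores the fitted LinearSVC in _model instead of only its coefficients, and Stree.__str__ replaces show_outcomes(). A minimal usage sketch of the new behaviour, assuming the trees package from this repository is importable (the dataset parameters are illustrative):

from sklearn.datasets import make_classification
from trees.Stree import Stree

# Toy binary problem, similar to the one used in the scripts below.
X, y = make_classification(n_samples=1500, n_features=3, n_informative=3,
                           n_redundant=0, n_classes=2, random_state=1)
model = Stree(random_state=1)
model.fit(X, y)   # fit() now calls _predictor(), so leaves get _class and _belief
print(model)      # __str__ renders the whole tree as text (replaces show_outcomes())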

.gitignore (2 changed lines)

@@ -129,6 +129,4 @@ dmypy.json
 .pyre/
 .idea
-data/*
 .vscode

data/.gitignore (new file, 2 lines)

@@ -0,0 +1,2 @@
*.csv
*.txt


@@ -7,5 +7,5 @@ X, y = make_classification(n_samples=1500, n_features=3, n_informative=3,
     class_sep=1.5, flip_y=0, weights=[0.5, 0.5], random_state=random_state)
 model = Stree(random_state=random_state)
 model.fit(X, y)
-model.show_outcomes()
+print(model)
 model.save_sub_datasets()


@@ -2,7 +2,7 @@
"cells": [ "cells": [
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 2, "execution_count": 1,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@@ -98,9 +98,134 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 1,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "[LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n verbose=0), LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n verbose=0), LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n verbose=0), LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n verbose=0), LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n verbose=0)]\n"
}
],
"source": [
"from sklearn.svm import LinearSVC\n",
"\n",
"data = []\n",
"for i in range(5):\n",
" model = LinearSVC()\n",
" data.append(model)\n",
"\n",
"print(data)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "4\n"
},
{
"output_type": "error",
"ename": "NameError",
"evalue": "name 'gato' is not defined",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-3-04351d05a6f0>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpato\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 7\u001b[0;31m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mgato\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m4\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;31mNameError\u001b[0m: name 'gato' is not defined"
]
}
],
"source": [
"def pato(k):\n",
" def gato(m, u):\n",
" return m * u\n",
" return gato(k, k)\n",
"\n",
"print(pato(2))\n",
"print(gato(3,4))"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "7\n"
}
],
"source": [
"try:\n",
" a= max(5,3)/min(0,1)\n",
"except:\n",
" a=7\n",
"print(a)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"output_type": "error",
"ename": "SyntaxError",
"evalue": "invalid syntax (<ipython-input-6-65e24c447a24>, line 1)",
"traceback": [
"\u001b[0;36m File \u001b[0;32m\"<ipython-input-6-65e24c447a24>\"\u001b[0;36m, line \u001b[0;32m1\u001b[0m\n\u001b[0;31m max([2 5])\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m invalid syntax\n"
]
}
],
"source": [
"max([2 5])"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [
"y=[1,2,4,5,5,5,5,3,3,3,2,]"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"a,b = np.unique(y, return_counts=True)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "11"
},
"metadata": {},
"execution_count": 12
}
],
"source": [
"np.count_nonzero(y)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [] "source": []
} }
], ],
@@ -119,8 +244,8 @@
 },
 "orig_nbformat": 2,
 "kernelspec": {
-"name": "python37664bitstreevenva9e4a4efdc1042b6b577bd15fbe145ee",
-"display_name": "Python 3.7.6 64-bit ('stree': venv)"
+"name": "python37664bitgeneralvenvfbd0a23e74cf4e778460f5ffc6761f39",
+"display_name": "Python 3.7.6 64-bit ('general': venv)"
 }
 },
 "nbformat": 4,

tests/Snode_test.py (new file, 45 lines)

@@ -0,0 +1,45 @@
import unittest

from sklearn.datasets import make_classification
import numpy as np
import csv

from trees.Stree import Stree, Snode


class Snode_test(unittest.TestCase):

    def __init__(self, *args, **kwargs):
        self._random_state = 1
        self._model = Stree(random_state=self._random_state,
                            use_predictions=True)
        self._model.fit(*self._get_Xy())
        super(Snode_test, self).__init__(*args, **kwargs)

    def _get_Xy(self):
        X, y = make_classification(n_samples=1500, n_features=3, n_informative=3,
                                   n_redundant=0, n_repeated=0, n_classes=2, n_clusters_per_class=2,
                                   class_sep=1.5, flip_y=0, weights=[0.5, 0.5], random_state=self._random_state)
        return X, y

    def test_attributes_in_leaves(self):
        """Check if the attributes in leaves have correct values so they form a predictor
        """
        def check_leave(node: Snode):
            if node.is_leaf():
                # Check Belief
                classes, card = np.unique(node._y, return_counts=True)
                max_card = max(card)
                min_card = min(card)
                try:
                    accuracy = max_card / min_card
                except:
                    accuracy = 0
                self.assertEqual(accuracy, node._belief)
                # Check Class
                class_computed = classes[card == max_card]
                self.assertEqual(class_computed, node._class)
                return
            check_leave(node.get_down())
            check_leave(node.get_up())

        check_leave(self._model._tree)

tests/Stree_test.py

@@ -1,19 +1,19 @@
 import unittest
-from sklearn.svm import LinearSVC
 from sklearn.datasets import make_classification
 import numpy as np
 import csv
 from trees.Stree import Stree, Snode
 class Stree_test(unittest.TestCase):
     def __init__(self, *args, **kwargs):
         self._random_state = 1
-        self._model_tree = Stree(random_state=self._random_state, use_predictions=True)
-        self._model_tree.fit(*self._get_Xy())
-        self._model_svm = LinearSVC(random_state=self._random_state, max_iter=self._model_tree._max_iter)
+        self._model = Stree(random_state=self._random_state,
+                            use_predictions=True)
+        self._model.fit(*self._get_Xy())
         super(Stree_test, self).__init__(*args, **kwargs)
     def _get_Xy(self):
@@ -22,14 +22,10 @@ class Stree_test(unittest.TestCase):
             class_sep=1.5, flip_y=0, weights=[0.5, 0.5], random_state=self._random_state)
         return X, y
-    def test_split_data(self):
-        self.assertTrue(True)
     def _check_tree(self, node: Snode):
         if node.is_leaf():
             return
-        self._model_svm.fit(node._X, node._y)
-        y_prediction = self._model_svm.predict(node._X)
+        y_prediction = node._model.predict(node._X)
         y_down = node.get_down()._y
         y_up = node.get_up()._y
         # Is it a correct partition in terms of cardinality?
@@ -59,7 +55,7 @@ class Stree_test(unittest.TestCase):
     def test_build_tree(self):
         """Check if the tree is built the same way as predictions of models
         """
-        self._check_tree(self._model_tree._tree)
+        self._check_tree(self._model._tree)
     def _get_file_data(self, file_name: str) -> tuple:
         """Return X, y from data, y is the last column in array
@@ -98,11 +94,11 @@ class Stree_test(unittest.TestCase):
     def test_subdatasets(self):
         """Check if the subdatasets files have the same predictions as the tree itself
         """
-        model = LinearSVC(random_state=self._random_state, max_iter=self._model_tree._max_iter)
+        model = self._model._tree._model
         X, y = self._get_Xy()
         model.fit(X, y)
-        self._model_tree.save_sub_datasets()
-        with open(self._model_tree.get_catalog_name()) as cat_file:
+        self._model.save_sub_datasets()
+        with open(self._model.get_catalog_name()) as cat_file:
             catalog = csv.reader(cat_file, delimiter=',')
             for row in catalog:
                 X, y = self._get_Xy()
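
The rewritten _check_tree relies on the LinearSVC stored in each node: the node's own predictions should split its samples into exactly the cardinalities of the two children. A standalone sketch of that cardinality check (toy arrays; which predicted label is routed to the "down" or "up" child is an assumption here, the diff does not show _split_data):

import numpy as np

# Hypothetical predictions of a node's stored model over that node's samples.
y_prediction = np.array([1, 0, 1, 1, 0, 1])

# Pretend children produced by the split for those predictions.
y_down = np.array([0, 1, 1, 1])  # the four samples predicted as 1
y_up = np.array([1, 0])          # the two samples predicted as 0

# The two children together must account for every sample, and each child's
# size must match the count of the corresponding predicted label.
assert y_down.shape[0] + y_up.shape[0] == y_prediction.shape[0]
assert y_down.shape[0] == np.count_nonzero(y_prediction == 1)
assert y_up.shape[0] == np.count_nonzero(y_prediction == 0)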

trees/Snode.py

@@ -3,16 +3,20 @@ __author__ = "Ricardo Montañana Gómez"
 __copyright__ = "Copyright 2020, Ricardo Montañana Gómez"
 __license__ = "MIT"
 __version__ = "1.0"
-Node of the Stree
+Node of the Stree (binary tree)
 '''
 import numpy as np
+from sklearn.svm import LinearSVC
 class Snode:
-    def __init__(self, vector: np.ndarray, interceptor: float, X: np.ndarray, y: np.ndarray, title: str):
-        self._vector = vector
-        self._interceptor = interceptor
+    def __init__(self, model: LinearSVC, X: np.ndarray, y: np.ndarray, title: str):
+        self._model = model
+        self._vector = None if model is None else model.coef_
+        self._interceptor = 0 if model is None else model.intercept_
         self._title = title
+        self._belief = 0  # belief of the prediction in a leaf node based on samples
         self._X = X
         self._y = y
         self._down = None
@@ -34,6 +38,21 @@ class Snode:
     def get_up(self) -> 'Snode':
         return self._up
+    def make_predictor(self):
+        """Compute the class of the predictor and its belief based on the subdataset of the node
+        only if it is a leaf
+        """
+        if not self.is_leaf():
+            return
+        classes, card = np.unique(self._y, return_counts=True)
+        max_card = max(card)
+        min_card = min(card)
+        try:
+            self._belief = max_card / min_card
+        except:
+            self._belief = 0
+        self._class = classes[card == max_card]
     def __str__(self) -> str:
         if self.is_leaf():
             num = 0
@@ -41,9 +60,6 @@ class Snode:
                 num = max(num, self._y[self._y == i].shape[0])
             den = self._y.shape[0]
             accuracy = num / den if den != 0 else 1
-            return f"{self._title} LEAF accuracy={accuracy:.2f}"
+            return f"{self._title} LEAF accuracy={accuracy:.2f}\n"
         else:
-            return self._title
+            return f"{self._title}\n"
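
make_predictor boils down to one np.unique call plus a ratio; isolated here as a toy sketch (the label vector is made up, the calls mirror the new code above):

import numpy as np

y = np.array([0, 0, 0, 1])                        # labels reaching a hypothetical leaf
classes, card = np.unique(y, return_counts=True)  # classes -> [0 1], card -> [3 1]
belief = max(card) / min(card)                    # 3.0 for this vector
predicted_class = classes[card == max(card)]      # array([0]), the majority class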

trees/Stree.py

@@ -13,9 +13,11 @@ from sklearn.svm import LinearSVC
 from trees.Snode import Snode
 class Stree:
     """
     """
     def __init__(self, max_iter: int = 1000, random_state: int = 0, use_predictions: bool = False):
         self._max_iter = max_iter
         self._random_state = random_state
@@ -44,34 +46,48 @@ class Stree:
     def fit(self, X: np.ndarray, y: np.ndarray, title: str = 'root') -> 'Stree':
         self._tree = self.train(X, y, title)
+        self._predictor()
         return self
+    def _predictor(self):
+        """Process the leaves to make them predictors
+        """
+        def run_tree(node: Snode):
+            if node.is_leaf():
+                node.make_predictor()
+                return
+            run_tree(node.get_down())
+            run_tree(node.get_up())
+        run_tree(self._tree)
     def train(self, X: np.ndarray, y: np.ndarray, title: str = 'root') -> Snode:
         if np.unique(y).shape[0] == 1:
             # only 1 class => pure dataset
-            return Snode(np.array([]), 0, X, y, title + f', class={np.unique(y)}, items={y.shape[0]}, rest=0, <pure> ')
+            return Snode(None, X, y, title + f', class={np.unique(y)}, items={y.shape[0]}, rest=0, <pure> ')
         # Train the model
-        clf = LinearSVC(max_iter=self._max_iter, random_state=self._random_state)
+        clf = LinearSVC(max_iter=self._max_iter,
+                        random_state=self._random_state)
         clf.fit(X, y)
-        tree = Snode(clf.coef_, clf.intercept_, X, y, title)
+        tree = Snode(clf, X, y, title)
         X_U, y_u, X_D, y_d = self._split_data(clf, X, y)
         if X_U is None or X_D is None:
             # didn't part anything
-            return Snode(clf.coef_, clf.intercept_, X, y, title + f', classes={np.unique(y)}, items<0>={y[y==0].shape[0]}, items<1>={y[y==1].shape[0]}, <couldn\'t go any further>')
-        tree.set_up(self.train(X_U, y_u, title + ' - Up' + str(np.unique(y_u, return_counts=True))))
-        tree.set_down(self.train(X_D, y_d, title + ' - Down' + str(np.unique(y_d, return_counts=True))))
+            return Snode(clf, X, y, title + f', classes={np.unique(y)}, items<0>={y[y==0].shape[0]}, items<1>={y[y==1].shape[0]}, <couldn\'t go any further>')
+        tree.set_up(self.train(X_U, y_u, title + ' - Up' +
+                               str(np.unique(y_u, return_counts=True))))
+        tree.set_down(self.train(X_D, y_d, title + ' - Down' +
+                                 str(np.unique(y_d, return_counts=True))))
         return tree
-    def _print_tree(self, tree: Snode):
-        print(tree)
-        if tree.is_leaf():
-            return
-        self._print_tree(tree.get_down())
-        self._print_tree(tree.get_up())
-    def show_outcomes(self):
-        pointer = self._tree
-        self._print_tree(pointer)
+    def __str__(self):
+        def print_tree(tree: Snode) -> str:
+            output = str(tree)
+            if tree.is_leaf():
+                return output
+            output += print_tree(tree.get_down())
+            output += print_tree(tree.get_up())
+            return output
+        return print_tree(self._tree)
     def _save_datasets(self, tree: Snode, catalog: typing.TextIO, number: int):
         """Save the dataset of the node in a csv file
@@ -83,7 +99,7 @@ class Stree:
         data = np.append(tree._X, tree._y.reshape(-1, 1), axis=1)
         name = f"{self.__folder}dataset{number}.csv"
         np.savetxt(name, data, delimiter=",")
-        catalog.write(f"{name}, - {str(tree)}\n")
+        catalog.write(f"{name}, - {str(tree)}")
         if tree.is_leaf():
             return
         self._save_datasets(tree.get_down(), catalog, number + 1)
@@ -95,8 +111,5 @@ class Stree:
     def save_sub_datasets(self):
         """Save every dataset stored in the tree to check with manual classifier
         """
-        pointer = self._tree
         with open(self.get_catalog_name(), 'w', encoding='utf-8') as catalog:
-            self._save_datasets(pointer, catalog, 1)
+            self._save_datasets(self._tree, catalog, 1)
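
Both new traversals in Stree (_predictor and __str__) use the same down-then-up recursion; a condensed standalone sketch of the pattern (this Node class is invented for illustration, it is not the repository's Snode):

class Node:
    def __init__(self, title, down=None, up=None):
        self.title, self.down, self.up = title, down, up

    def is_leaf(self):
        return self.down is None and self.up is None

def collect(node: Node) -> str:
    # Same shape as Stree.__str__: emit this node, then recurse down, then up.
    output = node.title + "\n"
    if node.is_leaf():
        return output
    output += collect(node.down)
    output += collect(node.up)
    return output

tree = Node('root', down=Node('root - Down'), up=Node('root - Up'))
print(collect(tree), end='')   # prints root, root - Down, root - Up on separate lines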