From c4de782a3f2f0c61c963731ef669b29c50b509bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?= Date: Wed, 13 May 2020 00:12:05 +0200 Subject: [PATCH] compute predictor and store model in node --- .gitignore | 2 - data/.gitignore | 2 + main.py | 10 ++-- test.ipynb | 133 ++++++++++++++++++++++++++++++++++++++++++-- tests/Snode_test.py | 45 +++++++++++++++ tests/Stree_test.py | 34 +++++------ trees/Snode.py | 44 ++++++++++----- trees/Stree.py | 61 ++++++++++++-------- 8 files changed, 263 insertions(+), 68 deletions(-) create mode 100644 data/.gitignore create mode 100644 tests/Snode_test.py diff --git a/.gitignore b/.gitignore index 343d3fc..ae603c4 100644 --- a/.gitignore +++ b/.gitignore @@ -129,6 +129,4 @@ dmypy.json .pyre/ .idea -data/* - .vscode \ No newline at end of file diff --git a/data/.gitignore b/data/.gitignore new file mode 100644 index 0000000..63400aa --- /dev/null +++ b/data/.gitignore @@ -0,0 +1,2 @@ +*.csv +*.txt \ No newline at end of file diff --git a/main.py b/main.py index a88c6e5..d0f0944 100644 --- a/main.py +++ b/main.py @@ -2,10 +2,10 @@ from trees.Stree import Stree from sklearn.datasets import make_classification random_state = 1 -X, y = make_classification(n_samples=1500, n_features=3, n_informative=3, - n_redundant=0, n_repeated=0, n_classes=2, n_clusters_per_class=2, - class_sep=1.5, flip_y=0,weights=[0.5,0.5], random_state=random_state) +X, y = make_classification(n_samples=1500, n_features=3, n_informative=3, + n_redundant=0, n_repeated=0, n_classes=2, n_clusters_per_class=2, + class_sep=1.5, flip_y=0, weights=[0.5, 0.5], random_state=random_state) model = Stree(random_state=random_state) model.fit(X, y) -model.show_outcomes() -model.save_sub_datasets() \ No newline at end of file +print(model) +model.save_sub_datasets() diff --git a/test.ipynb b/test.ipynb index dc1d2c8..abcde1f 100644 --- a/test.ipynb +++ b/test.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -98,9 +98,134 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "[LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n verbose=0), LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n verbose=0), LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n verbose=0), LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n verbose=0), LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n verbose=0)]\n" + } + ], + "source": [ + "from sklearn.svm import LinearSVC\n", + "\n", + "data = []\n", + "for i in range(5):\n", + " model = LinearSVC()\n", + " data.append(model)\n", + "\n", + "print(data)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + 
"outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "4\n" + }, + { + "output_type": "error", + "ename": "NameError", + "evalue": "name 'gato' is not defined", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpato\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 7\u001b[0;31m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mgato\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m4\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;31mNameError\u001b[0m: name 'gato' is not defined" + ] + } + ], + "source": [ + "def pato(k):\n", + " def gato(m, u):\n", + " return m * u\n", + " return gato(k, k)\n", + "\n", + "print(pato(2))\n", + "print(gato(3,4))" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "7\n" + } + ], + "source": [ + "try:\n", + " a= max(5,3)/min(0,1)\n", + "except:\n", + " a=7\n", + "print(a)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "output_type": "error", + "ename": "SyntaxError", + "evalue": "invalid syntax (, line 1)", + "traceback": [ + "\u001b[0;36m File \u001b[0;32m\"\"\u001b[0;36m, line \u001b[0;32m1\u001b[0m\n\u001b[0;31m max([2 5])\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m invalid syntax\n" + ] + } + ], + "source": [ + "max([2 5])" + ] + }, + { + "cell_type": "code", + "execution_count": 7, "metadata": {}, "outputs": [], + "source": [ + "y=[1,2,4,5,5,5,5,3,3,3,2,]" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "a,b = np.unique(y, return_counts=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": "11" + }, + "metadata": {}, + "execution_count": 12 + } + ], + "source": [ + "np.count_nonzero(y)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, "source": [] } ], @@ -119,8 +244,8 @@ }, "orig_nbformat": 2, "kernelspec": { - "name": "python37664bitstreevenva9e4a4efdc1042b6b577bd15fbe145ee", - "display_name": "Python 3.7.6 64-bit ('stree': venv)" + "name": "python37664bitgeneralvenvfbd0a23e74cf4e778460f5ffc6761f39", + "display_name": "Python 3.7.6 64-bit ('general': venv)" } }, "nbformat": 4, diff --git a/tests/Snode_test.py b/tests/Snode_test.py new file mode 100644 index 0000000..34c1620 --- /dev/null +++ b/tests/Snode_test.py @@ -0,0 +1,45 @@ +import unittest + +from sklearn.datasets import make_classification +import numpy as np +import csv + +from trees.Stree import Stree, Snode + + +class Snode_test(unittest.TestCase): + + def __init__(self, *args, **kwargs): + self._random_state = 1 + self._model = Stree(random_state=self._random_state, + use_predictions=True) + self._model.fit(*self._get_Xy()) + super(Snode_test, self).__init__(*args, **kwargs) + + def _get_Xy(self): + X, y = 
make_classification(n_samples=1500, n_features=3, n_informative=3,
+                                   n_redundant=0, n_repeated=0, n_classes=2, n_clusters_per_class=2,
+                                   class_sep=1.5, flip_y=0, weights=[0.5, 0.5], random_state=self._random_state)
+        return X, y
+
+    def test_attributes_in_leaves(self):
+        """Check if the attributes in leaves have correct values so they form a predictor
+        """
+        def check_leave(node: Snode):
+            if node.is_leaf():
+                # Check Belief
+                classes, card = np.unique(node._y, return_counts=True)
+                max_card = max(card)
+                min_card = min(card)
+                try:
+                    accuracy = max_card / min_card
+                except:
+                    accuracy = 0
+                self.assertEqual(accuracy, node._belief)
+                # Check Class
+                class_computed = classes[card == max_card]
+                self.assertEqual(class_computed, node._class)
+                return
+            check_leave(node.get_down())
+            check_leave(node.get_up())
+        check_leave(self._model._tree)
diff --git a/tests/Stree_test.py b/tests/Stree_test.py
index 088d3fd..e56bb9a 100644
--- a/tests/Stree_test.py
+++ b/tests/Stree_test.py
@@ -1,35 +1,31 @@
 import unittest
 
-from sklearn.svm import LinearSVC
 from sklearn.datasets import make_classification
 import numpy as np
 import csv
 
 from trees.Stree import Stree, Snode
 
+
 class Stree_test(unittest.TestCase):
 
     def __init__(self, *args, **kwargs):
         self._random_state = 1
-        self._model_tree = Stree(random_state=self._random_state, use_predictions=True)
-        self._model_tree.fit(*self._get_Xy())
-        self._model_svm = LinearSVC(random_state=self._random_state, max_iter=self._model_tree._max_iter)
+        self._model = Stree(random_state=self._random_state,
+                            use_predictions=True)
+        self._model.fit(*self._get_Xy())
         super(Stree_test, self).__init__(*args, **kwargs)
 
     def _get_Xy(self):
-        X, y = make_classification(n_samples=1500, n_features=3, n_informative=3,
-            n_redundant=0, n_repeated=0, n_classes=2, n_clusters_per_class=2,
-            class_sep=1.5, flip_y=0,weights=[0.5,0.5], random_state=self._random_state)
+        X, y = make_classification(n_samples=1500, n_features=3, n_informative=3,
+                                   n_redundant=0, n_repeated=0, n_classes=2, n_clusters_per_class=2,
+                                   class_sep=1.5, flip_y=0, weights=[0.5, 0.5], random_state=self._random_state)
         return X, y
-    
-    def test_split_data(self):
-        self.assertTrue(True)
 
     def _check_tree(self, node: Snode):
         if node.is_leaf():
             return
-        self._model_svm.fit(node._X, node._y)
-        y_prediction = self._model_svm.predict(node._X)
+        y_prediction = node._model.predict(node._X)
         y_down = node.get_down()._y
         y_up = node.get_up()._y
         # Is a correct partition in terms of cardinality?
@@ -59,7 +55,7 @@ class Stree_test(unittest.TestCase):
     def test_build_tree(self):
         """Check if the tree is built the same way as predictions of models
         """
-        self._check_tree(self._model_tree._tree)
+        self._check_tree(self._model._tree)
 
     def _get_file_data(self, file_name: str) -> tuple:
         """Return X, y from data, y is the last column in array
@@ -69,7 +65,7 @@ class Stree_test(unittest.TestCase):
 
         Returns:
             tuple -- tuple with samples, categories
-        """    
+        """
         data = np.genfromtxt(file_name, delimiter=',')
         data = np.array(data)
         column_y = data.shape[1] - 1
@@ -87,22 +83,22 @@ class Stree_test(unittest.TestCase):
 
         Returns:
             np.array -- classes of the given samples
-        """    
+        """
         res = []
         for needle in px:
             for row in range(x_original.shape[0]):
                 if all(x_original[row, :] == needle):
                     res.append(y_original[row])
         return res
-    
+
     def test_subdatasets(self):
         """Check if the subdataset files have the same predictions as the tree itself
         """
-        model = LinearSVC(random_state=self._random_state, max_iter=self._model_tree._max_iter)
+        model = self._model._tree._model
         X, y = self._get_Xy()
         model.fit(X, y)
-        self._model_tree.save_sub_datasets()
-        with open(self._model_tree.get_catalog_name()) as cat_file:
+        self._model.save_sub_datasets()
+        with open(self._model.get_catalog_name()) as cat_file:
             catalog = csv.reader(cat_file, delimiter=',')
             for row in catalog:
                 X, y = self._get_Xy()
diff --git a/trees/Snode.py b/trees/Snode.py
index 2591c26..70f0070 100644
--- a/trees/Snode.py
+++ b/trees/Snode.py
@@ -3,47 +3,63 @@
 __author__ = "Ricardo Montañana Gómez"
 __copyright__ = "Copyright 2020, Ricardo Montañana Gómez"
 __license__ = "MIT"
 __version__ = "1.0"
-Node of the Stree
+Node of the Stree (binary tree)
 '''
 import numpy as np
+from sklearn.svm import LinearSVC
+
 
 class Snode:
-    def __init__(self, vector: np.ndarray, interceptor: float, X: np.ndarray, y: np.ndarray, title: str):
-        self._vector = vector
-        self._interceptor = interceptor
+    def __init__(self, model: LinearSVC, X: np.ndarray, y: np.ndarray, title: str):
+        self._model = model
+        self._vector = None if model is None else model.coef_
+        self._interceptor = 0 if model is None else model.intercept_
         self._title = title
+        self._belief = 0  # belief of the prediction in a leaf node based on samples
         self._X = X
         self._y = y
         self._down = None
        self._up = None
-        self._class = None  # really needed?
-    
+        self._class = None  # really needed?
+
     def set_down(self, son):
         self._down = son
-    
+
     def set_up(self, son):
         self._up = son
 
     def is_leaf(self,) -> bool:
         return self._up is None and self._down is None
-    
+
     def get_down(self) -> 'Snode':
         return self._down
 
     def get_up(self) -> 'Snode':
         return self._up
 
+    def make_predictor(self):
+        """Compute the class of the predictor and its belief based on the subdataset of the node
+        only if it is a leaf
+        """
+        if not self.is_leaf():
+            return
+        classes, card = np.unique(self._y, return_counts=True)
+        max_card = max(card)
+        min_card = min(card)
+        try:
+            self._belief = max_card / min_card
+        except:
+            self._belief = 0
+        self._class = classes[card == max_card]
+
     def __str__(self) -> str:
         if self.is_leaf():
             num = 0
             for i in np.unique(self._y):
                 num = max(num, self._y[self._y == i].shape[0])
             den = self._y.shape[0]
-            accuracy = num / den if den != 0 else 1
-            return f"{self._title} LEAF accuracy={accuracy:.2f}"
+            accuracy = num / den if den != 0 else 1
+            return f"{self._title} LEAF accuracy={accuracy:.2f}\n"
         else:
-            return self._title
-
-
-    
\ No newline at end of file
+            return f"{self._title}\n"
diff --git a/trees/Stree.py b/trees/Stree.py
index ab623eb..da40707 100644
--- a/trees/Stree.py
+++ b/trees/Stree.py
@@ -13,10 +13,12 @@ from sklearn.svm import LinearSVC
 
 from trees.Snode import Snode
 
+
 class Stree:
     """
     """
-    def __init__(self, max_iter: int=1000, random_state: int=0, use_predictions: bool=False):
+
+    def __init__(self, max_iter: int = 1000, random_state: int = 0, use_predictions: bool = False):
         self._max_iter = max_iter
         self._random_state = random_state
         self._outcomes = None
@@ -44,34 +46,48 @@ class Stree:
     def fit(self, X: np.ndarray, y: np.ndarray, title: str = 'root') -> 'Stree':
         self._tree = self.train(X, y, title)
+        self._predictor()
         return self
 
+    def _predictor(self):
+        """Process the leaves to make them predictors
+        """
+        def run_tree(node: Snode):
+            if node.is_leaf():
+                node.make_predictor()
+                return
+            run_tree(node.get_down())
+            run_tree(node.get_up())
+        run_tree(self._tree)
+
     def train(self, X: np.ndarray, y: np.ndarray, title: str = 'root') -> Snode:
         if np.unique(y).shape[0] == 1:
             # only 1 class => pure dataset
-            return Snode(np.array([]), 0, X, y, title + f', class={np.unique(y)}, items={y.shape[0]}, rest=0, ')
+            return Snode(None, X, y, title + f', class={np.unique(y)}, items={y.shape[0]}, rest=0, ')
         # Train the model
-        clf = LinearSVC(max_iter=self._max_iter, random_state=self._random_state)
+        clf = LinearSVC(max_iter=self._max_iter,
+                        random_state=self._random_state)
         clf.fit(X, y)
-        tree = Snode(clf.coef_, clf.intercept_, X, y, title)
+        tree = Snode(clf, X, y, title)
         X_U, y_u, X_D, y_d = self._split_data(clf, X, y)
         if X_U is None or X_D is None:
             # didn't part anything
-            return Snode(clf.coef_, clf.intercept_, X, y, title + f', classes={np.unique(y)}, items<0>={y[y==0].shape[0]}, items<1>={y[y==1].shape[0]}, ')
-        tree.set_up(self.train(X_U, y_u, title + ' - Up' + str(np.unique(y_u, return_counts=True))))
-        tree.set_down(self.train(X_D, y_d, title + ' - Down' + str(np.unique(y_d, return_counts=True))))
+            return Snode(clf, X, y, title + f', classes={np.unique(y)}, items<0>={y[y==0].shape[0]}, items<1>={y[y==1].shape[0]}, ')
+        tree.set_up(self.train(X_U, y_u, title + ' - Up' +
+                               str(np.unique(y_u, return_counts=True))))
+        tree.set_down(self.train(X_D, y_d, title + ' - Down' +
+                                 str(np.unique(y_d, return_counts=True))))
         return tree
 
-    def _print_tree(self, tree: Snode):
-        print(tree)
-        if tree.is_leaf():
-            return
-        self._print_tree(tree.get_down())
-        self._print_tree(tree.get_up())
-
-    def show_outcomes(self):
-        pointer = self._tree
-        self._print_tree(pointer)
+    def __str__(self):
+        def print_tree(tree: Snode) -> str:
+            output = str(tree)
+            if tree.is_leaf():
+                return output
+            output += print_tree(tree.get_down())
+            output += print_tree(tree.get_up())
+            return output
+        return print_tree(self._tree)
 
     def _save_datasets(self, tree: Snode, catalog: typing.TextIO, number: int):
         """Save the dataset of the node in a csv file
@@ -80,10 +96,10 @@ class Stree:
             tree {Snode} -- node with data to save
             number {int} -- a number to make different file names
         """
-        data = np.append(tree._X, tree._y.reshape(-1,1), axis=1)
+        data = np.append(tree._X, tree._y.reshape(-1, 1), axis=1)
         name = f"{self.__folder}dataset{number}.csv"
         np.savetxt(name, data, delimiter=",")
-        catalog.write(f"{name}, - {str(tree)}\n")
+        catalog.write(f"{name}, - {str(tree)}")
        if tree.is_leaf():
             return
         self._save_datasets(tree.get_down(), catalog, number + 1)
@@ -95,8 +111,5 @@ class Stree:
     def save_sub_datasets(self):
         """Save every dataset stored in the tree to check with a manual classifier
         """
-        pointer = self._tree
-        with open(self.get_catalog_name(), 'w', encoding = 'utf-8') as catalog:
-            self._save_datasets(pointer, catalog, 1)
-
-
+        with open(self.get_catalog_name(), 'w', encoding='utf-8') as catalog:
+            self._save_datasets(self._tree, catalog, 1)
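
Usage sketch (for reference): the snippet below simply mirrors the updated main.py and assumes the trees.Stree package layout used in this patch.

    from sklearn.datasets import make_classification
    from trees.Stree import Stree

    random_state = 1
    # Same synthetic dataset as main.py and the tests.
    X, y = make_classification(n_samples=1500, n_features=3, n_informative=3,
                               n_redundant=0, n_repeated=0, n_classes=2,
                               n_clusters_per_class=2, class_sep=1.5, flip_y=0,
                               weights=[0.5, 0.5], random_state=random_state)

    model = Stree(random_state=random_state)
    model.fit(X, y)            # builds the tree and turns every leaf into a predictor
    print(model)               # __str__ replaces show_outcomes and prints one line per node
    model.save_sub_datasets()  # writes one csv per node plus a catalog file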
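
The leaf bookkeeping added in Snode.make_predictor can also be followed in isolation; this is only a sketch of the arithmetic, and y_leaf is a made-up label vector standing in for a leaf's sub-dataset.

    import numpy as np

    y_leaf = np.array([1, 1, 1, 0, 1, 1])   # hypothetical labels reaching one leaf

    # Same bookkeeping as Snode.make_predictor in this patch:
    classes, card = np.unique(y_leaf, return_counts=True)   # classes=[0 1], card=[1 5]
    max_card, min_card = max(card), min(card)

    belief = max_card / min_card                  # 5.0 -> value stored in _belief
    predicted_class = classes[card == max_card]   # array([1]) -> value stored in _class
    print(belief, predicted_class)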