From c4de782a3f2f0c61c963731ef669b29c50b509bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?= Date: Wed, 13 May 2020 00:12:05 +0200 Subject: [PATCH] compute predictor and store model in node --- .gitignore | 2 - data/.gitignore | 2 + main.py | 10 ++-- test.ipynb | 133 ++++++++++++++++++++++++++++++++++++++++++-- tests/Snode_test.py | 45 +++++++++++++++ tests/Stree_test.py | 34 +++++------ trees/Snode.py | 44 ++++++++++----- trees/Stree.py | 61 ++++++++++++-------- 8 files changed, 263 insertions(+), 68 deletions(-) create mode 100644 data/.gitignore create mode 100644 tests/Snode_test.py diff --git a/.gitignore b/.gitignore index 343d3fc..ae603c4 100644 --- a/.gitignore +++ b/.gitignore @@ -129,6 +129,4 @@ dmypy.json .pyre/ .idea -data/* - .vscode \ No newline at end of file diff --git a/data/.gitignore b/data/.gitignore new file mode 100644 index 0000000..63400aa --- /dev/null +++ b/data/.gitignore @@ -0,0 +1,2 @@ +*.csv +*.txt \ No newline at end of file diff --git a/main.py b/main.py index a88c6e5..d0f0944 100644 --- a/main.py +++ b/main.py @@ -2,10 +2,10 @@ from trees.Stree import Stree from sklearn.datasets import make_classification random_state = 1 -X, y = make_classification(n_samples=1500, n_features=3, n_informative=3, - n_redundant=0, n_repeated=0, n_classes=2, n_clusters_per_class=2, - class_sep=1.5, flip_y=0,weights=[0.5,0.5], random_state=random_state) +X, y = make_classification(n_samples=1500, n_features=3, n_informative=3, + n_redundant=0, n_repeated=0, n_classes=2, n_clusters_per_class=2, + class_sep=1.5, flip_y=0, weights=[0.5, 0.5], random_state=random_state) model = Stree(random_state=random_state) model.fit(X, y) -model.show_outcomes() -model.save_sub_datasets() \ No newline at end of file +print(model) +model.save_sub_datasets() diff --git a/test.ipynb b/test.ipynb index dc1d2c8..abcde1f 100644 --- a/test.ipynb +++ b/test.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -98,9 +98,134 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "[LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n verbose=0), LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n verbose=0), LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n verbose=0), LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n verbose=0), LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n verbose=0)]\n" + } + ], + "source": [ + "from sklearn.svm import LinearSVC\n", + "\n", + "data = []\n", + "for i in range(5):\n", + " model = LinearSVC()\n", + " data.append(model)\n", + "\n", + "print(data)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + 
"outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "4\n" + }, + { + "output_type": "error", + "ename": "NameError", + "evalue": "name 'gato' is not defined", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpato\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 7\u001b[0;31m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mgato\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m4\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;31mNameError\u001b[0m: name 'gato' is not defined" + ] + } + ], + "source": [ + "def pato(k):\n", + " def gato(m, u):\n", + " return m * u\n", + " return gato(k, k)\n", + "\n", + "print(pato(2))\n", + "print(gato(3,4))" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "7\n" + } + ], + "source": [ + "try:\n", + " a= max(5,3)/min(0,1)\n", + "except:\n", + " a=7\n", + "print(a)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "output_type": "error", + "ename": "SyntaxError", + "evalue": "invalid syntax (, line 1)", + "traceback": [ + "\u001b[0;36m File \u001b[0;32m\"\"\u001b[0;36m, line \u001b[0;32m1\u001b[0m\n\u001b[0;31m max([2 5])\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m invalid syntax\n" + ] + } + ], + "source": [ + "max([2 5])" + ] + }, + { + "cell_type": "code", + "execution_count": 7, "metadata": {}, "outputs": [], + "source": [ + "y=[1,2,4,5,5,5,5,3,3,3,2,]" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "a,b = np.unique(y, return_counts=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": "11" + }, + "metadata": {}, + "execution_count": 12 + } + ], + "source": [ + "np.count_nonzero(y)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, "source": [] } ], @@ -119,8 +244,8 @@ }, "orig_nbformat": 2, "kernelspec": { - "name": "python37664bitstreevenva9e4a4efdc1042b6b577bd15fbe145ee", - "display_name": "Python 3.7.6 64-bit ('stree': venv)" + "name": "python37664bitgeneralvenvfbd0a23e74cf4e778460f5ffc6761f39", + "display_name": "Python 3.7.6 64-bit ('general': venv)" } }, "nbformat": 4, diff --git a/tests/Snode_test.py b/tests/Snode_test.py new file mode 100644 index 0000000..34c1620 --- /dev/null +++ b/tests/Snode_test.py @@ -0,0 +1,45 @@ +import unittest + +from sklearn.datasets import make_classification +import numpy as np +import csv + +from trees.Stree import Stree, Snode + + +class Snode_test(unittest.TestCase): + + def __init__(self, *args, **kwargs): + self._random_state = 1 + self._model = Stree(random_state=self._random_state, + use_predictions=True) + self._model.fit(*self._get_Xy()) + super(Snode_test, self).__init__(*args, **kwargs) + + def _get_Xy(self): + X, y = 
make_classification(n_samples=1500, n_features=3, n_informative=3,
+                                   n_redundant=0, n_repeated=0, n_classes=2, n_clusters_per_class=2,
+                                   class_sep=1.5, flip_y=0, weights=[0.5, 0.5], random_state=self._random_state)
+        return X, y
+
+    def test_attributes_in_leaves(self):
+        """Check if the attributes in leaves have correct values so they form a predictor
+        """
+        def check_leave(node: Snode):
+            if node.is_leaf():
+                # Check Belief
+                classes, card = np.unique(node._y, return_counts=True)
+                max_card = max(card)
+                min_card = min(card)
+                try:
+                    accuracy = max_card / min_card
+                except:
+                    accuracy = 0
+                self.assertEqual(accuracy, node._belief)
+                # Check Class
+                class_computed = classes[card == max_card]
+                self.assertEqual(class_computed, node._class)
+                return
+            check_leave(node.get_down())
+            check_leave(node.get_up())
+        check_leave(self._model._tree)
diff --git a/tests/Stree_test.py b/tests/Stree_test.py
index 088d3fd..e56bb9a 100644
--- a/tests/Stree_test.py
+++ b/tests/Stree_test.py
@@ -1,35 +1,31 @@
 import unittest
 
-from sklearn.svm import LinearSVC
 from sklearn.datasets import make_classification
 import numpy as np
 import csv
 
 from trees.Stree import Stree, Snode
 
+
 class Stree_test(unittest.TestCase):
 
     def __init__(self, *args, **kwargs):
         self._random_state = 1
-        self._model_tree = Stree(random_state=self._random_state, use_predictions=True)
-        self._model_tree.fit(*self._get_Xy())
-        self._model_svm = LinearSVC(random_state=self._random_state, max_iter=self._model_tree._max_iter)
+        self._model = Stree(random_state=self._random_state,
+                            use_predictions=True)
+        self._model.fit(*self._get_Xy())
         super(Stree_test, self).__init__(*args, **kwargs)
 
     def _get_Xy(self):
-        X, y = make_classification(n_samples=1500, n_features=3, n_informative=3,
-            n_redundant=0, n_repeated=0, n_classes=2, n_clusters_per_class=2,
-            class_sep=1.5, flip_y=0,weights=[0.5,0.5], random_state=self._random_state)
+        X, y = make_classification(n_samples=1500, n_features=3, n_informative=3,
+                                   n_redundant=0, n_repeated=0, n_classes=2, n_clusters_per_class=2,
+                                   class_sep=1.5, flip_y=0, weights=[0.5, 0.5], random_state=self._random_state)
         return X, y
-    
-    def test_split_data(self):
-        self.assertTrue(True)
 
     def _check_tree(self, node: Snode):
         if node.is_leaf():
             return
-        self._model_svm.fit(node._X, node._y)
-        y_prediction = self._model_svm.predict(node._X)
+        y_prediction = node._model.predict(node._X)
         y_down = node.get_down()._y
         y_up = node.get_up()._y
         # Is a correct partition in terms of cardinality?
@@ -59,7 +55,7 @@ class Stree_test(unittest.TestCase):
     def test_build_tree(self):
         """Check if the tree is built the same way as predictions of models
         """
-        self._check_tree(self._model_tree._tree)
+        self._check_tree(self._model._tree)
 
     def _get_file_data(self, file_name: str) -> tuple:
         """Return X, y from data, y is the last column in array
@@ -69,7 +65,7 @@ class Stree_test(unittest.TestCase):
 
         Returns:
             tuple -- tuple with samples, categories
-        """    
+        """
         data = np.genfromtxt(file_name, delimiter=',')
         data = np.array(data)
         column_y = data.shape[1] - 1
@@ -87,22 +83,22 @@ class Stree_test(unittest.TestCase):
 
         Returns:
             np.array -- classes of the given samples
-        """    
+        """
         res = []
         for needle in px:
             for row in range(x_original.shape[0]):
                 if all(x_original[row, :] == needle):
                     res.append(y_original[row])
         return res
-    
+
     def test_subdatasets(self):
         """Check if the subdataset files have the same predictions as the tree itself
         """
-        model = LinearSVC(random_state=self._random_state, max_iter=self._model_tree._max_iter)
+        model = self._model._tree._model
         X, y = self._get_Xy()
         model.fit(X, y)
-        self._model_tree.save_sub_datasets()
-        with open(self._model_tree.get_catalog_name()) as cat_file:
+        self._model.save_sub_datasets()
+        with open(self._model.get_catalog_name()) as cat_file:
             catalog = csv.reader(cat_file, delimiter=',')
             for row in catalog:
                 X, y = self._get_Xy()
diff --git a/trees/Snode.py b/trees/Snode.py
index 2591c26..70f0070 100644
--- a/trees/Snode.py
+++ b/trees/Snode.py
@@ -3,47 +3,63 @@
 __author__ = "Ricardo Montañana Gómez"
 __copyright__ = "Copyright 2020, Ricardo Montañana Gómez"
 __license__ = "MIT"
 __version__ = "1.0"
-Node of the Stree
+Node of the Stree (binary tree)
 '''
 import numpy as np
+from sklearn.svm import LinearSVC
+
 
 class Snode:
-    def __init__(self, vector: np.ndarray, interceptor: float, X: np.ndarray, y: np.ndarray, title: str):
-        self._vector = vector
-        self._interceptor = interceptor
+    def __init__(self, model: LinearSVC, X: np.ndarray, y: np.ndarray, title: str):
+        self._model = model
+        self._vector = None if model is None else model.coef_
+        self._interceptor = 0 if model is None else model.intercept_
         self._title = title
+        self._belief = 0  # belief of the prediction in a leaf node based on samples
         self._X = X
         self._y = y
         self._down = None
        self._up = None
-        self._class = None  # really needed?
-    
+        self._class = None  # really needed?
+
     def set_down(self, son):
         self._down = son
-    
+
     def set_up(self, son):
         self._up = son
 
     def is_leaf(self,) -> bool:
         return self._up is None and self._down is None
-    
+
     def get_down(self) -> 'Snode':
         return self._down
 
     def get_up(self) -> 'Snode':
         return self._up
 
+    def make_predictor(self):
+        """Compute the class of the predictor and its belief based on the subdataset of the node
+        only if it is a leaf
+        """
+        if not self.is_leaf():
+            return
+        classes, card = np.unique(self._y, return_counts=True)
+        max_card = max(card)
+        min_card = min(card)
+        try:
+            self._belief = max_card / min_card
+        except:
+            self._belief = 0
+        self._class = classes[card == max_card]
+
     def __str__(self) -> str:
         if self.is_leaf():
             num = 0
             for i in np.unique(self._y):
                 num = max(num, self._y[self._y == i].shape[0])
             den = self._y.shape[0]
-            accuracy = num / den if den != 0 else 1
-            return f"{self._title} LEAF accuracy={accuracy:.2f}"
+            accuracy = num / den if den != 0 else 1
+            return f"{self._title} LEAF accuracy={accuracy:.2f}\n"
         else:
-            return self._title
-
-
-    
\ No newline at end of file
+            return f"{self._title}\n"
diff --git a/trees/Stree.py b/trees/Stree.py
index ab623eb..da40707 100644
--- a/trees/Stree.py
+++ b/trees/Stree.py
@@ -13,10 +13,12 @@ from sklearn.svm import LinearSVC
 
 from trees.Snode import Snode
 
+
 class Stree:
     """
     """
-    def __init__(self, max_iter: int=1000, random_state: int=0, use_predictions: bool=False):
+
+    def __init__(self, max_iter: int = 1000, random_state: int = 0, use_predictions: bool = False):
         self._max_iter = max_iter
         self._random_state = random_state
         self._outcomes = None
@@ -44,34 +46,48 @@ class Stree:
     def fit(self, X: np.ndarray, y: np.ndarray, title: str = 'root') -> 'Stree':
         self._tree = self.train(X, y, title)
+        self._predictor()
         return self
 
+    def _predictor(self):
+        """Process the leaves to make them predictors
+        """
+        def run_tree(node: Snode):
+            if node.is_leaf():
+                node.make_predictor()
+                return
+            run_tree(node.get_down())
+            run_tree(node.get_up())
+        run_tree(self._tree)
+
     def train(self, X: np.ndarray, y: np.ndarray, title: str = 'root') -> Snode:
         if np.unique(y).shape[0] == 1:
             # only 1 class => pure dataset
-            return Snode(np.array([]), 0, X, y, title + f', class={np.unique(y)}, items={y.shape[0]}, rest=0, ')
+            return Snode(None, X, y, title + f', class={np.unique(y)}, items={y.shape[0]}, rest=0, ')
         # Train the model
-        clf = LinearSVC(max_iter=self._max_iter, random_state=self._random_state)
+        clf = LinearSVC(max_iter=self._max_iter,
+                        random_state=self._random_state)
         clf.fit(X, y)
-        tree = Snode(clf.coef_, clf.intercept_, X, y, title)
+        tree = Snode(clf, X, y, title)
         X_U, y_u, X_D, y_d = self._split_data(clf, X, y)
         if X_U is None or X_D is None:
             # didn't part anything
-            return Snode(clf.coef_, clf.intercept_, X, y, title + f', classes={np.unique(y)}, items<0>={y[y==0].shape[0]}, items<1>={y[y==1].shape[0]}, ')
-        tree.set_up(self.train(X_U, y_u, title + ' - Up' + str(np.unique(y_u, return_counts=True))))
-        tree.set_down(self.train(X_D, y_d, title + ' - Down' + str(np.unique(y_d, return_counts=True))))
+            return Snode(clf, X, y, title + f', classes={np.unique(y)}, items<0>={y[y==0].shape[0]}, items<1>={y[y==1].shape[0]}, ')
+        tree.set_up(self.train(X_U, y_u, title + ' - Up' +
+                               str(np.unique(y_u, return_counts=True))))
+        tree.set_down(self.train(X_D, y_d, title + ' - Down' +
+                                 str(np.unique(y_d, return_counts=True))))
         return tree
 
-    def _print_tree(self, tree: Snode):
-        print(tree)
-        if tree.is_leaf():
-            return
-        self._print_tree(tree.get_down())
-        self._print_tree(tree.get_up())
-
-    def show_outcomes(self):
-        pointer = self._tree
-        self._print_tree(pointer)
+    def __str__(self):
+        def print_tree(tree: Snode) -> str:
+            output = str(tree)
+            if tree.is_leaf():
+                return output
+            output += print_tree(tree.get_down())
+            output += print_tree(tree.get_up())
+            return output
+        return print_tree(self._tree)
 
     def _save_datasets(self, tree: Snode, catalog: typing.TextIO, number: int):
         """Save the dataset of the node in a csv file
@@ -80,10 +96,10 @@ class Stree:
             tree {Snode} -- node with data to save
             number {int} -- a number to make different file names
         """
-        data = np.append(tree._X, tree._y.reshape(-1,1), axis=1)
+        data = np.append(tree._X, tree._y.reshape(-1, 1), axis=1)
         name = f"{self.__folder}dataset{number}.csv"
         np.savetxt(name, data, delimiter=",")
-        catalog.write(f"{name}, - {str(tree)}\n")
+        catalog.write(f"{name}, - {str(tree)}")
        if tree.is_leaf():
             return
         self._save_datasets(tree.get_down(), catalog, number + 1)
@@ -95,8 +111,5 @@ class Stree:
     def save_sub_datasets(self):
         """Save every dataset stored in the tree to check with a manual classifier
         """
-        pointer = self._tree
-        with open(self.get_catalog_name(), 'w', encoding = 'utf-8') as catalog:
-            self._save_datasets(pointer, catalog, 1)
-
-
+        with open(self.get_catalog_name(), 'w', encoding='utf-8') as catalog:
+            self._save_datasets(self._tree, catalog, 1)
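
Usage sketch (for reference): the snippet below simply mirrors the updated main.py and assumes the trees.Stree package layout used in this patch.

    from sklearn.datasets import make_classification
    from trees.Stree import Stree

    random_state = 1
    # Same synthetic dataset as main.py and the tests.
    X, y = make_classification(n_samples=1500, n_features=3, n_informative=3,
                               n_redundant=0, n_repeated=0, n_classes=2,
                               n_clusters_per_class=2, class_sep=1.5, flip_y=0,
                               weights=[0.5, 0.5], random_state=random_state)

    model = Stree(random_state=random_state)
    model.fit(X, y)            # builds the tree and turns every leaf into a predictor
    print(model)               # __str__ replaces show_outcomes and prints one line per node
    model.save_sub_datasets()  # writes one csv per node plus a catalog file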
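
The leaf bookkeeping added in Snode.make_predictor can also be followed in isolation; this is only a sketch of the arithmetic, and y_leaf is a made-up label vector standing in for a leaf's sub-dataset.

    import numpy as np

    y_leaf = np.array([1, 1, 1, 0, 1, 1])   # hypothetical labels reaching one leaf

    # Same bookkeeping as Snode.make_predictor in this patch:
    classes, card = np.unique(y_leaf, return_counts=True)   # classes=[0 1], card=[1 5]
    max_card, min_card = max(card), min(card)

    belief = max_card / min_card                  # 5.0 -> value stored in _belief
    predicted_class = classes[card == max_card]   # array([1]) -> value stored in _class
    print(belief, predicted_class)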