From 371257c1211cdd59b8d5c95d65632f8d88f8c4a5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?= <rmontanana@gmail.com>
Date: Tue, 12 May 2020 17:36:16 +0200
Subject: [PATCH] Implement split data with or without using predictions & some
 tests

---
 .gitignore            |   5 ++
 .vscode/settings.json |  12 ----
 main.py               |   1 +
 test.ipynb            | 128 ++++++++++++++++++++++++++++++++++++++++++
 tests/Stree_test.py   | 105 ++++++++++++++++++++++++++++++++--
 trees/Snode.py        |   8 +--
 trees/Stree.py        |  70 ++++++++++++++++-------
 7 files changed, 290 insertions(+), 39 deletions(-)
 delete mode 100644 .vscode/settings.json
 create mode 100644 test.ipynb
diff --git a/.gitignore b/.gitignore
index b6e4761..343d3fc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -127,3 +127,8 @@ dmypy.json
 
 # Pyre type checker
 .pyre/
+
+.idea
+data/*
+
+.vscode
\ No newline at end of file
diff --git a/.vscode/settings.json b/.vscode/settings.json
deleted file mode 100644
index e3694a1..0000000
--- a/.vscode/settings.json
+++ /dev/null
@@ -1,12 +0,0 @@
-{
-    "python.testing.unittestArgs": [
-        "-v",
-        "-s",
-        "./tests",
-        "-p",
-        "*_test.py"
-    ],
-    "python.testing.pytestEnabled": false,
-    "python.testing.nosetestsEnabled": false,
-    "python.testing.unittestEnabled": true
-}
\ No newline at end of file
diff --git a/main.py b/main.py
index 29f14dc..a88c6e5 100644
--- a/main.py
+++ b/main.py
@@ -8,3 +8,4 @@ X, y = make_classification(n_samples=1500, n_features=3, n_informative=3,
 model = Stree(random_state=random_state)
 model.fit(X, y)
 model.show_outcomes()
+model.save_sub_datasets()
\ No newline at end of file
diff --git a/test.ipynb b/test.ipynb
new file mode 100644
index 0000000..dc1d2c8
--- /dev/null
+++ b/test.ipynb
@@ -0,0 +1,128 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np \n",
+    "from sklearn.svm import LinearSVC\n",
+    "from sklearn.datasets import make_classification\n",
+    "\n",
+    "random_state = 1\n",
+    "X, y = make_classification(n_samples=1500, n_features=3, n_informative=3, \n",
+    "                    n_redundant=0, n_repeated=0, n_classes=2, n_clusters_per_class=2,\n",
+    "                    class_sep=1.5, flip_y=0,weights=[0.5,0.5], random_state=random_state)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": "data/dataset1.csv - root\ndata/dataset2.csv - root - Down\ndata/dataset3.csv - root - Down - Down, classes=[0 1], items<0>=17, items<1>=691, <couldn't go any further> LEAF accuracy=0.98\ndata/dataset4.csv - root - Down - Up\ndata/dataset5.csv - root - Down - Up - Down, classes=[0 1], items<0>=1, items<1>=3, <couldn't go any further> LEAF accuracy=0.75\ndata/dataset6.csv - root - Down - Up - Up, class=[0], items=7, rest=0,  <pure>  LEAF accuracy=1.00\ndata/dataset3.csv - root - Up, classes=[0 1], items<0>=725, items<1>=56, <couldn't go any further> LEAF accuracy=0.93\n"
+    }
+   ],
+   "source": [
+    "!cat data/catalog.txt"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def readsub(name):\n",
+    "    data = np.genfromtxt(name, delimiter=',')\n",
+    "    data = np.array(data)\n",
+    "    py = data[:, data.shape[1] - 1]\n",
+    "    px = np.delete(data, data.shape[1] - 1, axis=1)\n",
+    "    return px, py\n",
+    "def localiza(X, px):\n",
+    "    enc = False\n",
+    "    for i in range(X.shape[0]):\n",
+    "        if all(X[i, :] == px):\n",
+    "            enc = True\n",
+    "            print(f\" i={i} - X[{i}, :]={X[i, :]} - px={px} - y[{i}]={y[i]}\")\n",
+    "    print(\"Encontrado:\", enc)\n",
+    "    "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "px, py = readsub('data/dataset5.csv')\n",
+    "model = LinearSVC(random_state=1, max_iter=1000)\n",
+    "model.fit(px,py)\n",
+    "yp = model.predict(px)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": "[1. 1. 1. 1.]\n[1. 1. 0. 1.]\n"
+    }
+   ],
+   "source": [
+    "print(yp)\n",
+    "print(py)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": "i=1132 - X[1132, :]=[-0.41453617 -0.38206564  0.54849331] - px=[-0.41453617 -0.38206564  0.54849331] - y[1132]=0\nEncontrado: True\n"
+    }
+   ],
+   "source": [
+    "localiza(X, px[2, :])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.6-final"
+  },
+  "orig_nbformat": 2,
+  "kernelspec": {
+   "name": "python37664bitstreevenva9e4a4efdc1042b6b577bd15fbe145ee",
+   "display_name": "Python 3.7.6 64-bit ('stree': venv)"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
\ No newline at end of file
diff --git a/tests/Stree_test.py b/tests/Stree_test.py
index c4210af..088d3fd 100644
--- a/tests/Stree_test.py
+++ b/tests/Stree_test.py
@@ -1,14 +1,111 @@
 import unittest
 
-from trees.Stree import Stree
+from sklearn.svm import LinearSVC
+from sklearn.datasets import make_classification
+import numpy as np
+import csv
+
+from trees.Stree import Stree, Snode
 
 class Stree_test(unittest.TestCase):
-    
+
     def __init__(self, *args, **kwargs):
-        self.random_state = 17
-        self._model = Stree(random_state=self.random_state)
+        self._random_state = 1
+        self._model_tree = Stree(random_state=self._random_state, use_predictions=True)
+        self._model_tree.fit(*self._get_Xy())
+        self._model_svm = LinearSVC(random_state=self._random_state, max_iter=self._model_tree._max_iter)
         super(Stree_test, self).__init__(*args, **kwargs)
+
+    def _get_Xy(self):
+        X, y = make_classification(n_samples=1500, n_features=3, n_informative=3, 
+                                n_redundant=0, n_repeated=0, n_classes=2, n_clusters_per_class=2,
+                                class_sep=1.5, flip_y=0,weights=[0.5,0.5], random_state=self._random_state)
+        return X, y
     
     def test_split_data(self):
         self.assertTrue(True)
 
+    def _check_tree(self, node: Snode):
+        if node.is_leaf():
+            return
+        self._model_svm.fit(node._X, node._y)
+        y_prediction = self._model_svm.predict(node._X)
+        y_down = node.get_down()._y
+        y_up = node.get_up()._y
+        # Is it a correct partition in terms of cardinality?
+        # i.e. The partition algorithm didn't forget any sample
+        self.assertEqual(node._y.shape[0], y_down.shape[0] + y_up.shape[0])
+        unique_y, count_y = np.unique(node._y, return_counts=True)
+        _, count_d = np.unique(y_down, return_counts=True)
+        _, count_u = np.unique(y_up, return_counts=True)
+        for i in unique_y:
+            try:
+                number_down = count_d[i]
+            except:
+                number_down = 0
+            try:
+                number_up = count_u[i]
+            except:
+                number_up = 0
+            self.assertEqual(count_y[i], number_down + number_up)
+        # Is the partition made the same as the prediction?
+        # as the node is not a leaf...
+        unique_yp, count_yp = np.unique(y_prediction, return_counts=True)
+        self.assertEqual(count_yp[1], y_down.shape[0])
+        self.assertEqual(count_yp[0], y_up.shape[0])
+        self._check_tree(node.get_down())
+        self._check_tree(node.get_up())
+
+    def test_build_tree(self):
+        """Check if the tree is built the same way as predictions of models
+        """
+        self._check_tree(self._model_tree._tree)
+
+    def _get_file_data(self, file_name: str) -> tuple:
+        """Return X, y from data, y is the last column in array
+
+        Arguments:
+            file_name {str} -- the file name
+
+        Returns:
+            tuple -- tuple with samples, categories
+        """        
+        data = np.genfromtxt(file_name, delimiter=',')
+        data = np.array(data)
+        column_y = data.shape[1] - 1
+        fy = data[:, column_y]
+        fx = np.delete(data, column_y, axis=1)
+        return fx, fy
+
+    def _find_out(self, px: np.array, x_original: np.array, y_original) -> list:
+        """Find the original values of y for a given array of samples
+
+        Arguments:
+            px {np.array} -- array of samples to search for
+            x_original {np.array} -- original dataset
+            y_original {np.array} -- original classes
+
+        Returns:
+            list -- classes of the given samples
+        """        
+        res = []
+        for needle in px:
+            for row in range(x_original.shape[0]):
+                if all(x_original[row, :] == needle):
+                    res.append(y_original[row])
+        return res
+        
+    def test_subdatasets(self):
+        """Check if the subdatasets files have the same predictions as the tree itself
+        """
+        model = LinearSVC(random_state=self._random_state, max_iter=self._model_tree._max_iter)
+        X, y = self._get_Xy()
+        model.fit(X, y)
+        self._model_tree.save_sub_datasets()
+        with open(self._model_tree.get_catalog_name()) as cat_file:
+            catalog = csv.reader(cat_file, delimiter=',')
+            for row in catalog:
+                X, y = self._get_Xy()
+                x_file, y_file = self._get_file_data(row[0])
+                y_original = np.array(self._find_out(x_file, X, y), dtype=int)
+                self.assertTrue(np.array_equal(y_file, y_original))
diff --git a/trees/Snode.py b/trees/Snode.py
index bb5ab35..2591c26 100644
--- a/trees/Snode.py
+++ b/trees/Snode.py
@@ -17,7 +17,7 @@ class Snode:
         self._y = y
         self._down = None
         self._up = None
-        self._class = None
+        self._class = None # really needed?
     
     def set_down(self, son):
         self._down = son
@@ -28,13 +28,13 @@ class Snode:
     def is_leaf(self,) -> bool:
         return self._up is None and self._down is None
     
-    def get_down(self):
+    def get_down(self) -> 'Snode':
         return self._down
 
-    def get_up(self):
+    def get_up(self) -> 'Snode':
         return self._up
 
-    def __str__(self):
+    def __str__(self) -> str:
         if self.is_leaf():
             num = 0
             for i in np.unique(self._y):
diff --git a/trees/Stree.py b/trees/Stree.py
index 50b6efb..ab623eb 100644
--- a/trees/Stree.py
+++ b/trees/Stree.py
@@ -8,6 +8,7 @@ Uses LinearSVC
 '''
 
 import numpy as np
+import typing
 from sklearn.svm import LinearSVC
 
 from trees.Snode import Snode
@@ -15,45 +16,50 @@ from trees.Snode import Snode
 class Stree:
     """
     """
-    def __init__(self, max_iter: int=1000, random_state: int=0):
+    def __init__(self, max_iter: int=1000, random_state: int=0, use_predictions: bool=False):
         self._max_iter = max_iter
         self._random_state = random_state
         self._outcomes = None
         self._tree = None
+        self.__folder = 'data/'
+        self.__use_predictions = use_predictions
 
     def _split_data(self, clf: LinearSVC, X: np.ndarray, y: np.ndarray) -> list:
-        # doesn't work with multiclass as each sample has to do inner product with its own coeficients
-        # computes positition of every sample is w.r.t. the hyperplane
-        coef = clf.coef_[0, :].reshape(-1, X.shape[1])
-        intercept = clf.intercept_[0]
-        res = X.dot(coef.T) + intercept
-        down = res > 0
+        if self.__use_predictions:
+            yp = clf.predict(X)
+            down = (yp == 1).reshape(-1, 1)
+        else:
+            # doesn't work with multiclass as each sample has to do inner product with its own coefficients
+            # computes the position of every sample w.r.t. the hyperplane
+            coef = clf.coef_[0, :].reshape(-1, X.shape[1])
+            intercept = clf.intercept_[0]
+            res = X.dot(coef.T) + intercept
+            down = res > 0
         up = ~down
         X_down = X[down[:, 0]] if any(down) else None
         y_down = y[down[:, 0]] if any(down) else None
         X_up = X[up[:, 0]] if any(up) else None
         y_up = y[up[:, 0]] if any(up) else None
-        return X_up, y_up, X_down, y_down
+        return [X_up, y_up, X_down, y_down]
 
-    def fit(self, X: np.ndarray, y: np.ndarray, title: str = 'root') -> list:
+    def fit(self, X: np.ndarray, y: np.ndarray, title: str = 'root') -> 'Stree':
         self._tree = self.train(X, y, title)
         return self
-    
-    def train(self: Snode, X: np.ndarray, y: np.ndarray, title: str='') -> list:
+
+    def train(self, X: np.ndarray, y: np.ndarray, title: str = 'root') -> Snode:
         if np.unique(y).shape[0] == 1:
-            # onlyt 1 class => pure dataset
-            return Snode(np.array([]), 0, X, y, title + f', <pure> class={np.unique(y)} items={y.shape[0]}')
+            # only 1 class => pure dataset
+            return Snode(np.array([]), 0, X, y, title + f', class={np.unique(y)}, items={y.shape[0]}, rest=0,  <pure> ')
         # Train the model
         clf = LinearSVC(max_iter=self._max_iter, random_state=self._random_state)
         clf.fit(X, y)
         tree = Snode(clf.coef_, clf.intercept_, X, y, title)
-        #plot_hyperplane(clf, X, y, title)
-        X_T, y_t, X_O, y_o = self._split_data(clf, X, y)
-        if X_T is None or X_O is None:
+        X_U, y_u, X_D, y_d = self._split_data(clf, X, y)
+        if X_U is None or X_D is None:
             # didn't part anything
-            return Snode(clf.coef_, clf.intercept_, X, y, title + f', <couldn\'t go any further> classes={np.unique(y)} items<0>={y[y==0].shape[0]} items<1>={y[y==1].shape[0]}')
-        tree.set_up( self.train(X_T, y_t, title + ' - Up'))
-        tree.set_down(self.train(X_O, y_o, title + ' - Down'))
+            return Snode(clf.coef_, clf.intercept_, X, y, title + f', classes={np.unique(y)}, items<0>={y[y==0].shape[0]}, items<1>={y[y==1].shape[0]}, <couldn\'t go any further>')
+        tree.set_up(self.train(X_U, y_u, title + ' - Up' + str(np.unique(y_u, return_counts=True))))
+        tree.set_down(self.train(X_D, y_d, title + ' - Down' + str(np.unique(y_d, return_counts=True))))
         return tree
 
     def _print_tree(self, tree: Snode):
@@ -67,4 +73,30 @@ class Stree:
         pointer = self._tree
         self._print_tree(pointer)
 
+    def _save_datasets(self, tree: Snode, catalog: typing.TextIO, number: int):
+        """Save the dataset of the node in a csv file
+
+        Arguments:
+            tree {Snode} -- node with data to save
+            number {int} -- a number to make different file names
+        """
+        data = np.append(tree._X, tree._y.reshape(-1,1), axis=1)
+        name = f"{self.__folder}dataset{number}.csv"
+        np.savetxt(name, data, delimiter=",")
+        catalog.write(f"{name}, - {str(tree)}\n")
+        if tree.is_leaf():
+            return
+        self._save_datasets(tree.get_down(), catalog, number + 1)
+        self._save_datasets(tree.get_up(), catalog, number + 2)
+
+    def get_catalog_name(self):
+        return self.__folder + "catalog.txt"
+
+    def save_sub_datasets(self):
+        """Save every dataset stored in the tree to check against a manual classifier
+        """
+        pointer = self._tree
+        with open(self.get_catalog_name(), 'w', encoding = 'utf-8') as catalog:
+            self._save_datasets(pointer, catalog, 1)
+