Implement predict and score methods & tests

2025-08-15 23:46:02 +00:00 · 2020-05-13 12:42:09 +02:00
parent c4de782a3f
commit 8f71eeb316
6 changed files with 70 additions and 221 deletions
--- a/main.py
+++ b/main.py
@@ -9,3 +9,4 @@ model = Stree(random_state=random_state)
 model.fit(X, y)
 print(model)
 model.save_sub_datasets()
+print(f"Prediciting [{y[0]}] we have {model.predict(X[0, :].reshape(-1, X.shape[1]))}")
--- a/test.ipynb
+++ b/test.ipynb
@@ -9,6 +9,7 @@
    "import numpy as np \n",
    "from sklearn.svm import LinearSVC\n",
    "from sklearn.datasets import make_classification\n",
+    "from trees.Stree import Stree\n",
    "\n",
    "random_state = 1\n",
    "X, y = make_classification(n_samples=1500, n_features=3, n_informative=3, \n",
@@ -18,215 +19,20 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
-     "text": "data/dataset1.csv - root\ndata/dataset2.csv - root - Down\ndata/dataset3.csv - root - Down - Down, classes=[0 1], items<0>=17, items<1>=691, <couldn't go any further> LEAF accuracy=0.98\ndata/dataset4.csv - root - Down - Up\ndata/dataset5.csv - root - Down - Up - Down, classes=[0 1], items<0>=1, items<1>=3, <couldn't go any further> LEAF accuracy=0.75\ndata/dataset6.csv - root - Down - Up - Up, class=[0], items=7, rest=0,  <pure>  LEAF accuracy=1.00\ndata/dataset3.csv - root - Up, classes=[0 1], items<0>=725, items<1>=56, <couldn't go any further> LEAF accuracy=0.93\n"
+     "text": "Accuracy: 0.950667\n"
    }
   ],
   "source": [
-    "!cat data/catalog.txt"
+    "clf = Stree(random_state=random_state, use_predictions=False)\n",
+    "clf.fit(X, y)\n",
+    "accuracy = clf.score(X, y)"
   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def readsub(name):\n",
-    "    data = np.genfromtxt(name, delimiter=',')\n",
-    "    data = np.array(data)\n",
-    "    py = data[:, data.shape[1] - 1]\n",
-    "    px = np.delete(data, data.shape[1] - 1, axis=1)\n",
-    "    return px, py\n",
-    "def localiza(X, px):\n",
-    "    enc = False\n",
-    "    for i in range(X.shape[0]):\n",
-    "        if all(X[i, :] == px):\n",
-    "            enc = True\n",
-    "            print(f\" i={i} - X[{i}, :]={X[i, :]} - px={px} - y[{i}]={y[i]}\")\n",
-    "    print(\"Encontrado:\", enc)\n",
-    "    "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "px, py = readsub('data/dataset5.csv')\n",
-    "model = LinearSVC(random_state=1, max_iter=1000)\n",
-    "model.fit(px,py)\n",
-    "yp = model.predict(px)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 15,
-   "metadata": {},
-   "outputs": [
-    {
-     "output_type": "stream",
-     "name": "stdout",
-     "text": "[1. 1. 1. 1.]\n[1. 1. 0. 1.]\n"
-    }
-   ],
-   "source": [
-    "print(yp)\n",
-    "print(py)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 16,
-   "metadata": {},
-   "outputs": [
-    {
-     "output_type": "stream",
-     "name": "stdout",
-     "text": "i=1132 - X[1132, :]=[-0.41453617 -0.38206564  0.54849331] - px=[-0.41453617 -0.38206564  0.54849331] - y[1132]=0\nEncontrado: True\n"
-    }
-   ],
-   "source": [
-    "localiza(X, px[2, :])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
-   "outputs": [
-    {
-     "output_type": "stream",
-     "name": "stdout",
-     "text": "[LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n          intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n          verbose=0), LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n          intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n          verbose=0), LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n          intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n          verbose=0), LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n          intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n          verbose=0), LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n          intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n          verbose=0)]\n"
-    }
-   ],
-   "source": [
-    "from sklearn.svm import LinearSVC\n",
-    "\n",
-    "data = []\n",
-    "for i in range(5):\n",
-    "    model = LinearSVC()\n",
-    "    data.append(model)\n",
-    "\n",
-    "print(data)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [
-    {
-     "output_type": "stream",
-     "name": "stdout",
-     "text": "4\n"
-    },
-    {
-     "output_type": "error",
-     "ename": "NameError",
-     "evalue": "name 'gato' is not defined",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
-      "\u001b[0;32m<ipython-input-3-04351d05a6f0>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      6\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpato\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 7\u001b[0;31m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mgato\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m4\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
-      "\u001b[0;31mNameError\u001b[0m: name 'gato' is not defined"
-     ]
-    }
-   ],
-   "source": [
-    "def pato(k):\n",
-    "    def gato(m, u):\n",
-    "        return m * u\n",
-    "    return gato(k, k)\n",
-    "\n",
-    "print(pato(2))\n",
-    "print(gato(3,4))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {},
-   "outputs": [
-    {
-     "output_type": "stream",
-     "name": "stdout",
-     "text": "7\n"
-    }
-   ],
-   "source": [
-    "try:\n",
-    "    a= max(5,3)/min(0,1)\n",
-    "except:\n",
-    "    a=7\n",
-    "print(a)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {},
-   "outputs": [
-    {
-     "output_type": "error",
-     "ename": "SyntaxError",
-     "evalue": "invalid syntax (<ipython-input-6-65e24c447a24>, line 1)",
-     "traceback": [
-      "\u001b[0;36m  File \u001b[0;32m\"<ipython-input-6-65e24c447a24>\"\u001b[0;36m, line \u001b[0;32m1\u001b[0m\n\u001b[0;31m    max([2 5])\u001b[0m\n\u001b[0m           ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m invalid syntax\n"
-     ]
-    }
-   ],
-   "source": [
-    "max([2 5])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "y=[1,2,4,5,5,5,5,3,3,3,2,]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 10,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "a,b = np.unique(y, return_counts=True)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 12,
-   "metadata": {},
-   "outputs": [
-    {
-     "output_type": "execute_result",
-     "data": {
-      "text/plain": "11"
-     },
-     "metadata": {},
-     "execution_count": 12
-    }
-   ],
-   "source": [
-    "np.count_nonzero(y)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": []
  }
 ],
 "metadata": {
--- a/tests/Snode_test.py
+++ b/tests/Snode_test.py
@@ -11,9 +11,9 @@ class Snode_test(unittest.TestCase):

    def __init__(self, *args, **kwargs):
        self._random_state = 1
-        self._model = Stree(random_state=self._random_state,
+        self._clf = Stree(random_state=self._random_state,
                            use_predictions=True)
-        self._model.fit(*self._get_Xy())
+        self._clf.fit(*self._get_Xy())
        super(Snode_test, self).__init__(*args, **kwargs)

    def _get_Xy(self):
@@ -42,4 +42,4 @@ class Snode_test(unittest.TestCase):
                return
            check_leave(node.get_down())
            check_leave(node.get_up())
-        check_leave(self._model._tree)
+        check_leave(self._clf._tree)
--- a/tests/Stree_test.py
+++ b/tests/Stree_test.py
@@ -11,9 +11,9 @@ class Stree_test(unittest.TestCase):

    def __init__(self, *args, **kwargs):
        self._random_state = 1
-        self._model = Stree(random_state=self._random_state,
-                            use_predictions=True)
-        self._model.fit(*self._get_Xy())
+        self._clf = Stree(random_state=self._random_state,
+                            use_predictions=False)
+        self._clf.fit(*self._get_Xy())
        super(Stree_test, self).__init__(*args, **kwargs)

    def _get_Xy(self):
@@ -25,7 +25,7 @@ class Stree_test(unittest.TestCase):
    def _check_tree(self, node: Snode):
        if node.is_leaf():
            return
-        y_prediction = node._model.predict(node._X)
+        y_prediction = node._clf.predict(node._X)
        y_down = node.get_down()._y
        y_up = node.get_up()._y
        # Is a correct partition in terms of cadinality?
@@ -55,7 +55,7 @@ class Stree_test(unittest.TestCase):
    def test_build_tree(self):
        """Check if the tree is built the same way as predictions of models
        """
-        self._check_tree(self._model._tree)
+        self._check_tree(self._clf._tree)

    def _get_file_data(self, file_name: str) -> tuple:
        """Return X, y from data, y is the last column in array
@@ -94,14 +94,32 @@ class Stree_test(unittest.TestCase):
    def test_subdatasets(self):
        """Check if the subdatasets files have the same predictions as the tree itself
        """
-        model = self._model._tree._model
+        model = self._clf._tree._clf
        X, y = self._get_Xy()
        model.fit(X, y)
-        self._model.save_sub_datasets()
-        with open(self._model.get_catalog_name()) as cat_file:
+        self._clf.save_sub_datasets()
+        with open(self._clf.get_catalog_name()) as cat_file:
            catalog = csv.reader(cat_file, delimiter=',')
            for row in catalog:
                X, y = self._get_Xy()
                x_file, y_file = self._get_file_data(row[0])
                y_original = np.array(self._find_out(x_file, X, y), dtype=int)
                self.assertTrue(np.array_equal(y_file, y_original))
+    
+    def test_single_prediction(self):
+        X, y = self._get_Xy()
+        yp = self._clf.predict((X[0, :].reshape(-1, X.shape[1])))
+        self.assertEqual(yp[0], y[0])
+
+    def test_multiple_prediction(self):
+        X, y = self._get_Xy()
+        yp = self._clf.predict(X[:23, :])
+        self.assertListEqual(y[:23].tolist(), yp.tolist())
+
+    def test_score(self):
+        X, y = self._get_Xy()
+        accuracy_score = self._clf.score(X, y, print_out=False)
+        yp = self._clf.predict(X)
+        right = (yp == y).astype(int)
+        accuracy_computed = sum(right) / len(y)
+        self.assertEqual(accuracy_score, accuracy_computed)
--- a/trees/Snode.py
+++ b/trees/Snode.py
@@ -2,7 +2,7 @@
 __author__ = "Ricardo Montañana Gómez"
 __copyright__ = "Copyright 2020, Ricardo Montañana Gómez"
 __license__ = "MIT"
-__version__ = "1.0"
+__version__ = "0.9"
 Node of the Stree (binary tree)
 '''

@@ -11,10 +11,10 @@ from sklearn.svm import LinearSVC


 class Snode:
-    def __init__(self, model: LinearSVC, X: np.ndarray, y: np.ndarray, title: str):
-        self._model = model
-        self._vector = None if model is None else model.coef_
-        self._interceptor = 0 if model is None else model.intercept_
+    def __init__(self, clf: LinearSVC, X: np.ndarray, y: np.ndarray, title: str):
+        self._clf = clf
+        self._vector = None if clf is None else clf.coef_
+        self._interceptor = 0 if clf is None else clf.intercept_
        self._title = title
        self._belief = 0  # belief of the prediction in a leaf node based on samples
        self._X = X
@@ -60,6 +60,6 @@ class Snode:
                num = max(num, self._y[self._y == i].shape[0])
            den = self._y.shape[0]
            accuracy = num / den if den != 0 else 1
-            return f"{self._title} LEAF accuracy={accuracy:.2f}\n"
+            return f"{self._title} LEAF accuracy={accuracy:.2f}, belief={self._belief:.2f} class={self._class}\n"
        else:
            return f"{self._title}\n"
--- a/trees/Stree.py
+++ b/trees/Stree.py
@@ -2,8 +2,8 @@
 __author__ = "Ricardo Montañana Gómez"
 __copyright__ = "Copyright 2020, Ricardo Montañana Gómez"
 __license__ = "MIT"
-__version__ = "1.0"
-Create a oblique tree classifier based on SVM Trees
+__version__ = "0.9"
+Build an oblique tree classifier based on SVM Trees
 Uses LinearSVC
 '''

@@ -25,6 +25,7 @@ class Stree:
        self._tree = None
        self.__folder = 'data/'
        self.__use_predictions = use_predictions
+        self.__trained = False

    def _split_data(self, clf: LinearSVC, X: np.ndarray, y: np.ndarray) -> list:
        if self.__use_predictions:
@@ -46,10 +47,11 @@ class Stree:

    def fit(self, X: np.ndarray, y: np.ndarray, title: str = 'root') -> 'Stree':
        self._tree = self.train(X, y, title)
-        self._predictor()
+        self._build_predictor()
+        self.__trained = True
        return self

-    def _predictor(self):
+    def _build_predictor(self):
        """Process the leaves to make them predictors
        """
        def run_tree(node: Snode):
@@ -79,6 +81,28 @@ class Stree:
                                 str(np.unique(y_d, return_counts=True))))
        return tree

+    def predict(self, X: np.array) -> np.array:
+        def predict_class(xp: np.array, tree: Snode) -> np.array:
+            if tree.is_leaf():
+                return tree._class
+            coef = tree._vector[0, :].reshape(-1, xp.shape[1])
+            if xp.dot(coef.T) + tree._interceptor[0] > 0:
+                return predict_class(xp, tree.get_down())
+            return predict_class(xp, tree.get_up())
+        y = np.array([], dtype=int)
+        for xp in X:
+            y = np.append(y, predict_class(xp.reshape(-1, X.shape[1]), self._tree))
+        return y
+
+    def score(self, X: np.array, y: np.array, print_out=True) -> float:
+        self.fit(X, y)
+        yp = self.predict(X)
+        right = (yp == y).astype(int)
+        accuracy = sum(right) / len(y)
+        if print_out:
+            print(f"Accuracy: {accuracy:.6f}")
+        return accuracy
+
    def __str__(self):
        def print_tree(tree: Snode) -> str:
            output = str(tree)