From e3ae3a3a6c601a9a67da9fe5ae7e741ee70aba57 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?= Date: Thu, 14 May 2020 10:48:39 +0200 Subject: [PATCH] Add C param in constructor and creditcard dataset --- main.py | 44 ++++- test.ipynb | 381 +++++++++++++++++++++++++++++++++++++++++--- tests/Snode_test.py | 34 ++-- trees/Snode.py | 29 ++-- trees/Stree.py | 50 +++--- 5 files changed, 465 insertions(+), 73 deletions(-) diff --git a/main.py b/main.py index 82d227d..f98e6ec 100644 --- a/main.py +++ b/main.py @@ -5,8 +5,42 @@ random_state = 1 X, y = make_classification(n_samples=1500, n_features=3, n_informative=3, n_redundant=0, n_repeated=0, n_classes=2, n_clusters_per_class=2, class_sep=1.5, flip_y=0, weights=[0.5, 0.5], random_state=random_state) -model = Stree(random_state=random_state) -model.fit(X, y) -print(model) -model.save_sub_datasets() -print(f"Prediciting [{y[0]}] we have {model.predict(X[0, :].reshape(-1, X.shape[1]))}") + +def load_creditcard(n_examples=0): + import pandas as pd + import numpy as np + import random + df = pd.read_csv('data/creditcard.csv') + print("Fraud: {0:.3f}% {1}".format(df.Class[df.Class == 1].count()*100/df.shape[0], df.Class[df.Class == 1].count())) + print("Valid: {0:.3f}% {1}".format(df.Class[df.Class == 0].count()*100/df.shape[0], df.Class[df.Class == 0].count())) + y = np.expand_dims(df.Class.values, axis=1) + X = df.drop(['Class', 'Time', 'Amount'], axis=1).values + #Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=random_state, stratify=y) + #return Xtrain, Xtest, ytrain, ytest + if n_examples > 0: + # Take first n_examples samples + X = X[:n_examples, :] + y = y[:n_examples, :] + else: + # Take all the positive samples with a number of random negatives + if n_examples < 0: + Xt = X[(y == 1).ravel()] + yt = y[(y == 1).ravel()] + indices = random.sample(range(X.shape[0]), -1 * n_examples) + X = np.append(Xt, X[indices], axis=0) + y = np.append(yt, y[indices], axis=0) + print("X.shape", X.shape, " y.shape", y.shape) + print("Fraud: {0:.3f}% {1}".format(len(y[y == 1])*100/X.shape[0], len(y[y == 1]))) + print("Valid: {0:.3f}% {1}".format(len(y[y == 0])*100/X.shape[0], len(y[y == 0]))) + return X, y +#X, y = load_creditcard(-5000) +#X, y = load_creditcard(0) + +clf = Stree(C=.01, max_iter=100, random_state=random_state) +clf.fit(X, y) +print(clf) +clf.show_tree() +clf.save_sub_datasets() +print(f"Predicting {y[0]} we have {clf.predict(X[0, :].reshape(-1, X.shape[1]))}") +print(f"Classifier's accuracy: {clf.score(X, y, print_out=False):.4f}") +clf.show_tree(only_leaves=True) diff --git a/test.ipynb b/test.ipynb index b19b4c2..9d9e0ad 100644 --- a/test.ipynb +++ b/test.ipynb @@ -1,22 +1,5 @@ { "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np \n", - "from sklearn.svm import LinearSVC\n", - "from sklearn.datasets import make_classification\n", - "from trees.Stree import Stree\n", - "\n", - "random_state = 1\n", - "X, y = make_classification(n_samples=1500, n_features=3, n_informative=3, \n", - " n_redundant=0, n_repeated=0, n_classes=2, n_clusters_per_class=2,\n", - " class_sep=1.5, flip_y=0,weights=[0.5,0.5], random_state=random_state)" - ] - }, { "cell_type": "code", "execution_count": 2, @@ -25,14 +8,374 @@ { "output_type": "stream", "name": "stdout", - "text": "Accuracy: 0.950667\n" + "text": "Fraud: 0.173% 492\nValid: 99.827% 284315\nCPU times: user 2 µs, sys: 0 ns, total: 2 µs\nWall time: 5.96 µs\n" } ], "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from sklearn.svm import LinearSVC\n", + "from sklearn.tree import DecisionTreeClassifier\n", + "from sklearn.datasets import make_classification, load_iris, load_wine\n", + "from trees.Stree import Stree\n", + "\n", + "\n", + "def load_creditcard(n_examples=0):\n", + " df = pd.read_csv('data/creditcard.csv')\n", + " print(\"Fraud: {0:.3f}% {1}\".format(df.Class[df.Class == 1].count()*100/df.shape[0], df.Class[df.Class == 1].count()))\n", + " print(\"Valid: {0:.3f}% {1}\".format(df.Class[df.Class == 0].count()*100/df.shape[0], df.Class[df.Class == 0].count()))\n", + " y = np.expand_dims(df.Class.values, axis=1)\n", + " X = df.drop(['Class', 'Time', 'Amount'], axis=1).values\n", + " #Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=random_state, stratify=y)\n", + " #return Xtrain, Xtest, ytrain, ytest\n", + " if n_examples > 0:\n", + " X = X[:n_examples, :]\n", + " y = y[:n_examples, :]\n", + " else:\n", + " if n_examples < 0:\n", + " Xt = X[(y == 1).ravel()]\n", + " yt = y[(y == 1).ravel()]\n", + " indices = random.sample(range(X.shape[0]), -1 * n_examples)\n", + " X = np.append(Xt, X[indices], axis=0)\n", + " y = np.append(yt, y[indices], axis=0)\n", + " print(\"X.shape\", X.shape, \" y.shape\", y.shape)\n", + " print(\"Fraud: {0:.3f}% {1}\".format(len(y[y == 1])*100/X.shape[0], len(y[y == 1])))\n", + " print(\"Valid: {0:.3f}% {1}\".format(len(y[y == 0])*100/X.shape[0], len(y[y == 0])))\n", + " return X, y\n", + "\n", + "random_state = 1\n", + "\n", + "# Datasets\n", + "\n", + "#X, y = make_classification(n_samples=1500, n_features=3, n_informative=3, \n", + "# n_redundant=0, n_repeated=0, n_classes=2, n_clusters_per_class=2,\n", + "# class_sep=1.5, flip_y=0,weights=[0.5,0.5], random_state=random_state)\n", + "\n", + "#X, y = load_wine(return_X_y=True)\n", + "#X, y = load_iris(return_X_y=True)\n", + "\n", + "X, y = load_creditcard(23000)\n", + "%time" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "CPU times: user 3 µs, sys: 0 ns, total: 3 µs\nWall time: 5.01 µs\nAccuracy: 0.999609\nroot\nroot - Down(array([0, 1]), array([24, 63]))\nroot - Down(array([0, 1]), array([24, 63])) - Down(array([0, 1]), array([ 1, 62]))\nroot - Down(array([0, 1]), array([24, 63])) - Down(array([0, 1]), array([ 1, 62])) - Down(array([0, 1]), array([ 1, 61])), classes=[0 1], items<0>=1, items<1>=61, LEAF accuracy=0.98, belief=61.00 class=[1]\nroot - Down(array([0, 1]), array([24, 63])) - Down(array([0, 1]), array([ 1, 62])) - Up(array([1]), array([1])), class=[1], items=1, rest=0, LEAF accuracy=1.00, belief=1.00 class=[1]\nroot - Down(array([0, 1]), array([24, 63])) - Up(array([0, 1]), array([23, 1]))\nroot - Down(array([0, 1]), array([24, 63])) - Up(array([0, 1]), array([23, 1])) - Down(array([1]), array([1])), class=[1], items=1, rest=0, LEAF accuracy=1.00, belief=1.00 class=[1]\nroot - Down(array([0, 1]), array([24, 63])) - Up(array([0, 1]), array([23, 1])) - Up(array([0]), array([23])), class=[0], items=23, rest=0, LEAF accuracy=1.00, belief=1.00 class=[0]\nroot - Up(array([0, 1]), array([22890, 23]))\nroot - Up(array([0, 1]), array([22890, 23])) - Down(array([0, 1]), array([3, 8]))\nroot - Up(array([0, 1]), array([22890, 23])) - Down(array([0, 1]), array([3, 8])) - Down(array([1]), array([8])), class=[1], items=8, rest=0, LEAF accuracy=1.00, belief=1.00 class=[1]\nroot - Up(array([0, 1]), array([22890, 23])) - Down(array([0, 1]), array([3, 8])) - Up(array([0]), array([3])), class=[0], items=3, rest=0, LEAF accuracy=1.00, belief=1.00 class=[0]\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15]))\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Down(array([0, 1]), array([2, 2]))\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Down(array([0, 1]), array([2, 2])) - Down(array([1]), array([2])), class=[1], items=2, rest=0, LEAF accuracy=1.00, belief=1.00 class=[1]\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Down(array([0, 1]), array([2, 2])) - Up(array([0]), array([2])), class=[0], items=2, rest=0, LEAF accuracy=1.00, belief=1.00 class=[0]\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Up(array([0, 1]), array([22885, 13]))\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Up(array([0, 1]), array([22885, 13])) - Down(array([0, 1]), array([1, 4]))\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Up(array([0, 1]), array([22885, 13])) - Down(array([0, 1]), array([1, 4])) - Down(array([1]), array([4])), class=[1], items=4, rest=0, LEAF accuracy=1.00, belief=1.00 class=[1]\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Up(array([0, 1]), array([22885, 13])) - Down(array([0, 1]), array([1, 4])) - Up(array([0]), array([1])), class=[0], items=1, rest=0, LEAF accuracy=1.00, belief=1.00 class=[0]\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Up(array([0, 1]), array([22885, 13])) - Up(array([0, 1]), array([22884, 9]))\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Up(array([0, 1]), array([22885, 13])) - Up(array([0, 1]), array([22884, 9])) - Down(array([1]), array([1])), class=[1], items=1, rest=0, LEAF accuracy=1.00, belief=1.00 class=[1]\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Up(array([0, 1]), array([22885, 13])) - Up(array([0, 1]), array([22884, 9])) - Up(array([0, 1]), array([22884, 8])), classes=[0 1], items<0>=22884, items<1>=8, LEAF accuracy=1.00, belief=2860.50 class=[0]\n\n" + } + ], + "source": [ + "%time\n", "clf = Stree(random_state=random_state, use_predictions=False)\n", "clf.fit(X, y)\n", - "accuracy = clf.score(X, y)" + "clf.score(X, y)\n", + "print(clf)" ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "CPU times: user 4 µs, sys: 5 µs, total: 9 µs\nWall time: 12.2 µs\n" + }, + { + "output_type": "execute_result", + "data": { + "text/plain": "0.9979565217391304" + }, + "metadata": {}, + "execution_count": 4 + } + ], + "source": [ + "%time\n", + "clf2 = LinearSVC(random_state=random_state)\n", + "clf2.fit(X, y)\n", + "clf2.score(X, y)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "CPU times: user 13 µs, sys: 5 µs, total: 18 µs\nWall time: 7.87 µs\n" + }, + { + "output_type": "execute_result", + "data": { + "text/plain": "1.0" + }, + "metadata": {}, + "execution_count": 5 + } + ], + "source": [ + "%time\n", + "clf3 = DecisionTreeClassifier(random_state=random_state)\n", + "clf3.fit(X, y)\n", + "clf3.score(X, y)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "root\nroot - Down(array([0, 1]), array([24, 63]))\nroot - Down(array([0, 1]), array([24, 63])) - Down(array([0, 1]), array([ 1, 62]))\nroot - Down(array([0, 1]), array([24, 63])) - Down(array([0, 1]), array([ 1, 62])) - Down(array([0, 1]), array([ 1, 61])), classes=[0 1], items<0>=1, items<1>=61, LEAF accuracy=0.98, belief=61.00 class=[1]\nroot - Down(array([0, 1]), array([24, 63])) - Down(array([0, 1]), array([ 1, 62])) - Up(array([1]), array([1])), class=[1], items=1, rest=0, LEAF accuracy=1.00, belief=1.00 class=[1]\nroot - Down(array([0, 1]), array([24, 63])) - Up(array([0, 1]), array([23, 1]))\nroot - Down(array([0, 1]), array([24, 63])) - Up(array([0, 1]), array([23, 1])) - Down(array([1]), array([1])), class=[1], items=1, rest=0, LEAF accuracy=1.00, belief=1.00 class=[1]\nroot - Down(array([0, 1]), array([24, 63])) - Up(array([0, 1]), array([23, 1])) - Up(array([0]), array([23])), class=[0], items=23, rest=0, LEAF accuracy=1.00, belief=1.00 class=[0]\nroot - Up(array([0, 1]), array([22890, 23]))\nroot - Up(array([0, 1]), array([22890, 23])) - Down(array([0, 1]), array([3, 8]))\nroot - Up(array([0, 1]), array([22890, 23])) - Down(array([0, 1]), array([3, 8])) - Down(array([1]), array([8])), class=[1], items=8, rest=0, LEAF accuracy=1.00, belief=1.00 class=[1]\nroot - Up(array([0, 1]), array([22890, 23])) - Down(array([0, 1]), array([3, 8])) - Up(array([0]), array([3])), class=[0], items=3, rest=0, LEAF accuracy=1.00, belief=1.00 class=[0]\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15]))\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Down(array([0, 1]), array([2, 2]))\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Down(array([0, 1]), array([2, 2])) - Down(array([1]), array([2])), class=[1], items=2, rest=0, LEAF accuracy=1.00, belief=1.00 class=[1]\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Down(array([0, 1]), array([2, 2])) - Up(array([0]), array([2])), class=[0], items=2, rest=0, LEAF accuracy=1.00, belief=1.00 class=[0]\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Up(array([0, 1]), array([22885, 13]))\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Up(array([0, 1]), array([22885, 13])) - Down(array([0, 1]), array([1, 4]))\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Up(array([0, 1]), array([22885, 13])) - Down(array([0, 1]), array([1, 4])) - Down(array([1]), array([4])), class=[1], items=4, rest=0, LEAF accuracy=1.00, belief=1.00 class=[1]\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Up(array([0, 1]), array([22885, 13])) - Down(array([0, 1]), array([1, 4])) - Up(array([0]), array([1])), class=[0], items=1, rest=0, LEAF accuracy=1.00, belief=1.00 class=[0]\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Up(array([0, 1]), array([22885, 13])) - Up(array([0, 1]), array([22884, 9]))\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Up(array([0, 1]), array([22885, 13])) - Up(array([0, 1]), array([22884, 9])) - Down(array([1]), array([1])), class=[1], items=1, rest=0, LEAF accuracy=1.00, belief=1.00 class=[1]\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Up(array([0, 1]), array([22885, 13])) - Up(array([0, 1]), array([22884, 9])) - Up(array([0, 1]), array([22884, 8])), classes=[0 1], items<0>=22884, items<1>=8, LEAF accuracy=1.00, belief=2860.50 class=[0]\n\n" + } + ], + "source": [ + "print(clf)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "22884 8\n" + } + ], + "source": [ + "a=[22884, 8]\n", + "b=max(a)\n", + "c=min(a)\n", + "print(b,c)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": "0.9996505329372707" + }, + "metadata": {}, + "execution_count": 9 + } + ], + "source": [ + "b/(b+c)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": "(23000, 1)" + }, + "metadata": {}, + "execution_count": 10 + } + ], + "source": [ + "y.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "k=y[:4,:]" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": "(4, 1)" + }, + "metadata": {}, + "execution_count": 15 + } + ], + "source": [ + "k.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": "(4,)" + }, + "metadata": {}, + "execution_count": 17 + } + ], + "source": [ + "k.ravel().ravel().shape" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": "array([[0],\n [0]])" + }, + "metadata": {}, + "execution_count": 20 + } + ], + "source": [ + "k[[True, False, True, False]]" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": "(23000, 28)" + }, + "metadata": {}, + "execution_count": 21 + } + ], + "source": [ + "X.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [], + "source": [ + "k = X[(y==1).ravel()]" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": "(86, 28)" + }, + "metadata": {}, + "execution_count": 29 + } + ], + "source": [ + "k.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [], + "source": [ + "indices = np.random.random_integers(0, X.shape[0], 2000)" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": "(2000, 28)" + }, + "metadata": {}, + "execution_count": 39 + } + ], + "source": [ + "X[indices].shape" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [], + "source": [ + "import random\n", + "k=random.shuffle(X)" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [], + "source": [ + "k" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": "[4, 9, 8, 6, 5, 1]" + }, + "metadata": {}, + "execution_count": 45 + } + ], + "source": [ + "random.sample(range(10), 6)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/tests/Snode_test.py b/tests/Snode_test.py index 3e57830..1d71713 100644 --- a/tests/Snode_test.py +++ b/tests/Snode_test.py @@ -26,20 +26,24 @@ class Snode_test(unittest.TestCase): """Check if the attributes in leaves have correct values so they form a predictor """ def check_leave(node: Snode): - if node.is_leaf(): - # Check Belief - classes, card = np.unique(node._y, return_counts=True) - max_card = max(card) - min_card = min(card) - try: - accuracy = max_card / min_card - except: - accuracy = 0 - self.assertEqual(accuracy, node._belief) - # Check Class - class_computed = classes[card == max_card] - self.assertEqual(class_computed, node._class) + if not node.is_leaf(): + check_leave(node.get_down()) + check_leave(node.get_up()) return - check_leave(node.get_down()) - check_leave(node.get_up()) + # Check Belief in leave + classes, card = np.unique(node._y, return_counts=True) + max_card = max(card) + min_card = min(card) + if len(classes) > 1: + try: + belief = max_card / (max_card + min_card) + except: + belief = 0. + else: + belief = 1 + self.assertEqual(belief, node._belief) + # Check Class + class_computed = classes[card == max_card] + self.assertEqual(class_computed, node._class) + check_leave(self._clf._tree) diff --git a/trees/Snode.py b/trees/Snode.py index 38b7a81..037ba6d 100644 --- a/trees/Snode.py +++ b/trees/Snode.py @@ -14,9 +14,9 @@ class Snode: def __init__(self, clf: LinearSVC, X: np.ndarray, y: np.ndarray, title: str): self._clf = clf self._vector = None if clf is None else clf.coef_ - self._interceptor = 0 if clf is None else clf.intercept_ + self._interceptor = 0. if clf is None else clf.intercept_ self._title = title - self._belief = 0 # belief of the prediction in a leaf node based on samples + self._belief = 0. # belief of the prediction in a leaf node based on samples self._X = X self._y = y self._down = None @@ -45,21 +45,20 @@ class Snode: if not self.is_leaf(): return classes, card = np.unique(self._y, return_counts=True) - max_card = max(card) - min_card = min(card) - try: - self._belief = max_card / min_card - except: - self._belief = 0 - self._class = classes[card == max_card] + if len(classes) > 1: + max_card = max(card) + min_card = min(card) + try: + self._belief = max_card / (max_card + min_card) + except: + self._belief = 0. + self._class = classes[card == max_card][0] + else: + self._belief = 1 + self._class = classes[0] def __str__(self) -> str: if self.is_leaf(): - num = 0 - for i in np.unique(self._y): - num = max(num, self._y[self._y == i].shape[0]) - den = self._y.shape[0] - accuracy = num / den if den != 0 else 1 - return f"{self._title} LEAF accuracy={accuracy:.2f}, belief={self._belief:.2f} class={self._class}\n" + return f"Leaf class={self._class} belief={self._belief:.6f} counts={np.unique(self._y, return_counts=True)}\n" else: return f"{self._title}\n" diff --git a/trees/Stree.py b/trees/Stree.py index 1b18802..b648ba1 100644 --- a/trees/Stree.py +++ b/trees/Stree.py @@ -18,8 +18,9 @@ class Stree: """ """ - def __init__(self, max_iter: int = 1000, random_state: int = 0, use_predictions: bool = False): + def __init__(self, C=1.0, max_iter: int = 1000, random_state: int = 0, use_predictions: bool = False): self._max_iter = max_iter + self._C = C self._random_state = random_state self._outcomes = None self._tree = None @@ -46,7 +47,7 @@ class Stree: return [X_up, y_up, X_down, y_down] def fit(self, X: np.ndarray, y: np.ndarray, title: str = 'root') -> 'Stree': - self._tree = self.train(X, y, title) + self._tree = self.train(X, y.ravel(), title) self._build_predictor() self.__trained = True return self @@ -65,20 +66,18 @@ class Stree: def train(self, X: np.ndarray, y: np.ndarray, title: str = 'root') -> Snode: if np.unique(y).shape[0] == 1: # only 1 class => pure dataset - return Snode(None, X, y, title + f', class={np.unique(y)}, items={y.shape[0]}, rest=0, ') + return Snode(None, X, y, title + ', ') # Train the model - clf = LinearSVC(max_iter=self._max_iter, + clf = LinearSVC(max_iter=self._max_iter, C=self._C, random_state=self._random_state) clf.fit(X, y) tree = Snode(clf, X, y, title) X_U, y_u, X_D, y_d = self._split_data(clf, X, y) if X_U is None or X_D is None: # didn't part anything - return Snode(clf, X, y, title + f', classes={np.unique(y)}, items<0>={y[y==0].shape[0]}, items<1>={y[y==1].shape[0]}, ') - tree.set_up(self.train(X_U, y_u, title + ' - Up' + - str(np.unique(y_u, return_counts=True)))) - tree.set_down(self.train(X_D, y_d, title + ' - Down' + - str(np.unique(y_d, return_counts=True)))) + return Snode(clf, X, y, title + ', ') + tree.set_up(self.train(X_U, y_u, title + ' - Up')) + tree.set_down(self.train(X_D, y_d, title + ' - Down')) return tree def predict(self, X: np.array) -> np.array: @@ -95,23 +94,36 @@ class Stree: return y def score(self, X: np.array, y: np.array, print_out=True) -> float: - self.fit(X, y) - yp = self.predict(X) + if not self.__trained: + self.fit(X, y) + yp = self.predict(X).reshape(y.shape) right = (yp == y).astype(int) - accuracy = sum(right) / len(y) + accuracy = np.sum(right) / len(y) if print_out: print(f"Accuracy: {accuracy:.6f}") return accuracy - def __str__(self): - def print_tree(tree: Snode) -> str: + def __print_tree(self, tree: Snode, only_leaves=False) -> str: + if not only_leaves: output = str(tree) - if tree.is_leaf(): - return output - output += print_tree(tree.get_down()) - output += print_tree(tree.get_up()) + else: + output = '' + if tree.is_leaf(): + if only_leaves: + output = str(tree) return output - return print_tree(self._tree) + output += self.__print_tree(tree.get_down(), only_leaves) + output += self.__print_tree(tree.get_up(), only_leaves) + return output + + def show_tree(self, only_leaves=False): + if only_leaves: + print(self.__print_tree(self._tree, only_leaves=True)) + else: + print(self) + + def __str__(self): + return self.__print_tree(self._tree) def _save_datasets(self, tree: Snode, catalog: typing.TextIO, number: int): """Save the dataset of the node in a csv file