diff --git a/README.md b/README.md index d9e2ffd..3bbea76 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,14 @@ # STree Oblique Tree classifier based on SVM nodes + +## Example + +```python +python main.py +``` + +## Tests + +```python +python -m unittest tests.Stree_test tests.Snode_test +``` diff --git a/main.py b/main.py index f98e6ec..9179ccb 100644 --- a/main.py +++ b/main.py @@ -33,14 +33,15 @@ def load_creditcard(n_examples=0): print("Fraud: {0:.3f}% {1}".format(len(y[y == 1])*100/X.shape[0], len(y[y == 1]))) print("Valid: {0:.3f}% {1}".format(len(y[y == 0])*100/X.shape[0], len(y[y == 0]))) return X, y -#X, y = load_creditcard(-5000) -#X, y = load_creditcard(0) +X, y = load_creditcard(-5000) +#X, y = load_creditcard() clf = Stree(C=.01, max_iter=100, random_state=random_state) clf.fit(X, y) print(clf) -clf.show_tree() -clf.save_sub_datasets() -print(f"Predicting {y[0]} we have {clf.predict(X[0, :].reshape(-1, X.shape[1]))}") +#clf.show_tree() +#clf.save_sub_datasets() +yp = clf.predict_proba(X[0, :].reshape(-1, X.shape[1])) +print(f"Predicting {y[0]} we have {yp[0, 0]} with {yp[0, 1]} of belief") print(f"Classifier's accuracy: {clf.score(X, y, print_out=False):.4f}") clf.show_tree(only_leaves=True) diff --git a/test.ipynb b/test.ipynb index 9d9e0ad..0cb291e 100644 --- a/test.ipynb +++ b/test.ipynb @@ -1,5 +1,20 @@ { "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from sklearn.svm import LinearSVC\n", + "from sklearn.tree import DecisionTreeClassifier\n", + "from sklearn.datasets import make_classification, load_iris, load_wine\n", + "from trees.Stree import Stree\n", + "import time" + ] + }, { "cell_type": "code", "execution_count": 2, @@ -8,22 +23,14 @@ { "output_type": "stream", "name": "stdout", - "text": "Fraud: 0.173% 492\nValid: 99.827% 284315\nCPU times: user 2 µs, sys: 0 ns, total: 2 µs\nWall time: 5.96 µs\n" + "text": "*Original Fraud: 0.173% 492\n*Original Valid: 99.827% 284315\nX.shape (284807, 28) y.shape (284807, 1)\n-Generated Fraud: 0.173% 492\n-Generated Valid: 99.827% 284315\n" } ], "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "from sklearn.svm import LinearSVC\n", - "from sklearn.tree import DecisionTreeClassifier\n", - "from sklearn.datasets import make_classification, load_iris, load_wine\n", - "from trees.Stree import Stree\n", - "\n", - "\n", "def load_creditcard(n_examples=0):\n", " df = pd.read_csv('data/creditcard.csv')\n", - " print(\"Fraud: {0:.3f}% {1}\".format(df.Class[df.Class == 1].count()*100/df.shape[0], df.Class[df.Class == 1].count()))\n", - " print(\"Valid: {0:.3f}% {1}\".format(df.Class[df.Class == 0].count()*100/df.shape[0], df.Class[df.Class == 0].count()))\n", + " print(\"*Original Fraud: {0:.3f}% {1}\".format(df.Class[df.Class == 1].count()*100/df.shape[0], df.Class[df.Class == 1].count()))\n", + " print(\"*Original Valid: {0:.3f}% {1}\".format(df.Class[df.Class == 0].count()*100/df.shape[0], df.Class[df.Class == 0].count()))\n", " y = np.expand_dims(df.Class.values, axis=1)\n", " X = df.drop(['Class', 'Time', 'Amount'], axis=1).values\n", " #Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=random_state, stratify=y)\n", @@ -39,12 +46,13 @@ " X = np.append(Xt, X[indices], axis=0)\n", " y = np.append(yt, y[indices], axis=0)\n", " print(\"X.shape\", X.shape, \" y.shape\", y.shape)\n", - " print(\"Fraud: {0:.3f}% {1}\".format(len(y[y == 1])*100/X.shape[0], len(y[y == 1])))\n", - " print(\"Valid: {0:.3f}% {1}\".format(len(y[y == 0])*100/X.shape[0], len(y[y == 0])))\n", + " print(\"-Generated Fraud: {0:.3f}% {1}\".format(len(y[y == 1])*100/X.shape[0], len(y[y == 1])))\n", + " print(\"-Generated Valid: {0:.3f}% {1}\".format(len(y[y == 0])*100/X.shape[0], len(y[y == 0])))\n", " return X, y\n", "\n", "random_state = 1\n", "\n", + "\n", "# Datasets\n", "\n", "#X, y = make_classification(n_samples=1500, n_features=3, n_informative=3, \n", @@ -54,8 +62,7 @@ "#X, y = load_wine(return_X_y=True)\n", "#X, y = load_iris(return_X_y=True)\n", "\n", - "X, y = load_creditcard(23000)\n", - "%time" + "X, y = load_creditcard()" ] }, { @@ -66,15 +73,15 @@ { "output_type": "stream", "name": "stdout", - "text": "CPU times: user 3 µs, sys: 0 ns, total: 3 µs\nWall time: 5.01 µs\nAccuracy: 0.999609\nroot\nroot - Down(array([0, 1]), array([24, 63]))\nroot - Down(array([0, 1]), array([24, 63])) - Down(array([0, 1]), array([ 1, 62]))\nroot - Down(array([0, 1]), array([24, 63])) - Down(array([0, 1]), array([ 1, 62])) - Down(array([0, 1]), array([ 1, 61])), classes=[0 1], items<0>=1, items<1>=61, LEAF accuracy=0.98, belief=61.00 class=[1]\nroot - Down(array([0, 1]), array([24, 63])) - Down(array([0, 1]), array([ 1, 62])) - Up(array([1]), array([1])), class=[1], items=1, rest=0, LEAF accuracy=1.00, belief=1.00 class=[1]\nroot - Down(array([0, 1]), array([24, 63])) - Up(array([0, 1]), array([23, 1]))\nroot - Down(array([0, 1]), array([24, 63])) - Up(array([0, 1]), array([23, 1])) - Down(array([1]), array([1])), class=[1], items=1, rest=0, LEAF accuracy=1.00, belief=1.00 class=[1]\nroot - Down(array([0, 1]), array([24, 63])) - Up(array([0, 1]), array([23, 1])) - Up(array([0]), array([23])), class=[0], items=23, rest=0, LEAF accuracy=1.00, belief=1.00 class=[0]\nroot - Up(array([0, 1]), array([22890, 23]))\nroot - Up(array([0, 1]), array([22890, 23])) - Down(array([0, 1]), array([3, 8]))\nroot - Up(array([0, 1]), array([22890, 23])) - Down(array([0, 1]), array([3, 8])) - Down(array([1]), array([8])), class=[1], items=8, rest=0, LEAF accuracy=1.00, belief=1.00 class=[1]\nroot - Up(array([0, 1]), array([22890, 23])) - Down(array([0, 1]), array([3, 8])) - Up(array([0]), array([3])), class=[0], items=3, rest=0, LEAF accuracy=1.00, belief=1.00 class=[0]\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15]))\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Down(array([0, 1]), array([2, 2]))\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Down(array([0, 1]), array([2, 2])) - Down(array([1]), array([2])), class=[1], items=2, rest=0, LEAF accuracy=1.00, belief=1.00 class=[1]\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Down(array([0, 1]), array([2, 2])) - Up(array([0]), array([2])), class=[0], items=2, rest=0, LEAF accuracy=1.00, belief=1.00 class=[0]\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Up(array([0, 1]), array([22885, 13]))\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Up(array([0, 1]), array([22885, 13])) - Down(array([0, 1]), array([1, 4]))\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Up(array([0, 1]), array([22885, 13])) - Down(array([0, 1]), array([1, 4])) - Down(array([1]), array([4])), class=[1], items=4, rest=0, LEAF accuracy=1.00, belief=1.00 class=[1]\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Up(array([0, 1]), array([22885, 13])) - Down(array([0, 1]), array([1, 4])) - Up(array([0]), array([1])), class=[0], items=1, rest=0, LEAF accuracy=1.00, belief=1.00 class=[0]\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Up(array([0, 1]), array([22885, 13])) - Up(array([0, 1]), array([22884, 9]))\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Up(array([0, 1]), array([22885, 13])) - Up(array([0, 1]), array([22884, 9])) - Down(array([1]), array([1])), class=[1], items=1, rest=0, LEAF accuracy=1.00, belief=1.00 class=[1]\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Up(array([0, 1]), array([22885, 13])) - Up(array([0, 1]), array([22884, 9])) - Up(array([0, 1]), array([22884, 8])), classes=[0 1], items<0>=22884, items<1>=8, LEAF accuracy=1.00, belief=2860.50 class=[0]\n\n" + "text": "root\nroot - Down\nLeaf class=1 belief=0.948980 counts=(array([0, 1]), array([ 15, 279]))\nroot - Down - Up\nLeaf class=1 belief=1.000000 counts=(array([1]), array([3]))\nroot - Down - Up - Up\nLeaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nLeaf class=0 belief=0.920000 counts=(array([0, 1]), array([23, 2]))\nroot - Up\nroot - Up - Down\nroot - Up - Down - Down\nLeaf class=1 belief=0.862069 counts=(array([0, 1]), array([ 16, 100]))\nLeaf class=0 belief=1.000000 counts=(array([0]), array([1]))\nroot - Up - Down - Up\nLeaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nLeaf class=0 belief=0.857143 counts=(array([0, 1]), array([18, 3]))\nLeaf class=0 belief=0.999638 counts=(array([0, 1]), array([284242, 103]))\n\n44.3767 secs\n" } ], "source": [ - "%time\n", - "clf = Stree(random_state=random_state, use_predictions=False)\n", + "t = time.time()\n", + "clf = Stree(C=.01, random_state=random_state, use_predictions=False)\n", "clf.fit(X, y)\n", - "clf.score(X, y)\n", - "print(clf)" + "print(clf)\n", + "print(f\"{time.time() - t:.4f} secs\")" ] }, { @@ -85,22 +92,13 @@ { "output_type": "stream", "name": "stdout", - "text": "CPU times: user 4 µs, sys: 5 µs, total: 9 µs\nWall time: 12.2 µs\n" - }, - { - "output_type": "execute_result", - "data": { - "text/plain": "0.9979565217391304" - }, - "metadata": {}, - "execution_count": 4 + "text": "Accuracy: 0.999512\n33.1651 secs\n" } ], "source": [ - "%time\n", - "clf2 = LinearSVC(random_state=random_state)\n", - "clf2.fit(X, y)\n", - "clf2.score(X, y)" + "t = time.time()\n", + "clf.score(X, y)\n", + "print(f\"{time.time() - t:.4f} secs\")" ] }, { @@ -111,22 +109,14 @@ { "output_type": "stream", "name": "stdout", - "text": "CPU times: user 13 µs, sys: 5 µs, total: 18 µs\nWall time: 7.87 µs\n" - }, - { - "output_type": "execute_result", - "data": { - "text/plain": "1.0" - }, - "metadata": {}, - "execution_count": 5 + "text": "(284807, 2)\n87.5212 secs\n" } ], "source": [ - "%time\n", - "clf3 = DecisionTreeClassifier(random_state=random_state)\n", - "clf3.fit(X, y)\n", - "clf3.score(X, y)" + "t = time.time()\n", + "yp = clf.predict_proba(X)\n", + "print(yp.shape)\n", + "print(f\"{time.time() - t:.4f} secs\")" ] }, { @@ -137,11 +127,15 @@ { "output_type": "stream", "name": "stdout", - "text": "root\nroot - Down(array([0, 1]), array([24, 63]))\nroot - Down(array([0, 1]), array([24, 63])) - Down(array([0, 1]), array([ 1, 62]))\nroot - Down(array([0, 1]), array([24, 63])) - Down(array([0, 1]), array([ 1, 62])) - Down(array([0, 1]), array([ 1, 61])), classes=[0 1], items<0>=1, items<1>=61, LEAF accuracy=0.98, belief=61.00 class=[1]\nroot - Down(array([0, 1]), array([24, 63])) - Down(array([0, 1]), array([ 1, 62])) - Up(array([1]), array([1])), class=[1], items=1, rest=0, LEAF accuracy=1.00, belief=1.00 class=[1]\nroot - Down(array([0, 1]), array([24, 63])) - Up(array([0, 1]), array([23, 1]))\nroot - Down(array([0, 1]), array([24, 63])) - Up(array([0, 1]), array([23, 1])) - Down(array([1]), array([1])), class=[1], items=1, rest=0, LEAF accuracy=1.00, belief=1.00 class=[1]\nroot - Down(array([0, 1]), array([24, 63])) - Up(array([0, 1]), array([23, 1])) - Up(array([0]), array([23])), class=[0], items=23, rest=0, LEAF accuracy=1.00, belief=1.00 class=[0]\nroot - Up(array([0, 1]), array([22890, 23]))\nroot - Up(array([0, 1]), array([22890, 23])) - Down(array([0, 1]), array([3, 8]))\nroot - Up(array([0, 1]), array([22890, 23])) - Down(array([0, 1]), array([3, 8])) - Down(array([1]), array([8])), class=[1], items=8, rest=0, LEAF accuracy=1.00, belief=1.00 class=[1]\nroot - Up(array([0, 1]), array([22890, 23])) - Down(array([0, 1]), array([3, 8])) - Up(array([0]), array([3])), class=[0], items=3, rest=0, LEAF accuracy=1.00, belief=1.00 class=[0]\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15]))\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Down(array([0, 1]), array([2, 2]))\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Down(array([0, 1]), array([2, 2])) - Down(array([1]), array([2])), class=[1], items=2, rest=0, LEAF accuracy=1.00, belief=1.00 class=[1]\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Down(array([0, 1]), array([2, 2])) - Up(array([0]), array([2])), class=[0], items=2, rest=0, LEAF accuracy=1.00, belief=1.00 class=[0]\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Up(array([0, 1]), array([22885, 13]))\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Up(array([0, 1]), array([22885, 13])) - Down(array([0, 1]), array([1, 4]))\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Up(array([0, 1]), array([22885, 13])) - Down(array([0, 1]), array([1, 4])) - Down(array([1]), array([4])), class=[1], items=4, rest=0, LEAF accuracy=1.00, belief=1.00 class=[1]\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Up(array([0, 1]), array([22885, 13])) - Down(array([0, 1]), array([1, 4])) - Up(array([0]), array([1])), class=[0], items=1, rest=0, LEAF accuracy=1.00, belief=1.00 class=[0]\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Up(array([0, 1]), array([22885, 13])) - Up(array([0, 1]), array([22884, 9]))\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Up(array([0, 1]), array([22885, 13])) - Up(array([0, 1]), array([22884, 9])) - Down(array([1]), array([1])), class=[1], items=1, rest=0, LEAF accuracy=1.00, belief=1.00 class=[1]\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Up(array([0, 1]), array([22885, 13])) - Up(array([0, 1]), array([22884, 9])) - Up(array([0, 1]), array([22884, 8])), classes=[0 1], items<0>=22884, items<1>=8, LEAF accuracy=1.00, belief=2860.50 class=[0]\n\n" + "text": "0.9991397683343457\n12.6601 secs\n" } ], "source": [ - "print(clf)" + "t = time.time()\n", + "clf2 = LinearSVC(C=.01, random_state=random_state)\n", + "clf2.fit(X, y)\n", + "print(clf2.score(X, y))\n", + "print(f\"{time.time() - t:.4f} secs\")" ] }, { @@ -152,230 +146,16 @@ { "output_type": "stream", "name": "stdout", - "text": "22884 8\n" + "text": "1.0\n18.2638 secs\n" } ], "source": [ - "a=[22884, 8]\n", - "b=max(a)\n", - "c=min(a)\n", - "print(b,c)" + "t = time.time()\n", + "clf3 = DecisionTreeClassifier(random_state=random_state)\n", + "clf3.fit(X, y)\n", + "print(clf3.score(X, y))\n", + "print(f\"{time.time() - t:.4f} secs\")" ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": "0.9996505329372707" - }, - "metadata": {}, - "execution_count": 9 - } - ], - "source": [ - "b/(b+c)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": "(23000, 1)" - }, - "metadata": {}, - "execution_count": 10 - } - ], - "source": [ - "y.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "k=y[:4,:]" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": "(4, 1)" - }, - "metadata": {}, - "execution_count": 15 - } - ], - "source": [ - "k.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": "(4,)" - }, - "metadata": {}, - "execution_count": 17 - } - ], - "source": [ - "k.ravel().ravel().shape" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": "array([[0],\n [0]])" - }, - "metadata": {}, - "execution_count": 20 - } - ], - "source": [ - "k[[True, False, True, False]]" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": "(23000, 28)" - }, - "metadata": {}, - "execution_count": 21 - } - ], - "source": [ - "X.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [], - "source": [ - "k = X[(y==1).ravel()]" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": "(86, 28)" - }, - "metadata": {}, - "execution_count": 29 - } - ], - "source": [ - "k.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": {}, - "outputs": [], - "source": [ - "indices = np.random.random_integers(0, X.shape[0], 2000)" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": {}, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": "(2000, 28)" - }, - "metadata": {}, - "execution_count": 39 - } - ], - "source": [ - "X[indices].shape" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "metadata": {}, - "outputs": [], - "source": [ - "import random\n", - "k=random.shuffle(X)" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "metadata": {}, - "outputs": [], - "source": [ - "k" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "metadata": {}, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": "[4, 9, 8, 6, 5, 1]" - }, - "metadata": {}, - "execution_count": 45 - } - ], - "source": [ - "random.sample(range(10), 6)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/tests/Snode_test.py b/tests/Snode_test.py index 1d71713..43609bd 100644 --- a/tests/Snode_test.py +++ b/tests/Snode_test.py @@ -1,6 +1,7 @@ import unittest from sklearn.datasets import make_classification +import os import numpy as np import csv @@ -10,12 +11,20 @@ from trees.Stree import Stree, Snode class Snode_test(unittest.TestCase): def __init__(self, *args, **kwargs): + os.environ['TESTING'] = '1' self._random_state = 1 self._clf = Stree(random_state=self._random_state, use_predictions=True) self._clf.fit(*self._get_Xy()) super(Snode_test, self).__init__(*args, **kwargs) + @classmethod + def tearDownClass(cls): + try: + os.environ.pop('TESTING') + except: + pass + def _get_Xy(self): X, y = make_classification(n_samples=1500, n_features=3, n_informative=3, n_redundant=0, n_repeated=0, n_classes=2, n_clusters_per_class=2, diff --git a/tests/Stree_test.py b/tests/Stree_test.py index 489487e..60db265 100644 --- a/tests/Stree_test.py +++ b/tests/Stree_test.py @@ -1,6 +1,7 @@ import unittest from sklearn.datasets import make_classification +import os import numpy as np import csv @@ -10,12 +11,20 @@ from trees.Stree import Stree, Snode class Stree_test(unittest.TestCase): def __init__(self, *args, **kwargs): + os.environ['TESTING'] = '1' self._random_state = 1 self._clf = Stree(random_state=self._random_state, use_predictions=False) self._clf.fit(*self._get_Xy()) super(Stree_test, self).__init__(*args, **kwargs) + @classmethod + def tearDownClass(cls): + try: + os.environ.pop('TESTING') + except: + pass + def _get_Xy(self): X, y = make_classification(n_samples=1500, n_features=3, n_informative=3, n_redundant=0, n_repeated=0, n_classes=2, n_clusters_per_class=2, @@ -112,9 +121,11 @@ class Stree_test(unittest.TestCase): self.assertEqual(yp[0], y[0]) def test_multiple_prediction(self): + # First 27 elements the predictions are the same as the truth + num = 27 X, y = self._get_Xy() - yp = self._clf.predict(X[:23, :]) - self.assertListEqual(y[:23].tolist(), yp.tolist()) + yp = self._clf.predict(X[:num, :]) + self.assertListEqual(y[:num].tolist(), yp.tolist()) def test_score(self): X, y = self._get_Xy() @@ -123,3 +134,26 @@ class Stree_test(unittest.TestCase): right = (yp == y).astype(int) accuracy_computed = sum(right) / len(y) self.assertEqual(accuracy_score, accuracy_computed) + + def test_single_predict_proba(self): + # Element 28 has a different prediction than the truth + X, y = self._get_Xy() + yp = self._clf.predict_proba(X[28, :].reshape(-1, X.shape[1])) + self.assertEqual(0, yp[0:, 0]) + self.assertEqual(0.9282970550576184, yp[0:, 1]) + + def test_multiple_predict_proba(self): + # First 27 elements the predictions are the same as the truth + num = 27 + X, y = self._get_Xy() + yp = self._clf.predict_proba(X[:num, :]) + self.assertListEqual(y[:num].tolist(), yp[:, 0].tolist()) + expected_proba = [0.9759887, 0.92829706, 0.9759887, 0.92829706, 0.92829706, 0.9759887, + 0.92829706, 0.9759887, 0.9759887, 0.9759887, 0.9759887, 0.92829706, + 0.92829706, 0.9759887, 0.92829706, 0.92829706, 0.92829706, 0.92829706, + 0.9759887, 0.92829706, 0.9759887, 0.92829706, 0.92829706, 0.92829706, + 0.92829706, 0.92829706, 0.9759887 ] + self.assertListEqual(expected_proba, np.round(yp[:, 1], decimals=8).tolist()) + + + diff --git a/trees/Snode.py b/trees/Snode.py index 037ba6d..8e1ca59 100644 --- a/trees/Snode.py +++ b/trees/Snode.py @@ -6,6 +6,7 @@ __version__ = "0.9" Node of the Stree (binary tree) ''' +import os import numpy as np from sklearn.svm import LinearSVC @@ -17,11 +18,12 @@ class Snode: self._interceptor = 0. if clf is None else clf.intercept_ self._title = title self._belief = 0. # belief of the prediction in a leaf node based on samples - self._X = X + self._X = X if os.environ.get( + 'TESTING', 'Not Set') != 'Not Set' else None self._y = y self._down = None self._up = None - self._class = None # really needed? + self._class = None def set_down(self, son): self._down = son @@ -42,6 +44,9 @@ class Snode: """Compute the class of the predictor and its belief based on the subdataset of the node only if it is a leaf """ + # Clean memory + #self._X = None + #self._y = None if not self.is_leaf(): return classes, card = np.unique(self._y, return_counts=True) diff --git a/trees/Stree.py b/trees/Stree.py index b648ba1..25c9fe9 100644 --- a/trees/Stree.py +++ b/trees/Stree.py @@ -1,3 +1,4 @@ +# This Python file uses the following encoding: utf-8 ''' __author__ = "Ricardo Montañana Gómez" __copyright__ = "Copyright 2020, Ricardo Montañana Gómez" @@ -10,23 +11,37 @@ Uses LinearSVC import numpy as np import typing from sklearn.svm import LinearSVC +from sklearn.base import BaseEstimator, ClassifierMixin +from sklearn.utils.validation import check_X_y, check_array, check_is_fitted from trees.Snode import Snode -class Stree: +class Stree(BaseEstimator, ClassifierMixin): """ """ - def __init__(self, C=1.0, max_iter: int = 1000, random_state: int = 0, use_predictions: bool = False): + def __init__(self, C=1.0, max_iter: int=1000, random_state: int=0, use_predictions: bool=False): self._max_iter = max_iter self._C = C self._random_state = random_state - self._outcomes = None self._tree = None self.__folder = 'data/' self.__use_predictions = use_predictions self.__trained = False + self.__proba = False + + def get_params(self, deep=True): + """Get dict with hyperparameters and its values to accomplish sklearn rules + """ + return {"C": self._C, "random_state": self._random_state, 'max_iter': self._max_iter} + + def set_params(self, **parameters): + """Set hyperparmeters as specified by sklearn, needed in Gridsearchs + """ + for parameter, value in parameters.items(): + setattr(self, parameter, value) + return self def _split_data(self, clf: LinearSVC, X: np.ndarray, y: np.ndarray) -> list: if self.__use_predictions: @@ -47,6 +62,8 @@ class Stree: return [X_up, y_up, X_down, y_down] def fit(self, X: np.ndarray, y: np.ndarray, title: str = 'root') -> 'Stree': + X, y = check_X_y(X, y) + self.n_features_in_ = X.shape[1] self._tree = self.train(X, y.ravel(), title) self._build_predictor() self.__trained = True @@ -83,16 +100,31 @@ class Stree: def predict(self, X: np.array) -> np.array: def predict_class(xp: np.array, tree: Snode) -> np.array: if tree.is_leaf(): - return tree._class + if self.__proba: + return [tree._class, tree._belief] + else: + return tree._class coef = tree._vector[0, :].reshape(-1, xp.shape[1]) if xp.dot(coef.T) + tree._interceptor[0] > 0: return predict_class(xp, tree.get_down()) return predict_class(xp, tree.get_up()) + + # sklearn check + check_is_fitted(self) + # Input validation + X = check_array(X) + # setup prediction & make it happen y = np.array([], dtype=int) for xp in X: y = np.append(y, predict_class(xp.reshape(-1, X.shape[1]), self._tree)) return y + def predict_proba(self, X: np.array) -> np.array: + self.__proba = True + result = self.predict(X).reshape(X.shape[0], 2) + self.__proba = False + return result + def score(self, X: np.array, y: np.array, print_out=True) -> float: if not self.__trained: self.fit(X, y)