Implement predict_proba with test.

Fix tree memory overload: the dataset stored in each node is only needed in tests
This commit is contained in:
2020-05-14 18:42:17 +02:00
parent e3ae3a3a6c
commit e56b955b92
7 changed files with 154 additions and 281 deletions

View File

@@ -1,2 +1,14 @@
# STree # STree
Oblique Tree classifier based on SVM nodes Oblique Tree classifier based on SVM nodes
## Example
```bash
python main.py
```
## Tests
```bash
python -m unittest tests.Stree_test tests.Snode_test
```

11
main.py
View File

@@ -33,14 +33,15 @@ def load_creditcard(n_examples=0):
print("Fraud: {0:.3f}% {1}".format(len(y[y == 1])*100/X.shape[0], len(y[y == 1]))) print("Fraud: {0:.3f}% {1}".format(len(y[y == 1])*100/X.shape[0], len(y[y == 1])))
print("Valid: {0:.3f}% {1}".format(len(y[y == 0])*100/X.shape[0], len(y[y == 0]))) print("Valid: {0:.3f}% {1}".format(len(y[y == 0])*100/X.shape[0], len(y[y == 0])))
return X, y return X, y
#X, y = load_creditcard(-5000) X, y = load_creditcard(-5000)
#X, y = load_creditcard(0) #X, y = load_creditcard()
clf = Stree(C=.01, max_iter=100, random_state=random_state) clf = Stree(C=.01, max_iter=100, random_state=random_state)
clf.fit(X, y) clf.fit(X, y)
print(clf) print(clf)
clf.show_tree() #clf.show_tree()
clf.save_sub_datasets() #clf.save_sub_datasets()
print(f"Predicting {y[0]} we have {clf.predict(X[0, :].reshape(-1, X.shape[1]))}") yp = clf.predict_proba(X[0, :].reshape(-1, X.shape[1]))
print(f"Predicting {y[0]} we have {yp[0, 0]} with {yp[0, 1]} of belief")
print(f"Classifier's accuracy: {clf.score(X, y, print_out=False):.4f}") print(f"Classifier's accuracy: {clf.score(X, y, print_out=False):.4f}")
clf.show_tree(only_leaves=True) clf.show_tree(only_leaves=True)

View File

@@ -1,5 +1,20 @@
{ {
"cells": [ "cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.svm import LinearSVC\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"from sklearn.datasets import make_classification, load_iris, load_wine\n",
"from trees.Stree import Stree\n",
"import time"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 2, "execution_count": 2,
@@ -8,22 +23,14 @@
{ {
"output_type": "stream", "output_type": "stream",
"name": "stdout", "name": "stdout",
"text": "Fraud: 0.173% 492\nValid: 99.827% 284315\nCPU times: user 2 µs, sys: 0 ns, total: 2 µs\nWall time: 5.96 µs\n" "text": "*Original Fraud: 0.173% 492\n*Original Valid: 99.827% 284315\nX.shape (284807, 28) y.shape (284807, 1)\n-Generated Fraud: 0.173% 492\n-Generated Valid: 99.827% 284315\n"
} }
], ],
"source": [ "source": [
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.svm import LinearSVC\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"from sklearn.datasets import make_classification, load_iris, load_wine\n",
"from trees.Stree import Stree\n",
"\n",
"\n",
"def load_creditcard(n_examples=0):\n", "def load_creditcard(n_examples=0):\n",
" df = pd.read_csv('data/creditcard.csv')\n", " df = pd.read_csv('data/creditcard.csv')\n",
" print(\"Fraud: {0:.3f}% {1}\".format(df.Class[df.Class == 1].count()*100/df.shape[0], df.Class[df.Class == 1].count()))\n", " print(\"*Original Fraud: {0:.3f}% {1}\".format(df.Class[df.Class == 1].count()*100/df.shape[0], df.Class[df.Class == 1].count()))\n",
" print(\"Valid: {0:.3f}% {1}\".format(df.Class[df.Class == 0].count()*100/df.shape[0], df.Class[df.Class == 0].count()))\n", " print(\"*Original Valid: {0:.3f}% {1}\".format(df.Class[df.Class == 0].count()*100/df.shape[0], df.Class[df.Class == 0].count()))\n",
" y = np.expand_dims(df.Class.values, axis=1)\n", " y = np.expand_dims(df.Class.values, axis=1)\n",
" X = df.drop(['Class', 'Time', 'Amount'], axis=1).values\n", " X = df.drop(['Class', 'Time', 'Amount'], axis=1).values\n",
" #Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=random_state, stratify=y)\n", " #Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=random_state, stratify=y)\n",
@@ -39,12 +46,13 @@
" X = np.append(Xt, X[indices], axis=0)\n", " X = np.append(Xt, X[indices], axis=0)\n",
" y = np.append(yt, y[indices], axis=0)\n", " y = np.append(yt, y[indices], axis=0)\n",
" print(\"X.shape\", X.shape, \" y.shape\", y.shape)\n", " print(\"X.shape\", X.shape, \" y.shape\", y.shape)\n",
" print(\"Fraud: {0:.3f}% {1}\".format(len(y[y == 1])*100/X.shape[0], len(y[y == 1])))\n", " print(\"-Generated Fraud: {0:.3f}% {1}\".format(len(y[y == 1])*100/X.shape[0], len(y[y == 1])))\n",
" print(\"Valid: {0:.3f}% {1}\".format(len(y[y == 0])*100/X.shape[0], len(y[y == 0])))\n", " print(\"-Generated Valid: {0:.3f}% {1}\".format(len(y[y == 0])*100/X.shape[0], len(y[y == 0])))\n",
" return X, y\n", " return X, y\n",
"\n", "\n",
"random_state = 1\n", "random_state = 1\n",
"\n", "\n",
"\n",
"# Datasets\n", "# Datasets\n",
"\n", "\n",
"#X, y = make_classification(n_samples=1500, n_features=3, n_informative=3, \n", "#X, y = make_classification(n_samples=1500, n_features=3, n_informative=3, \n",
@@ -54,8 +62,7 @@
"#X, y = load_wine(return_X_y=True)\n", "#X, y = load_wine(return_X_y=True)\n",
"#X, y = load_iris(return_X_y=True)\n", "#X, y = load_iris(return_X_y=True)\n",
"\n", "\n",
"X, y = load_creditcard(23000)\n", "X, y = load_creditcard()"
"%time"
] ]
}, },
{ {
@@ -66,15 +73,15 @@
{ {
"output_type": "stream", "output_type": "stream",
"name": "stdout", "name": "stdout",
"text": "CPU times: user 3 µs, sys: 0 ns, total: 3 µs\nWall time: 5.01 µs\nAccuracy: 0.999609\nroot\nroot - Down(array([0, 1]), array([24, 63]))\nroot - Down(array([0, 1]), array([24, 63])) - Down(array([0, 1]), array([ 1, 62]))\nroot - Down(array([0, 1]), array([24, 63])) - Down(array([0, 1]), array([ 1, 62])) - Down(array([0, 1]), array([ 1, 61])), classes=[0 1], items<0>=1, items<1>=61, <couldn't go any further> LEAF accuracy=0.98, belief=61.00 class=[1]\nroot - Down(array([0, 1]), array([24, 63])) - Down(array([0, 1]), array([ 1, 62])) - Up(array([1]), array([1])), class=[1], items=1, rest=0, <pure> LEAF accuracy=1.00, belief=1.00 class=[1]\nroot - Down(array([0, 1]), array([24, 63])) - Up(array([0, 1]), array([23, 1]))\nroot - Down(array([0, 1]), array([24, 63])) - Up(array([0, 1]), array([23, 1])) - Down(array([1]), array([1])), class=[1], items=1, rest=0, <pure> LEAF accuracy=1.00, belief=1.00 class=[1]\nroot - Down(array([0, 1]), array([24, 63])) - Up(array([0, 1]), array([23, 1])) - Up(array([0]), array([23])), class=[0], items=23, rest=0, <pure> LEAF accuracy=1.00, belief=1.00 class=[0]\nroot - Up(array([0, 1]), array([22890, 23]))\nroot - Up(array([0, 1]), array([22890, 23])) - Down(array([0, 1]), array([3, 8]))\nroot - Up(array([0, 1]), array([22890, 23])) - Down(array([0, 1]), array([3, 8])) - Down(array([1]), array([8])), class=[1], items=8, rest=0, <pure> LEAF accuracy=1.00, belief=1.00 class=[1]\nroot - Up(array([0, 1]), array([22890, 23])) - Down(array([0, 1]), array([3, 8])) - Up(array([0]), array([3])), class=[0], items=3, rest=0, <pure> LEAF accuracy=1.00, belief=1.00 class=[0]\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15]))\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Down(array([0, 1]), array([2, 2]))\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Down(array([0, 1]), array([2, 2])) - Down(array([1]), array([2])), class=[1], 
items=2, rest=0, <pure> LEAF accuracy=1.00, belief=1.00 class=[1]\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Down(array([0, 1]), array([2, 2])) - Up(array([0]), array([2])), class=[0], items=2, rest=0, <pure> LEAF accuracy=1.00, belief=1.00 class=[0]\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Up(array([0, 1]), array([22885, 13]))\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Up(array([0, 1]), array([22885, 13])) - Down(array([0, 1]), array([1, 4]))\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Up(array([0, 1]), array([22885, 13])) - Down(array([0, 1]), array([1, 4])) - Down(array([1]), array([4])), class=[1], items=4, rest=0, <pure> LEAF accuracy=1.00, belief=1.00 class=[1]\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Up(array([0, 1]), array([22885, 13])) - Down(array([0, 1]), array([1, 4])) - Up(array([0]), array([1])), class=[0], items=1, rest=0, <pure> LEAF accuracy=1.00, belief=1.00 class=[0]\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Up(array([0, 1]), array([22885, 13])) - Up(array([0, 1]), array([22884, 9]))\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Up(array([0, 1]), array([22885, 13])) - Up(array([0, 1]), array([22884, 9])) - Down(array([1]), array([1])), class=[1], items=1, rest=0, <pure> LEAF accuracy=1.00, belief=1.00 class=[1]\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Up(array([0, 1]), array([22885, 13])) - Up(array([0, 1]), array([22884, 9])) - Up(array([0, 1]), array([22884, 8])), classes=[0 1], items<0>=22884, items<1>=8, <couldn't go any further> LEAF accuracy=1.00, belief=2860.50 class=[0]\n\n" "text": "root\nroot - Down\nLeaf class=1 belief=0.948980 counts=(array([0, 1]), array([ 15, 279]))\nroot - 
Down - Up\nLeaf class=1 belief=1.000000 counts=(array([1]), array([3]))\nroot - Down - Up - Up\nLeaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nLeaf class=0 belief=0.920000 counts=(array([0, 1]), array([23, 2]))\nroot - Up\nroot - Up - Down\nroot - Up - Down - Down\nLeaf class=1 belief=0.862069 counts=(array([0, 1]), array([ 16, 100]))\nLeaf class=0 belief=1.000000 counts=(array([0]), array([1]))\nroot - Up - Down - Up\nLeaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nLeaf class=0 belief=0.857143 counts=(array([0, 1]), array([18, 3]))\nLeaf class=0 belief=0.999638 counts=(array([0, 1]), array([284242, 103]))\n\n44.3767 secs\n"
} }
], ],
"source": [ "source": [
"%time\n", "t = time.time()\n",
"clf = Stree(random_state=random_state, use_predictions=False)\n", "clf = Stree(C=.01, random_state=random_state, use_predictions=False)\n",
"clf.fit(X, y)\n", "clf.fit(X, y)\n",
"clf.score(X, y)\n", "print(clf)\n",
"print(clf)" "print(f\"{time.time() - t:.4f} secs\")"
] ]
}, },
{ {
@@ -85,22 +92,13 @@
{ {
"output_type": "stream", "output_type": "stream",
"name": "stdout", "name": "stdout",
"text": "CPU times: user 4 µs, sys: 5 µs, total: 9 µs\nWall time: 12.2 µs\n" "text": "Accuracy: 0.999512\n33.1651 secs\n"
},
{
"output_type": "execute_result",
"data": {
"text/plain": "0.9979565217391304"
},
"metadata": {},
"execution_count": 4
} }
], ],
"source": [ "source": [
"%time\n", "t = time.time()\n",
"clf2 = LinearSVC(random_state=random_state)\n", "clf.score(X, y)\n",
"clf2.fit(X, y)\n", "print(f\"{time.time() - t:.4f} secs\")"
"clf2.score(X, y)"
] ]
}, },
{ {
@@ -111,22 +109,14 @@
{ {
"output_type": "stream", "output_type": "stream",
"name": "stdout", "name": "stdout",
"text": "CPU times: user 13 µs, sys: 5 µs, total: 18 µs\nWall time: 7.87 µs\n" "text": "(284807, 2)\n87.5212 secs\n"
},
{
"output_type": "execute_result",
"data": {
"text/plain": "1.0"
},
"metadata": {},
"execution_count": 5
} }
], ],
"source": [ "source": [
"%time\n", "t = time.time()\n",
"clf3 = DecisionTreeClassifier(random_state=random_state)\n", "yp = clf.predict_proba(X)\n",
"clf3.fit(X, y)\n", "print(yp.shape)\n",
"clf3.score(X, y)" "print(f\"{time.time() - t:.4f} secs\")"
] ]
}, },
{ {
@@ -137,11 +127,15 @@
{ {
"output_type": "stream", "output_type": "stream",
"name": "stdout", "name": "stdout",
"text": "root\nroot - Down(array([0, 1]), array([24, 63]))\nroot - Down(array([0, 1]), array([24, 63])) - Down(array([0, 1]), array([ 1, 62]))\nroot - Down(array([0, 1]), array([24, 63])) - Down(array([0, 1]), array([ 1, 62])) - Down(array([0, 1]), array([ 1, 61])), classes=[0 1], items<0>=1, items<1>=61, <couldn't go any further> LEAF accuracy=0.98, belief=61.00 class=[1]\nroot - Down(array([0, 1]), array([24, 63])) - Down(array([0, 1]), array([ 1, 62])) - Up(array([1]), array([1])), class=[1], items=1, rest=0, <pure> LEAF accuracy=1.00, belief=1.00 class=[1]\nroot - Down(array([0, 1]), array([24, 63])) - Up(array([0, 1]), array([23, 1]))\nroot - Down(array([0, 1]), array([24, 63])) - Up(array([0, 1]), array([23, 1])) - Down(array([1]), array([1])), class=[1], items=1, rest=0, <pure> LEAF accuracy=1.00, belief=1.00 class=[1]\nroot - Down(array([0, 1]), array([24, 63])) - Up(array([0, 1]), array([23, 1])) - Up(array([0]), array([23])), class=[0], items=23, rest=0, <pure> LEAF accuracy=1.00, belief=1.00 class=[0]\nroot - Up(array([0, 1]), array([22890, 23]))\nroot - Up(array([0, 1]), array([22890, 23])) - Down(array([0, 1]), array([3, 8]))\nroot - Up(array([0, 1]), array([22890, 23])) - Down(array([0, 1]), array([3, 8])) - Down(array([1]), array([8])), class=[1], items=8, rest=0, <pure> LEAF accuracy=1.00, belief=1.00 class=[1]\nroot - Up(array([0, 1]), array([22890, 23])) - Down(array([0, 1]), array([3, 8])) - Up(array([0]), array([3])), class=[0], items=3, rest=0, <pure> LEAF accuracy=1.00, belief=1.00 class=[0]\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15]))\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Down(array([0, 1]), array([2, 2]))\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Down(array([0, 1]), array([2, 2])) - Down(array([1]), array([2])), class=[1], items=2, rest=0, <pure> LEAF accuracy=1.00, belief=1.00 class=[1]\nroot - 
Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Down(array([0, 1]), array([2, 2])) - Up(array([0]), array([2])), class=[0], items=2, rest=0, <pure> LEAF accuracy=1.00, belief=1.00 class=[0]\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Up(array([0, 1]), array([22885, 13]))\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Up(array([0, 1]), array([22885, 13])) - Down(array([0, 1]), array([1, 4]))\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Up(array([0, 1]), array([22885, 13])) - Down(array([0, 1]), array([1, 4])) - Down(array([1]), array([4])), class=[1], items=4, rest=0, <pure> LEAF accuracy=1.00, belief=1.00 class=[1]\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Up(array([0, 1]), array([22885, 13])) - Down(array([0, 1]), array([1, 4])) - Up(array([0]), array([1])), class=[0], items=1, rest=0, <pure> LEAF accuracy=1.00, belief=1.00 class=[0]\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Up(array([0, 1]), array([22885, 13])) - Up(array([0, 1]), array([22884, 9]))\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Up(array([0, 1]), array([22885, 13])) - Up(array([0, 1]), array([22884, 9])) - Down(array([1]), array([1])), class=[1], items=1, rest=0, <pure> LEAF accuracy=1.00, belief=1.00 class=[1]\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Up(array([0, 1]), array([22885, 13])) - Up(array([0, 1]), array([22884, 9])) - Up(array([0, 1]), array([22884, 8])), classes=[0 1], items<0>=22884, items<1>=8, <couldn't go any further> LEAF accuracy=1.00, belief=2860.50 class=[0]\n\n" "text": "0.9991397683343457\n12.6601 secs\n"
} }
], ],
"source": [ "source": [
"print(clf)" "t = time.time()\n",
"clf2 = LinearSVC(C=.01, random_state=random_state)\n",
"clf2.fit(X, y)\n",
"print(clf2.score(X, y))\n",
"print(f\"{time.time() - t:.4f} secs\")"
] ]
}, },
{ {
@@ -152,230 +146,16 @@
{ {
"output_type": "stream", "output_type": "stream",
"name": "stdout", "name": "stdout",
"text": "22884 8\n" "text": "1.0\n18.2638 secs\n"
} }
], ],
"source": [ "source": [
"a=[22884, 8]\n", "t = time.time()\n",
"b=max(a)\n", "clf3 = DecisionTreeClassifier(random_state=random_state)\n",
"c=min(a)\n", "clf3.fit(X, y)\n",
"print(b,c)" "print(clf3.score(X, y))\n",
"print(f\"{time.time() - t:.4f} secs\")"
] ]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "0.9996505329372707"
},
"metadata": {},
"execution_count": 9
}
],
"source": [
"b/(b+c)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "(23000, 1)"
},
"metadata": {},
"execution_count": 10
}
],
"source": [
"y.shape"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"k=y[:4,:]"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "(4, 1)"
},
"metadata": {},
"execution_count": 15
}
],
"source": [
"k.shape"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "(4,)"
},
"metadata": {},
"execution_count": 17
}
],
"source": [
"k.ravel().ravel().shape"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "array([[0],\n [0]])"
},
"metadata": {},
"execution_count": 20
}
],
"source": [
"k[[True, False, True, False]]"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "(23000, 28)"
},
"metadata": {},
"execution_count": 21
}
],
"source": [
"X.shape"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"k = X[(y==1).ravel()]"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "(86, 28)"
},
"metadata": {},
"execution_count": 29
}
],
"source": [
"k.shape"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
"indices = np.random.random_integers(0, X.shape[0], 2000)"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "(2000, 28)"
},
"metadata": {},
"execution_count": 39
}
],
"source": [
"X[indices].shape"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
"import random\n",
"k=random.shuffle(X)"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
"k"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "[4, 9, 8, 6, 5, 1]"
},
"metadata": {},
"execution_count": 45
}
],
"source": [
"random.sample(range(10), 6)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
} }
], ],
"metadata": { "metadata": {

View File

@@ -1,6 +1,7 @@
import unittest import unittest
from sklearn.datasets import make_classification from sklearn.datasets import make_classification
import os
import numpy as np import numpy as np
import csv import csv
@@ -10,12 +11,20 @@ from trees.Stree import Stree, Snode
class Snode_test(unittest.TestCase): class Snode_test(unittest.TestCase):
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
os.environ['TESTING'] = '1'
self._random_state = 1 self._random_state = 1
self._clf = Stree(random_state=self._random_state, self._clf = Stree(random_state=self._random_state,
use_predictions=True) use_predictions=True)
self._clf.fit(*self._get_Xy()) self._clf.fit(*self._get_Xy())
super(Snode_test, self).__init__(*args, **kwargs) super(Snode_test, self).__init__(*args, **kwargs)
@classmethod
def tearDownClass(cls):
    """Remove the ``TESTING`` environment flag set in ``__init__``.

    ``pop`` with a default does nothing when the variable is already
    absent, so no exception has to be caught (and none is swallowed).
    """
    os.environ.pop('TESTING', None)
def _get_Xy(self): def _get_Xy(self):
X, y = make_classification(n_samples=1500, n_features=3, n_informative=3, X, y = make_classification(n_samples=1500, n_features=3, n_informative=3,
n_redundant=0, n_repeated=0, n_classes=2, n_clusters_per_class=2, n_redundant=0, n_repeated=0, n_classes=2, n_clusters_per_class=2,

View File

@@ -1,6 +1,7 @@
import unittest import unittest
from sklearn.datasets import make_classification from sklearn.datasets import make_classification
import os
import numpy as np import numpy as np
import csv import csv
@@ -10,12 +11,20 @@ from trees.Stree import Stree, Snode
class Stree_test(unittest.TestCase): class Stree_test(unittest.TestCase):
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
os.environ['TESTING'] = '1'
self._random_state = 1 self._random_state = 1
self._clf = Stree(random_state=self._random_state, self._clf = Stree(random_state=self._random_state,
use_predictions=False) use_predictions=False)
self._clf.fit(*self._get_Xy()) self._clf.fit(*self._get_Xy())
super(Stree_test, self).__init__(*args, **kwargs) super(Stree_test, self).__init__(*args, **kwargs)
@classmethod
def tearDownClass(cls):
    """Remove the ``TESTING`` environment flag set in ``__init__``.

    ``pop`` with a default does nothing when the variable is already
    absent, so no exception has to be caught (and none is swallowed).
    """
    os.environ.pop('TESTING', None)
def _get_Xy(self): def _get_Xy(self):
X, y = make_classification(n_samples=1500, n_features=3, n_informative=3, X, y = make_classification(n_samples=1500, n_features=3, n_informative=3,
n_redundant=0, n_repeated=0, n_classes=2, n_clusters_per_class=2, n_redundant=0, n_repeated=0, n_classes=2, n_clusters_per_class=2,
@@ -112,9 +121,11 @@ class Stree_test(unittest.TestCase):
self.assertEqual(yp[0], y[0]) self.assertEqual(yp[0], y[0])
def test_multiple_prediction(self): def test_multiple_prediction(self):
# First 27 elements the predictions are the same as the truth
num = 27
X, y = self._get_Xy() X, y = self._get_Xy()
yp = self._clf.predict(X[:23, :]) yp = self._clf.predict(X[:num, :])
self.assertListEqual(y[:23].tolist(), yp.tolist()) self.assertListEqual(y[:num].tolist(), yp.tolist())
def test_score(self): def test_score(self):
X, y = self._get_Xy() X, y = self._get_Xy()
@@ -123,3 +134,26 @@ class Stree_test(unittest.TestCase):
right = (yp == y).astype(int) right = (yp == y).astype(int)
accuracy_computed = sum(right) / len(y) accuracy_computed = sum(right) / len(y)
self.assertEqual(accuracy_score, accuracy_computed) self.assertEqual(accuracy_score, accuracy_computed)
def test_single_predict_proba(self):
# Element 28 has a different prediction than the truth
X, y = self._get_Xy()
yp = self._clf.predict_proba(X[28, :].reshape(-1, X.shape[1]))
self.assertEqual(0, yp[0:, 0])
self.assertEqual(0.9282970550576184, yp[0:, 1])
def test_multiple_predict_proba(self):
# First 27 elements the predictions are the same as the truth
num = 27
X, y = self._get_Xy()
yp = self._clf.predict_proba(X[:num, :])
self.assertListEqual(y[:num].tolist(), yp[:, 0].tolist())
expected_proba = [0.9759887, 0.92829706, 0.9759887, 0.92829706, 0.92829706, 0.9759887,
0.92829706, 0.9759887, 0.9759887, 0.9759887, 0.9759887, 0.92829706,
0.92829706, 0.9759887, 0.92829706, 0.92829706, 0.92829706, 0.92829706,
0.9759887, 0.92829706, 0.9759887, 0.92829706, 0.92829706, 0.92829706,
0.92829706, 0.92829706, 0.9759887 ]
self.assertListEqual(expected_proba, np.round(yp[:, 1], decimals=8).tolist())

View File

@@ -6,6 +6,7 @@ __version__ = "0.9"
Node of the Stree (binary tree) Node of the Stree (binary tree)
''' '''
import os
import numpy as np import numpy as np
from sklearn.svm import LinearSVC from sklearn.svm import LinearSVC
@@ -17,11 +18,12 @@ class Snode:
self._interceptor = 0. if clf is None else clf.intercept_ self._interceptor = 0. if clf is None else clf.intercept_
self._title = title self._title = title
self._belief = 0. # belief of the prediction in a leaf node based on samples self._belief = 0. # belief of the prediction in a leaf node based on samples
self._X = X self._X = X if os.environ.get(
'TESTING', 'Not Set') != 'Not Set' else None
self._y = y self._y = y
self._down = None self._down = None
self._up = None self._up = None
self._class = None # really needed? self._class = None
def set_down(self, son): def set_down(self, son):
self._down = son self._down = son
@@ -42,6 +44,9 @@ class Snode:
"""Compute the class of the predictor and its belief based on the subdataset of the node """Compute the class of the predictor and its belief based on the subdataset of the node
only if it is a leaf only if it is a leaf
""" """
# Clean memory
#self._X = None
#self._y = None
if not self.is_leaf(): if not self.is_leaf():
return return
classes, card = np.unique(self._y, return_counts=True) classes, card = np.unique(self._y, return_counts=True)

View File

@@ -1,3 +1,4 @@
# This Python file uses the following encoding: utf-8
''' '''
__author__ = "Ricardo Montañana Gómez" __author__ = "Ricardo Montañana Gómez"
__copyright__ = "Copyright 2020, Ricardo Montañana Gómez" __copyright__ = "Copyright 2020, Ricardo Montañana Gómez"
@@ -10,23 +11,37 @@ Uses LinearSVC
import numpy as np import numpy as np
import typing import typing
from sklearn.svm import LinearSVC from sklearn.svm import LinearSVC
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from trees.Snode import Snode from trees.Snode import Snode
class Stree: class Stree(BaseEstimator, ClassifierMixin):
""" """
""" """
def __init__(self, C=1.0, max_iter: int = 1000, random_state: int = 0, use_predictions: bool = False): def __init__(self, C=1.0, max_iter: int=1000, random_state: int=0, use_predictions: bool=False):
self._max_iter = max_iter self._max_iter = max_iter
self._C = C self._C = C
self._random_state = random_state self._random_state = random_state
self._outcomes = None
self._tree = None self._tree = None
self.__folder = 'data/' self.__folder = 'data/'
self.__use_predictions = use_predictions self.__use_predictions = use_predictions
self.__trained = False self.__trained = False
self.__proba = False
def get_params(self, deep=True):
    """Return the hyperparameters and their current values as a dict.

    Provided so the estimator follows the sklearn parameter protocol.
    ``deep`` is accepted for signature compatibility and is not used.
    """
    params = {}
    params["C"] = self._C
    params["random_state"] = self._random_state
    params["max_iter"] = self._max_iter
    return params
def set_params(self, **parameters):
    """Set hyperparameters as specified by sklearn, needed in grid searches.

    The hyperparameters reported by ``get_params`` are stored in
    underscore-prefixed attributes (``_C``, ``_max_iter``,
    ``_random_state``), so they are written there; writing ``self.C``
    would leave the value actually held by the estimator untouched.
    Any other name is set verbatim on the instance, as before.

    :param parameters: hyperparameter names mapped to their new values
    :return: the estimator itself, to allow chained calls
    """
    hyperparameters = ("C", "max_iter", "random_state")
    for parameter, value in parameters.items():
        if parameter in hyperparameters:
            target = f"_{parameter}"
        else:
            target = parameter
        setattr(self, target, value)
    return self
def _split_data(self, clf: LinearSVC, X: np.ndarray, y: np.ndarray) -> list: def _split_data(self, clf: LinearSVC, X: np.ndarray, y: np.ndarray) -> list:
if self.__use_predictions: if self.__use_predictions:
@@ -47,6 +62,8 @@ class Stree:
return [X_up, y_up, X_down, y_down] return [X_up, y_up, X_down, y_down]
def fit(self, X: np.ndarray, y: np.ndarray, title: str = 'root') -> 'Stree': def fit(self, X: np.ndarray, y: np.ndarray, title: str = 'root') -> 'Stree':
X, y = check_X_y(X, y)
self.n_features_in_ = X.shape[1]
self._tree = self.train(X, y.ravel(), title) self._tree = self.train(X, y.ravel(), title)
self._build_predictor() self._build_predictor()
self.__trained = True self.__trained = True
@@ -83,16 +100,31 @@ class Stree:
def predict(self, X: np.array) -> np.array: def predict(self, X: np.array) -> np.array:
def predict_class(xp: np.array, tree: Snode) -> np.array: def predict_class(xp: np.array, tree: Snode) -> np.array:
if tree.is_leaf(): if tree.is_leaf():
return tree._class if self.__proba:
return [tree._class, tree._belief]
else:
return tree._class
coef = tree._vector[0, :].reshape(-1, xp.shape[1]) coef = tree._vector[0, :].reshape(-1, xp.shape[1])
if xp.dot(coef.T) + tree._interceptor[0] > 0: if xp.dot(coef.T) + tree._interceptor[0] > 0:
return predict_class(xp, tree.get_down()) return predict_class(xp, tree.get_down())
return predict_class(xp, tree.get_up()) return predict_class(xp, tree.get_up())
# sklearn check
check_is_fitted(self)
# Input validation
X = check_array(X)
# setup prediction & make it happen
y = np.array([], dtype=int) y = np.array([], dtype=int)
for xp in X: for xp in X:
y = np.append(y, predict_class(xp.reshape(-1, X.shape[1]), self._tree)) y = np.append(y, predict_class(xp.reshape(-1, X.shape[1]), self._tree))
return y return y
def predict_proba(self, X: np.array) -> np.array:
    """Predict the class of every sample together with its belief.

    Runs ``predict`` with the internal ``__proba`` flag raised, so each
    leaf contributes ``[class, belief]`` instead of just the class, and
    folds the flat result into one row per sample.

    :param X: samples to predict, one per row
    :return: array with two columns: predicted class and belief
    """
    self.__proba = True
    try:
        flat = self.predict(X)
    finally:
        # Always lower the flag, otherwise a failed call would leave
        # later predict() calls returning class/belief pairs
        self.__proba = False
    # -1 infers the number of rows, so X need not expose .shape
    return flat.reshape(-1, 2)
def score(self, X: np.array, y: np.array, print_out=True) -> float: def score(self, X: np.array, y: np.array, print_out=True) -> float:
if not self.__trained: if not self.__trained:
self.fit(X, y) self.fit(X, y)