mirror of
https://github.com/Doctorado-ML/STree.git
synced 2025-08-15 15:36:00 +00:00
Implement predict_proba with test.
Fix tree overload with dataset in nodes only needed in tests
This commit is contained in:
12
README.md
12
README.md
@@ -1,2 +1,14 @@
|
||||
# STree
|
||||
Oblique Tree classifier based on SVM nodes
|
||||
|
||||
## Example
|
||||
|
||||
```bash
|
||||
python main.py
|
||||
```
|
||||
|
||||
## Tests
|
||||
|
||||
```bash
|
||||
python -m unittest tests.Stree_test tests.Snode_test
|
||||
```
|
||||
|
11
main.py
11
main.py
@@ -33,14 +33,15 @@ def load_creditcard(n_examples=0):
|
||||
print("Fraud: {0:.3f}% {1}".format(len(y[y == 1])*100/X.shape[0], len(y[y == 1])))
|
||||
print("Valid: {0:.3f}% {1}".format(len(y[y == 0])*100/X.shape[0], len(y[y == 0])))
|
||||
return X, y
|
||||
#X, y = load_creditcard(-5000)
|
||||
#X, y = load_creditcard(0)
|
||||
X, y = load_creditcard(-5000)
|
||||
#X, y = load_creditcard()
|
||||
|
||||
clf = Stree(C=.01, max_iter=100, random_state=random_state)
|
||||
clf.fit(X, y)
|
||||
print(clf)
|
||||
clf.show_tree()
|
||||
clf.save_sub_datasets()
|
||||
print(f"Predicting {y[0]} we have {clf.predict(X[0, :].reshape(-1, X.shape[1]))}")
|
||||
#clf.show_tree()
|
||||
#clf.save_sub_datasets()
|
||||
yp = clf.predict_proba(X[0, :].reshape(-1, X.shape[1]))
|
||||
print(f"Predicting {y[0]} we have {yp[0, 0]} with {yp[0, 1]} of belief")
|
||||
print(f"Classifier's accuracy: {clf.score(X, y, print_out=False):.4f}")
|
||||
clf.show_tree(only_leaves=True)
|
||||
|
316
test.ipynb
316
test.ipynb
@@ -1,5 +1,20 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import numpy as np\n",
|
||||
"import pandas as pd\n",
|
||||
"from sklearn.svm import LinearSVC\n",
|
||||
"from sklearn.tree import DecisionTreeClassifier\n",
|
||||
"from sklearn.datasets import make_classification, load_iris, load_wine\n",
|
||||
"from trees.Stree import Stree\n",
|
||||
"import time"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
@@ -8,22 +23,14 @@
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": "Fraud: 0.173% 492\nValid: 99.827% 284315\nCPU times: user 2 µs, sys: 0 ns, total: 2 µs\nWall time: 5.96 µs\n"
|
||||
"text": "*Original Fraud: 0.173% 492\n*Original Valid: 99.827% 284315\nX.shape (284807, 28) y.shape (284807, 1)\n-Generated Fraud: 0.173% 492\n-Generated Valid: 99.827% 284315\n"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import numpy as np\n",
|
||||
"import pandas as pd\n",
|
||||
"from sklearn.svm import LinearSVC\n",
|
||||
"from sklearn.tree import DecisionTreeClassifier\n",
|
||||
"from sklearn.datasets import make_classification, load_iris, load_wine\n",
|
||||
"from trees.Stree import Stree\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def load_creditcard(n_examples=0):\n",
|
||||
" df = pd.read_csv('data/creditcard.csv')\n",
|
||||
" print(\"Fraud: {0:.3f}% {1}\".format(df.Class[df.Class == 1].count()*100/df.shape[0], df.Class[df.Class == 1].count()))\n",
|
||||
" print(\"Valid: {0:.3f}% {1}\".format(df.Class[df.Class == 0].count()*100/df.shape[0], df.Class[df.Class == 0].count()))\n",
|
||||
" print(\"*Original Fraud: {0:.3f}% {1}\".format(df.Class[df.Class == 1].count()*100/df.shape[0], df.Class[df.Class == 1].count()))\n",
|
||||
" print(\"*Original Valid: {0:.3f}% {1}\".format(df.Class[df.Class == 0].count()*100/df.shape[0], df.Class[df.Class == 0].count()))\n",
|
||||
" y = np.expand_dims(df.Class.values, axis=1)\n",
|
||||
" X = df.drop(['Class', 'Time', 'Amount'], axis=1).values\n",
|
||||
" #Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=random_state, stratify=y)\n",
|
||||
@@ -39,12 +46,13 @@
|
||||
" X = np.append(Xt, X[indices], axis=0)\n",
|
||||
" y = np.append(yt, y[indices], axis=0)\n",
|
||||
" print(\"X.shape\", X.shape, \" y.shape\", y.shape)\n",
|
||||
" print(\"Fraud: {0:.3f}% {1}\".format(len(y[y == 1])*100/X.shape[0], len(y[y == 1])))\n",
|
||||
" print(\"Valid: {0:.3f}% {1}\".format(len(y[y == 0])*100/X.shape[0], len(y[y == 0])))\n",
|
||||
" print(\"-Generated Fraud: {0:.3f}% {1}\".format(len(y[y == 1])*100/X.shape[0], len(y[y == 1])))\n",
|
||||
" print(\"-Generated Valid: {0:.3f}% {1}\".format(len(y[y == 0])*100/X.shape[0], len(y[y == 0])))\n",
|
||||
" return X, y\n",
|
||||
"\n",
|
||||
"random_state = 1\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Datasets\n",
|
||||
"\n",
|
||||
"#X, y = make_classification(n_samples=1500, n_features=3, n_informative=3, \n",
|
||||
@@ -54,8 +62,7 @@
|
||||
"#X, y = load_wine(return_X_y=True)\n",
|
||||
"#X, y = load_iris(return_X_y=True)\n",
|
||||
"\n",
|
||||
"X, y = load_creditcard(23000)\n",
|
||||
"%time"
|
||||
"X, y = load_creditcard()"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -66,15 +73,15 @@
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": "CPU times: user 3 µs, sys: 0 ns, total: 3 µs\nWall time: 5.01 µs\nAccuracy: 0.999609\nroot\nroot - Down(array([0, 1]), array([24, 63]))\nroot - Down(array([0, 1]), array([24, 63])) - Down(array([0, 1]), array([ 1, 62]))\nroot - Down(array([0, 1]), array([24, 63])) - Down(array([0, 1]), array([ 1, 62])) - Down(array([0, 1]), array([ 1, 61])), classes=[0 1], items<0>=1, items<1>=61, <couldn't go any further> LEAF accuracy=0.98, belief=61.00 class=[1]\nroot - Down(array([0, 1]), array([24, 63])) - Down(array([0, 1]), array([ 1, 62])) - Up(array([1]), array([1])), class=[1], items=1, rest=0, <pure> LEAF accuracy=1.00, belief=1.00 class=[1]\nroot - Down(array([0, 1]), array([24, 63])) - Up(array([0, 1]), array([23, 1]))\nroot - Down(array([0, 1]), array([24, 63])) - Up(array([0, 1]), array([23, 1])) - Down(array([1]), array([1])), class=[1], items=1, rest=0, <pure> LEAF accuracy=1.00, belief=1.00 class=[1]\nroot - Down(array([0, 1]), array([24, 63])) - Up(array([0, 1]), array([23, 1])) - Up(array([0]), array([23])), class=[0], items=23, rest=0, <pure> LEAF accuracy=1.00, belief=1.00 class=[0]\nroot - Up(array([0, 1]), array([22890, 23]))\nroot - Up(array([0, 1]), array([22890, 23])) - Down(array([0, 1]), array([3, 8]))\nroot - Up(array([0, 1]), array([22890, 23])) - Down(array([0, 1]), array([3, 8])) - Down(array([1]), array([8])), class=[1], items=8, rest=0, <pure> LEAF accuracy=1.00, belief=1.00 class=[1]\nroot - Up(array([0, 1]), array([22890, 23])) - Down(array([0, 1]), array([3, 8])) - Up(array([0]), array([3])), class=[0], items=3, rest=0, <pure> LEAF accuracy=1.00, belief=1.00 class=[0]\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15]))\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Down(array([0, 1]), array([2, 2]))\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Down(array([0, 1]), array([2, 2])) - Down(array([1]), array([2])), class=[1], 
items=2, rest=0, <pure> LEAF accuracy=1.00, belief=1.00 class=[1]\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Down(array([0, 1]), array([2, 2])) - Up(array([0]), array([2])), class=[0], items=2, rest=0, <pure> LEAF accuracy=1.00, belief=1.00 class=[0]\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Up(array([0, 1]), array([22885, 13]))\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Up(array([0, 1]), array([22885, 13])) - Down(array([0, 1]), array([1, 4]))\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Up(array([0, 1]), array([22885, 13])) - Down(array([0, 1]), array([1, 4])) - Down(array([1]), array([4])), class=[1], items=4, rest=0, <pure> LEAF accuracy=1.00, belief=1.00 class=[1]\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Up(array([0, 1]), array([22885, 13])) - Down(array([0, 1]), array([1, 4])) - Up(array([0]), array([1])), class=[0], items=1, rest=0, <pure> LEAF accuracy=1.00, belief=1.00 class=[0]\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Up(array([0, 1]), array([22885, 13])) - Up(array([0, 1]), array([22884, 9]))\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Up(array([0, 1]), array([22885, 13])) - Up(array([0, 1]), array([22884, 9])) - Down(array([1]), array([1])), class=[1], items=1, rest=0, <pure> LEAF accuracy=1.00, belief=1.00 class=[1]\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Up(array([0, 1]), array([22885, 13])) - Up(array([0, 1]), array([22884, 9])) - Up(array([0, 1]), array([22884, 8])), classes=[0 1], items<0>=22884, items<1>=8, <couldn't go any further> LEAF accuracy=1.00, belief=2860.50 class=[0]\n\n"
|
||||
"text": "root\nroot - Down\nLeaf class=1 belief=0.948980 counts=(array([0, 1]), array([ 15, 279]))\nroot - Down - Up\nLeaf class=1 belief=1.000000 counts=(array([1]), array([3]))\nroot - Down - Up - Up\nLeaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nLeaf class=0 belief=0.920000 counts=(array([0, 1]), array([23, 2]))\nroot - Up\nroot - Up - Down\nroot - Up - Down - Down\nLeaf class=1 belief=0.862069 counts=(array([0, 1]), array([ 16, 100]))\nLeaf class=0 belief=1.000000 counts=(array([0]), array([1]))\nroot - Up - Down - Up\nLeaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nLeaf class=0 belief=0.857143 counts=(array([0, 1]), array([18, 3]))\nLeaf class=0 belief=0.999638 counts=(array([0, 1]), array([284242, 103]))\n\n44.3767 secs\n"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"%time\n",
|
||||
"clf = Stree(random_state=random_state, use_predictions=False)\n",
|
||||
"t = time.time()\n",
|
||||
"clf = Stree(C=.01, random_state=random_state, use_predictions=False)\n",
|
||||
"clf.fit(X, y)\n",
|
||||
"clf.score(X, y)\n",
|
||||
"print(clf)"
|
||||
"print(clf)\n",
|
||||
"print(f\"{time.time() - t:.4f} secs\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -85,22 +92,13 @@
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": "CPU times: user 4 µs, sys: 5 µs, total: 9 µs\nWall time: 12.2 µs\n"
|
||||
},
|
||||
{
|
||||
"output_type": "execute_result",
|
||||
"data": {
|
||||
"text/plain": "0.9979565217391304"
|
||||
},
|
||||
"metadata": {},
|
||||
"execution_count": 4
|
||||
"text": "Accuracy: 0.999512\n33.1651 secs\n"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"%time\n",
|
||||
"clf2 = LinearSVC(random_state=random_state)\n",
|
||||
"clf2.fit(X, y)\n",
|
||||
"clf2.score(X, y)"
|
||||
"t = time.time()\n",
|
||||
"clf.score(X, y)\n",
|
||||
"print(f\"{time.time() - t:.4f} secs\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -111,22 +109,14 @@
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": "CPU times: user 13 µs, sys: 5 µs, total: 18 µs\nWall time: 7.87 µs\n"
|
||||
},
|
||||
{
|
||||
"output_type": "execute_result",
|
||||
"data": {
|
||||
"text/plain": "1.0"
|
||||
},
|
||||
"metadata": {},
|
||||
"execution_count": 5
|
||||
"text": "(284807, 2)\n87.5212 secs\n"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"%time\n",
|
||||
"clf3 = DecisionTreeClassifier(random_state=random_state)\n",
|
||||
"clf3.fit(X, y)\n",
|
||||
"clf3.score(X, y)"
|
||||
"t = time.time()\n",
|
||||
"yp = clf.predict_proba(X)\n",
|
||||
"print(yp.shape)\n",
|
||||
"print(f\"{time.time() - t:.4f} secs\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -137,11 +127,15 @@
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": "root\nroot - Down(array([0, 1]), array([24, 63]))\nroot - Down(array([0, 1]), array([24, 63])) - Down(array([0, 1]), array([ 1, 62]))\nroot - Down(array([0, 1]), array([24, 63])) - Down(array([0, 1]), array([ 1, 62])) - Down(array([0, 1]), array([ 1, 61])), classes=[0 1], items<0>=1, items<1>=61, <couldn't go any further> LEAF accuracy=0.98, belief=61.00 class=[1]\nroot - Down(array([0, 1]), array([24, 63])) - Down(array([0, 1]), array([ 1, 62])) - Up(array([1]), array([1])), class=[1], items=1, rest=0, <pure> LEAF accuracy=1.00, belief=1.00 class=[1]\nroot - Down(array([0, 1]), array([24, 63])) - Up(array([0, 1]), array([23, 1]))\nroot - Down(array([0, 1]), array([24, 63])) - Up(array([0, 1]), array([23, 1])) - Down(array([1]), array([1])), class=[1], items=1, rest=0, <pure> LEAF accuracy=1.00, belief=1.00 class=[1]\nroot - Down(array([0, 1]), array([24, 63])) - Up(array([0, 1]), array([23, 1])) - Up(array([0]), array([23])), class=[0], items=23, rest=0, <pure> LEAF accuracy=1.00, belief=1.00 class=[0]\nroot - Up(array([0, 1]), array([22890, 23]))\nroot - Up(array([0, 1]), array([22890, 23])) - Down(array([0, 1]), array([3, 8]))\nroot - Up(array([0, 1]), array([22890, 23])) - Down(array([0, 1]), array([3, 8])) - Down(array([1]), array([8])), class=[1], items=8, rest=0, <pure> LEAF accuracy=1.00, belief=1.00 class=[1]\nroot - Up(array([0, 1]), array([22890, 23])) - Down(array([0, 1]), array([3, 8])) - Up(array([0]), array([3])), class=[0], items=3, rest=0, <pure> LEAF accuracy=1.00, belief=1.00 class=[0]\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15]))\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Down(array([0, 1]), array([2, 2]))\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Down(array([0, 1]), array([2, 2])) - Down(array([1]), array([2])), class=[1], items=2, rest=0, <pure> LEAF accuracy=1.00, belief=1.00 class=[1]\nroot - 
Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Down(array([0, 1]), array([2, 2])) - Up(array([0]), array([2])), class=[0], items=2, rest=0, <pure> LEAF accuracy=1.00, belief=1.00 class=[0]\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Up(array([0, 1]), array([22885, 13]))\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Up(array([0, 1]), array([22885, 13])) - Down(array([0, 1]), array([1, 4]))\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Up(array([0, 1]), array([22885, 13])) - Down(array([0, 1]), array([1, 4])) - Down(array([1]), array([4])), class=[1], items=4, rest=0, <pure> LEAF accuracy=1.00, belief=1.00 class=[1]\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Up(array([0, 1]), array([22885, 13])) - Down(array([0, 1]), array([1, 4])) - Up(array([0]), array([1])), class=[0], items=1, rest=0, <pure> LEAF accuracy=1.00, belief=1.00 class=[0]\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Up(array([0, 1]), array([22885, 13])) - Up(array([0, 1]), array([22884, 9]))\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Up(array([0, 1]), array([22885, 13])) - Up(array([0, 1]), array([22884, 9])) - Down(array([1]), array([1])), class=[1], items=1, rest=0, <pure> LEAF accuracy=1.00, belief=1.00 class=[1]\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Up(array([0, 1]), array([22885, 13])) - Up(array([0, 1]), array([22884, 9])) - Up(array([0, 1]), array([22884, 8])), classes=[0 1], items<0>=22884, items<1>=8, <couldn't go any further> LEAF accuracy=1.00, belief=2860.50 class=[0]\n\n"
|
||||
"text": "0.9991397683343457\n12.6601 secs\n"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(clf)"
|
||||
"t = time.time()\n",
|
||||
"clf2 = LinearSVC(C=.01, random_state=random_state)\n",
|
||||
"clf2.fit(X, y)\n",
|
||||
"print(clf2.score(X, y))\n",
|
||||
"print(f\"{time.time() - t:.4f} secs\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -152,230 +146,16 @@
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": "22884 8\n"
|
||||
"text": "1.0\n18.2638 secs\n"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"a=[22884, 8]\n",
|
||||
"b=max(a)\n",
|
||||
"c=min(a)\n",
|
||||
"print(b,c)"
|
||||
"t = time.time()\n",
|
||||
"clf3 = DecisionTreeClassifier(random_state=random_state)\n",
|
||||
"clf3.fit(X, y)\n",
|
||||
"print(clf3.score(X, y))\n",
|
||||
"print(f\"{time.time() - t:.4f} secs\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "execute_result",
|
||||
"data": {
|
||||
"text/plain": "0.9996505329372707"
|
||||
},
|
||||
"metadata": {},
|
||||
"execution_count": 9
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"b/(b+c)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "execute_result",
|
||||
"data": {
|
||||
"text/plain": "(23000, 1)"
|
||||
},
|
||||
"metadata": {},
|
||||
"execution_count": 10
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"y.shape"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"k=y[:4,:]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "execute_result",
|
||||
"data": {
|
||||
"text/plain": "(4, 1)"
|
||||
},
|
||||
"metadata": {},
|
||||
"execution_count": 15
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"k.shape"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "execute_result",
|
||||
"data": {
|
||||
"text/plain": "(4,)"
|
||||
},
|
||||
"metadata": {},
|
||||
"execution_count": 17
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"k.ravel().ravel().shape"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 20,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "execute_result",
|
||||
"data": {
|
||||
"text/plain": "array([[0],\n [0]])"
|
||||
},
|
||||
"metadata": {},
|
||||
"execution_count": 20
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"k[[True, False, True, False]]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 21,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "execute_result",
|
||||
"data": {
|
||||
"text/plain": "(23000, 28)"
|
||||
},
|
||||
"metadata": {},
|
||||
"execution_count": 21
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"X.shape"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 28,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"k = X[(y==1).ravel()]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 29,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "execute_result",
|
||||
"data": {
|
||||
"text/plain": "(86, 28)"
|
||||
},
|
||||
"metadata": {},
|
||||
"execution_count": 29
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"k.shape"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 36,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"indices = np.random.random_integers(0, X.shape[0], 2000)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 39,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "execute_result",
|
||||
"data": {
|
||||
"text/plain": "(2000, 28)"
|
||||
},
|
||||
"metadata": {},
|
||||
"execution_count": 39
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"X[indices].shape"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 42,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import random\n",
|
||||
"k=random.shuffle(X)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 43,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"k"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 45,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "execute_result",
|
||||
"data": {
|
||||
"text/plain": "[4, 9, 8, 6, 5, 1]"
|
||||
},
|
||||
"metadata": {},
|
||||
"execution_count": 45
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"random.sample(range(10), 6)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
|
@@ -1,6 +1,7 @@
|
||||
import unittest
|
||||
|
||||
from sklearn.datasets import make_classification
|
||||
import os
|
||||
import numpy as np
|
||||
import csv
|
||||
|
||||
@@ -10,12 +11,20 @@ from trees.Stree import Stree, Snode
|
||||
class Snode_test(unittest.TestCase):
|
||||
|
||||
def __init__(self, *args, **kwargs):
    """Build the test case and train the classifier shared by its tests.

    The base ``TestCase`` is initialised first so that the instance is
    fully set up before any fixture state is attached to it.
    """
    super().__init__(*args, **kwargs)
    # Must be set before fitting: the tree nodes only keep their
    # sub-dataset when the TESTING environment variable is present.
    os.environ['TESTING'] = '1'
    self._random_state = 1
    self._clf = Stree(random_state=self._random_state,
                      use_predictions=True)
    self._clf.fit(*self._get_Xy())
|
||||
|
||||
@classmethod
def tearDownClass(cls):
    """Remove the TESTING flag from the environment once the class is done.

    ``pop`` with a default is used instead of a bare ``except`` so that a
    missing key is tolerated without also hiding unrelated errors.
    """
    os.environ.pop('TESTING', None)
|
||||
|
||||
def _get_Xy(self):
|
||||
X, y = make_classification(n_samples=1500, n_features=3, n_informative=3,
|
||||
n_redundant=0, n_repeated=0, n_classes=2, n_clusters_per_class=2,
|
||||
|
@@ -1,6 +1,7 @@
|
||||
import unittest
|
||||
|
||||
from sklearn.datasets import make_classification
|
||||
import os
|
||||
import numpy as np
|
||||
import csv
|
||||
|
||||
@@ -10,12 +11,20 @@ from trees.Stree import Stree, Snode
|
||||
class Stree_test(unittest.TestCase):
|
||||
|
||||
def __init__(self, *args, **kwargs):
    """Build the test case and train the classifier shared by its tests.

    The base ``TestCase`` is initialised first so that the instance is
    fully set up before any fixture state is attached to it.
    """
    super().__init__(*args, **kwargs)
    # Must be set before fitting: the tree nodes only keep their
    # sub-dataset when the TESTING environment variable is present.
    os.environ['TESTING'] = '1'
    self._random_state = 1
    self._clf = Stree(random_state=self._random_state,
                      use_predictions=False)
    self._clf.fit(*self._get_Xy())
|
||||
|
||||
@classmethod
def tearDownClass(cls):
    """Remove the TESTING flag from the environment once the class is done.

    ``pop`` with a default is used instead of a bare ``except`` so that a
    missing key is tolerated without also hiding unrelated errors.
    """
    os.environ.pop('TESTING', None)
|
||||
|
||||
def _get_Xy(self):
|
||||
X, y = make_classification(n_samples=1500, n_features=3, n_informative=3,
|
||||
n_redundant=0, n_repeated=0, n_classes=2, n_clusters_per_class=2,
|
||||
@@ -112,9 +121,11 @@ class Stree_test(unittest.TestCase):
|
||||
self.assertEqual(yp[0], y[0])
|
||||
|
||||
def test_multiple_prediction(self):
|
||||
# For the first 27 elements the predictions match the ground truth
|
||||
num = 27
|
||||
X, y = self._get_Xy()
|
||||
yp = self._clf.predict(X[:23, :])
|
||||
self.assertListEqual(y[:23].tolist(), yp.tolist())
|
||||
yp = self._clf.predict(X[:num, :])
|
||||
self.assertListEqual(y[:num].tolist(), yp.tolist())
|
||||
|
||||
def test_score(self):
|
||||
X, y = self._get_Xy()
|
||||
@@ -123,3 +134,26 @@ class Stree_test(unittest.TestCase):
|
||||
right = (yp == y).astype(int)
|
||||
accuracy_computed = sum(right) / len(y)
|
||||
self.assertEqual(accuracy_score, accuracy_computed)
|
||||
|
||||
def test_single_predict_proba(self):
    """predict_proba on one sample returns a single (class, belief) row."""
    # Element 28 has a different prediction than the truth
    X, y = self._get_Xy()
    yp = self._clf.predict_proba(X[28, :].reshape(-1, X.shape[1]))
    self.assertEqual((1, 2), yp.shape)
    # Compare scalars, not one-element slices, and allow for float noise
    # in the belief instead of demanding bit-exact equality.
    self.assertEqual(0, yp[0, 0])
    self.assertAlmostEqual(0.9282970550576184, yp[0, 1])
|
||||
|
||||
def test_multiple_predict_proba(self):
    """predict_proba on several samples returns classes and beliefs."""
    # For the first 27 elements the predictions match the ground truth
    num = 27
    X, y = self._get_Xy()
    yp = self._clf.predict_proba(X[:num, :])
    # Column 0 holds the predicted class of every sample
    self.assertListEqual(y[:num].tolist(), yp[:, 0].tolist())
    expected_proba = [
        0.9759887, 0.92829706, 0.9759887, 0.92829706, 0.92829706, 0.9759887,
        0.92829706, 0.9759887, 0.9759887, 0.9759887, 0.9759887, 0.92829706,
        0.92829706, 0.9759887, 0.92829706, 0.92829706, 0.92829706, 0.92829706,
        0.9759887, 0.92829706, 0.9759887, 0.92829706, 0.92829706, 0.92829706,
        0.92829706, 0.92829706, 0.9759887,
    ]
    # Column 1 holds the belief; the expected values are given to 8
    # decimals, so compare with an absolute tolerance rather than
    # rounding and requiring exact float equality.
    np.testing.assert_allclose(yp[:, 1], expected_proba, rtol=0, atol=1e-8)
|
||||
|
||||
|
||||
|
||||
|
@@ -6,6 +6,7 @@ __version__ = "0.9"
|
||||
Node of the Stree (binary tree)
|
||||
'''
|
||||
|
||||
import os
|
||||
import numpy as np
|
||||
from sklearn.svm import LinearSVC
|
||||
|
||||
@@ -17,11 +18,12 @@ class Snode:
|
||||
self._interceptor = 0. if clf is None else clf.intercept_
|
||||
self._title = title
|
||||
self._belief = 0. # belief of the prediction in a leaf node based on samples
|
||||
self._X = X
|
||||
self._X = X if os.environ.get(
|
||||
'TESTING', 'Not Set') != 'Not Set' else None
|
||||
self._y = y
|
||||
self._down = None
|
||||
self._up = None
|
||||
self._class = None # really needed?
|
||||
self._class = None
|
||||
|
||||
def set_down(self, son):
|
||||
self._down = son
|
||||
@@ -42,6 +44,9 @@ class Snode:
|
||||
"""Compute the class of the predictor and its belief based on the subdataset of the node
|
||||
only if it is a leaf
|
||||
"""
|
||||
# Clean memory
|
||||
#self._X = None
|
||||
#self._y = None
|
||||
if not self.is_leaf():
|
||||
return
|
||||
classes, card = np.unique(self._y, return_counts=True)
|
||||
|
@@ -1,3 +1,4 @@
|
||||
# This Python file uses the following encoding: utf-8
|
||||
'''
|
||||
__author__ = "Ricardo Montañana Gómez"
|
||||
__copyright__ = "Copyright 2020, Ricardo Montañana Gómez"
|
||||
@@ -10,23 +11,37 @@ Uses LinearSVC
|
||||
import numpy as np
|
||||
import typing
|
||||
from sklearn.svm import LinearSVC
|
||||
from sklearn.base import BaseEstimator, ClassifierMixin
|
||||
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
|
||||
|
||||
from trees.Snode import Snode
|
||||
|
||||
|
||||
class Stree:
|
||||
class Stree(BaseEstimator, ClassifierMixin):
|
||||
"""
|
||||
"""
|
||||
|
||||
def __init__(self, C=1.0, max_iter: int = 1000, random_state: int = 0, use_predictions: bool = False):
|
||||
def __init__(self, C=1.0, max_iter: int=1000, random_state: int=0, use_predictions: bool=False):
|
||||
self._max_iter = max_iter
|
||||
self._C = C
|
||||
self._random_state = random_state
|
||||
self._outcomes = None
|
||||
self._tree = None
|
||||
self.__folder = 'data/'
|
||||
self.__use_predictions = use_predictions
|
||||
self.__trained = False
|
||||
self.__proba = False
|
||||
|
||||
def get_params(self, deep=True):
    """Return the hyperparameters and their current values as a dict.

    Provided to follow the scikit-learn estimator interface.

    :param deep: accepted for interface compatibility; it is not used
    :return: dict with the keys ``C``, ``random_state`` and ``max_iter``
    """
    return {"C": self._C, "random_state": self._random_state, 'max_iter': self._max_iter}
|
||||
|
||||
def set_params(self, **parameters):
    """Set hyperparameters as specified by scikit-learn (needed by grid searches).

    The hyperparameters are stored in underscore-prefixed attributes
    (``_C``, ``_max_iter``, ``_random_state``), so each public name is
    routed to its private attribute. Writing to ``self.<name>`` would
    leave the values actually used by the estimator untouched.

    :param parameters: hyperparameter names and their new values
    :return: the estimator itself, to allow call chaining
    :raises ValueError: if a name is not a known hyperparameter
    """
    valid = ("C", "max_iter", "random_state")
    # Validate everything first so a bad name leaves the object unchanged
    for parameter in parameters:
        if parameter not in valid:
            raise ValueError(
                "Invalid parameter {0!r}; valid parameters are {1}".format(
                    parameter, list(valid)))
    for parameter, value in parameters.items():
        setattr(self, "_" + parameter, value)
    return self
|
||||
|
||||
def _split_data(self, clf: LinearSVC, X: np.ndarray, y: np.ndarray) -> list:
|
||||
if self.__use_predictions:
|
||||
@@ -47,6 +62,8 @@ class Stree:
|
||||
return [X_up, y_up, X_down, y_down]
|
||||
|
||||
def fit(self, X: np.ndarray, y: np.ndarray, title: str = 'root') -> 'Stree':
|
||||
X, y = check_X_y(X, y)
|
||||
self.n_features_in_ = X.shape[1]
|
||||
self._tree = self.train(X, y.ravel(), title)
|
||||
self._build_predictor()
|
||||
self.__trained = True
|
||||
@@ -83,16 +100,31 @@ class Stree:
|
||||
def predict(self, X: np.array) -> np.array:
    """Predict the outcome of every sample in X by walking the tree.

    When the internal proba flag is set, each sample contributes two
    consecutive values (class, belief) to the flat result; otherwise it
    contributes only the class of the leaf it reaches.

    :param X: samples to predict, one per row
    :return: flat array with the outcomes in sample order
    """
    def predict_class(xp: np.array, tree: Snode) -> np.array:
        # Descend from the given node until a leaf is reached; the sign
        # of the node's hyperplane decides which child comes next.
        while not tree.is_leaf():
            coef = tree._vector[0, :].reshape(-1, xp.shape[1])
            if xp.dot(coef.T) + tree._interceptor[0] > 0:
                tree = tree.get_down()
            else:
                tree = tree.get_up()
        if self.__proba:
            return [tree._class, tree._belief]
        return tree._class

    # sklearn check
    check_is_fitted(self)
    # Input validation
    X = check_array(X)
    # Collect the outcomes in a list and flatten once at the end:
    # appending to a numpy array per sample copies the whole array
    # each time, which is quadratic in the number of samples.
    outcomes = [predict_class(xp.reshape(-1, X.shape[1]), self._tree)
                for xp in X]
    return np.append(np.array([], dtype=int), outcomes)
|
||||
|
||||
def predict_proba(self, X: np.array) -> np.array:
    """Predict the class of every sample together with its belief.

    NOTE: the result is not a per-class probability matrix; each row is
    the pair (predicted class, belief of the leaf that was reached).

    :param X: samples to predict, one per row
    :return: array with one row per sample and two columns
    """
    self.__proba = True
    try:
        flat = self.predict(X)
    finally:
        # Always restore the flag, otherwise a failed call would leave
        # later predict() calls returning (class, belief) pairs.
        self.__proba = False
    # predict yields two values per sample, so the row count follows from
    # the data itself and X does not need to be an ndarray here.
    return flat.reshape(-1, 2)
|
||||
|
||||
def score(self, X: np.array, y: np.array, print_out=True) -> float:
|
||||
if not self.__trained:
|
||||
self.fit(X, y)
|
||||
|
Reference in New Issue
Block a user