mirror of
https://github.com/Doctorado-ML/STree.git
synced 2025-08-15 23:46:02 +00:00
Add C param in constructor and creditcard dataset
This commit is contained in:
44
main.py
44
main.py
@@ -5,8 +5,42 @@ random_state = 1
|
|||||||
X, y = make_classification(n_samples=1500, n_features=3, n_informative=3,
|
X, y = make_classification(n_samples=1500, n_features=3, n_informative=3,
|
||||||
n_redundant=0, n_repeated=0, n_classes=2, n_clusters_per_class=2,
|
n_redundant=0, n_repeated=0, n_classes=2, n_clusters_per_class=2,
|
||||||
class_sep=1.5, flip_y=0, weights=[0.5, 0.5], random_state=random_state)
|
class_sep=1.5, flip_y=0, weights=[0.5, 0.5], random_state=random_state)
|
||||||
model = Stree(random_state=random_state)
|
|
||||||
model.fit(X, y)
|
def load_creditcard(n_examples=0):
|
||||||
print(model)
|
import pandas as pd
|
||||||
model.save_sub_datasets()
|
import numpy as np
|
||||||
print(f"Prediciting [{y[0]}] we have {model.predict(X[0, :].reshape(-1, X.shape[1]))}")
|
import random
|
||||||
|
df = pd.read_csv('data/creditcard.csv')
|
||||||
|
print("Fraud: {0:.3f}% {1}".format(df.Class[df.Class == 1].count()*100/df.shape[0], df.Class[df.Class == 1].count()))
|
||||||
|
print("Valid: {0:.3f}% {1}".format(df.Class[df.Class == 0].count()*100/df.shape[0], df.Class[df.Class == 0].count()))
|
||||||
|
y = np.expand_dims(df.Class.values, axis=1)
|
||||||
|
X = df.drop(['Class', 'Time', 'Amount'], axis=1).values
|
||||||
|
#Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=random_state, stratify=y)
|
||||||
|
#return Xtrain, Xtest, ytrain, ytest
|
||||||
|
if n_examples > 0:
|
||||||
|
# Take first n_examples samples
|
||||||
|
X = X[:n_examples, :]
|
||||||
|
y = y[:n_examples, :]
|
||||||
|
else:
|
||||||
|
# Take all the positive samples with a number of random negatives
|
||||||
|
if n_examples < 0:
|
||||||
|
Xt = X[(y == 1).ravel()]
|
||||||
|
yt = y[(y == 1).ravel()]
|
||||||
|
indices = random.sample(range(X.shape[0]), -1 * n_examples)
|
||||||
|
X = np.append(Xt, X[indices], axis=0)
|
||||||
|
y = np.append(yt, y[indices], axis=0)
|
||||||
|
print("X.shape", X.shape, " y.shape", y.shape)
|
||||||
|
print("Fraud: {0:.3f}% {1}".format(len(y[y == 1])*100/X.shape[0], len(y[y == 1])))
|
||||||
|
print("Valid: {0:.3f}% {1}".format(len(y[y == 0])*100/X.shape[0], len(y[y == 0])))
|
||||||
|
return X, y
|
||||||
|
#X, y = load_creditcard(-5000)
|
||||||
|
#X, y = load_creditcard(0)
|
||||||
|
|
||||||
|
clf = Stree(C=.01, max_iter=100, random_state=random_state)
|
||||||
|
clf.fit(X, y)
|
||||||
|
print(clf)
|
||||||
|
clf.show_tree()
|
||||||
|
clf.save_sub_datasets()
|
||||||
|
print(f"Predicting {y[0]} we have {clf.predict(X[0, :].reshape(-1, X.shape[1]))}")
|
||||||
|
print(f"Classifier's accuracy: {clf.score(X, y, print_out=False):.4f}")
|
||||||
|
clf.show_tree(only_leaves=True)
|
||||||
|
381
test.ipynb
381
test.ipynb
@@ -1,22 +1,5 @@
|
|||||||
{
|
{
|
||||||
"cells": [
|
"cells": [
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 1,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"import numpy as np \n",
|
|
||||||
"from sklearn.svm import LinearSVC\n",
|
|
||||||
"from sklearn.datasets import make_classification\n",
|
|
||||||
"from trees.Stree import Stree\n",
|
|
||||||
"\n",
|
|
||||||
"random_state = 1\n",
|
|
||||||
"X, y = make_classification(n_samples=1500, n_features=3, n_informative=3, \n",
|
|
||||||
" n_redundant=0, n_repeated=0, n_classes=2, n_clusters_per_class=2,\n",
|
|
||||||
" class_sep=1.5, flip_y=0,weights=[0.5,0.5], random_state=random_state)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 2,
|
"execution_count": 2,
|
||||||
@@ -25,14 +8,374 @@
|
|||||||
{
|
{
|
||||||
"output_type": "stream",
|
"output_type": "stream",
|
||||||
"name": "stdout",
|
"name": "stdout",
|
||||||
"text": "Accuracy: 0.950667\n"
|
"text": "Fraud: 0.173% 492\nValid: 99.827% 284315\nCPU times: user 2 µs, sys: 0 ns, total: 2 µs\nWall time: 5.96 µs\n"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
|
"import numpy as np\n",
|
||||||
|
"import pandas as pd\n",
|
||||||
|
"from sklearn.svm import LinearSVC\n",
|
||||||
|
"from sklearn.tree import DecisionTreeClassifier\n",
|
||||||
|
"from sklearn.datasets import make_classification, load_iris, load_wine\n",
|
||||||
|
"from trees.Stree import Stree\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"def load_creditcard(n_examples=0):\n",
|
||||||
|
" df = pd.read_csv('data/creditcard.csv')\n",
|
||||||
|
" print(\"Fraud: {0:.3f}% {1}\".format(df.Class[df.Class == 1].count()*100/df.shape[0], df.Class[df.Class == 1].count()))\n",
|
||||||
|
" print(\"Valid: {0:.3f}% {1}\".format(df.Class[df.Class == 0].count()*100/df.shape[0], df.Class[df.Class == 0].count()))\n",
|
||||||
|
" y = np.expand_dims(df.Class.values, axis=1)\n",
|
||||||
|
" X = df.drop(['Class', 'Time', 'Amount'], axis=1).values\n",
|
||||||
|
" #Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=random_state, stratify=y)\n",
|
||||||
|
" #return Xtrain, Xtest, ytrain, ytest\n",
|
||||||
|
" if n_examples > 0:\n",
|
||||||
|
" X = X[:n_examples, :]\n",
|
||||||
|
" y = y[:n_examples, :]\n",
|
||||||
|
" else:\n",
|
||||||
|
" if n_examples < 0:\n",
|
||||||
|
" Xt = X[(y == 1).ravel()]\n",
|
||||||
|
" yt = y[(y == 1).ravel()]\n",
|
||||||
|
" indices = random.sample(range(X.shape[0]), -1 * n_examples)\n",
|
||||||
|
" X = np.append(Xt, X[indices], axis=0)\n",
|
||||||
|
" y = np.append(yt, y[indices], axis=0)\n",
|
||||||
|
" print(\"X.shape\", X.shape, \" y.shape\", y.shape)\n",
|
||||||
|
" print(\"Fraud: {0:.3f}% {1}\".format(len(y[y == 1])*100/X.shape[0], len(y[y == 1])))\n",
|
||||||
|
" print(\"Valid: {0:.3f}% {1}\".format(len(y[y == 0])*100/X.shape[0], len(y[y == 0])))\n",
|
||||||
|
" return X, y\n",
|
||||||
|
"\n",
|
||||||
|
"random_state = 1\n",
|
||||||
|
"\n",
|
||||||
|
"# Datasets\n",
|
||||||
|
"\n",
|
||||||
|
"#X, y = make_classification(n_samples=1500, n_features=3, n_informative=3, \n",
|
||||||
|
"# n_redundant=0, n_repeated=0, n_classes=2, n_clusters_per_class=2,\n",
|
||||||
|
"# class_sep=1.5, flip_y=0,weights=[0.5,0.5], random_state=random_state)\n",
|
||||||
|
"\n",
|
||||||
|
"#X, y = load_wine(return_X_y=True)\n",
|
||||||
|
"#X, y = load_iris(return_X_y=True)\n",
|
||||||
|
"\n",
|
||||||
|
"X, y = load_creditcard(23000)\n",
|
||||||
|
"%time"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"name": "stdout",
|
||||||
|
"text": "CPU times: user 3 µs, sys: 0 ns, total: 3 µs\nWall time: 5.01 µs\nAccuracy: 0.999609\nroot\nroot - Down(array([0, 1]), array([24, 63]))\nroot - Down(array([0, 1]), array([24, 63])) - Down(array([0, 1]), array([ 1, 62]))\nroot - Down(array([0, 1]), array([24, 63])) - Down(array([0, 1]), array([ 1, 62])) - Down(array([0, 1]), array([ 1, 61])), classes=[0 1], items<0>=1, items<1>=61, <couldn't go any further> LEAF accuracy=0.98, belief=61.00 class=[1]\nroot - Down(array([0, 1]), array([24, 63])) - Down(array([0, 1]), array([ 1, 62])) - Up(array([1]), array([1])), class=[1], items=1, rest=0, <pure> LEAF accuracy=1.00, belief=1.00 class=[1]\nroot - Down(array([0, 1]), array([24, 63])) - Up(array([0, 1]), array([23, 1]))\nroot - Down(array([0, 1]), array([24, 63])) - Up(array([0, 1]), array([23, 1])) - Down(array([1]), array([1])), class=[1], items=1, rest=0, <pure> LEAF accuracy=1.00, belief=1.00 class=[1]\nroot - Down(array([0, 1]), array([24, 63])) - Up(array([0, 1]), array([23, 1])) - Up(array([0]), array([23])), class=[0], items=23, rest=0, <pure> LEAF accuracy=1.00, belief=1.00 class=[0]\nroot - Up(array([0, 1]), array([22890, 23]))\nroot - Up(array([0, 1]), array([22890, 23])) - Down(array([0, 1]), array([3, 8]))\nroot - Up(array([0, 1]), array([22890, 23])) - Down(array([0, 1]), array([3, 8])) - Down(array([1]), array([8])), class=[1], items=8, rest=0, <pure> LEAF accuracy=1.00, belief=1.00 class=[1]\nroot - Up(array([0, 1]), array([22890, 23])) - Down(array([0, 1]), array([3, 8])) - Up(array([0]), array([3])), class=[0], items=3, rest=0, <pure> LEAF accuracy=1.00, belief=1.00 class=[0]\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15]))\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Down(array([0, 1]), array([2, 2]))\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Down(array([0, 1]), array([2, 2])) - Down(array([1]), array([2])), class=[1], items=2, rest=0, <pure> LEAF accuracy=1.00, belief=1.00 class=[1]\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Down(array([0, 1]), array([2, 2])) - Up(array([0]), array([2])), class=[0], items=2, rest=0, <pure> LEAF accuracy=1.00, belief=1.00 class=[0]\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Up(array([0, 1]), array([22885, 13]))\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Up(array([0, 1]), array([22885, 13])) - Down(array([0, 1]), array([1, 4]))\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Up(array([0, 1]), array([22885, 13])) - Down(array([0, 1]), array([1, 4])) - Down(array([1]), array([4])), class=[1], items=4, rest=0, <pure> LEAF accuracy=1.00, belief=1.00 class=[1]\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Up(array([0, 1]), array([22885, 13])) - Down(array([0, 1]), array([1, 4])) - Up(array([0]), array([1])), class=[0], items=1, rest=0, <pure> LEAF accuracy=1.00, belief=1.00 class=[0]\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Up(array([0, 1]), array([22885, 13])) - Up(array([0, 1]), array([22884, 9]))\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Up(array([0, 1]), array([22885, 13])) - Up(array([0, 1]), array([22884, 9])) - Down(array([1]), array([1])), class=[1], items=1, rest=0, <pure> LEAF accuracy=1.00, belief=1.00 class=[1]\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Up(array([0, 1]), array([22885, 13])) - Up(array([0, 1]), array([22884, 9])) - Up(array([0, 1]), array([22884, 8])), classes=[0 1], items<0>=22884, items<1>=8, <couldn't go any further> LEAF accuracy=1.00, belief=2860.50 class=[0]\n\n"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"%time\n",
|
||||||
"clf = Stree(random_state=random_state, use_predictions=False)\n",
|
"clf = Stree(random_state=random_state, use_predictions=False)\n",
|
||||||
"clf.fit(X, y)\n",
|
"clf.fit(X, y)\n",
|
||||||
"accuracy = clf.score(X, y)"
|
"clf.score(X, y)\n",
|
||||||
|
"print(clf)"
|
||||||
]
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"name": "stdout",
|
||||||
|
"text": "CPU times: user 4 µs, sys: 5 µs, total: 9 µs\nWall time: 12.2 µs\n"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"output_type": "execute_result",
|
||||||
|
"data": {
|
||||||
|
"text/plain": "0.9979565217391304"
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"execution_count": 4
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"%time\n",
|
||||||
|
"clf2 = LinearSVC(random_state=random_state)\n",
|
||||||
|
"clf2.fit(X, y)\n",
|
||||||
|
"clf2.score(X, y)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"name": "stdout",
|
||||||
|
"text": "CPU times: user 13 µs, sys: 5 µs, total: 18 µs\nWall time: 7.87 µs\n"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"output_type": "execute_result",
|
||||||
|
"data": {
|
||||||
|
"text/plain": "1.0"
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"execution_count": 5
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"%time\n",
|
||||||
|
"clf3 = DecisionTreeClassifier(random_state=random_state)\n",
|
||||||
|
"clf3.fit(X, y)\n",
|
||||||
|
"clf3.score(X, y)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"name": "stdout",
|
||||||
|
"text": "root\nroot - Down(array([0, 1]), array([24, 63]))\nroot - Down(array([0, 1]), array([24, 63])) - Down(array([0, 1]), array([ 1, 62]))\nroot - Down(array([0, 1]), array([24, 63])) - Down(array([0, 1]), array([ 1, 62])) - Down(array([0, 1]), array([ 1, 61])), classes=[0 1], items<0>=1, items<1>=61, <couldn't go any further> LEAF accuracy=0.98, belief=61.00 class=[1]\nroot - Down(array([0, 1]), array([24, 63])) - Down(array([0, 1]), array([ 1, 62])) - Up(array([1]), array([1])), class=[1], items=1, rest=0, <pure> LEAF accuracy=1.00, belief=1.00 class=[1]\nroot - Down(array([0, 1]), array([24, 63])) - Up(array([0, 1]), array([23, 1]))\nroot - Down(array([0, 1]), array([24, 63])) - Up(array([0, 1]), array([23, 1])) - Down(array([1]), array([1])), class=[1], items=1, rest=0, <pure> LEAF accuracy=1.00, belief=1.00 class=[1]\nroot - Down(array([0, 1]), array([24, 63])) - Up(array([0, 1]), array([23, 1])) - Up(array([0]), array([23])), class=[0], items=23, rest=0, <pure> LEAF accuracy=1.00, belief=1.00 class=[0]\nroot - Up(array([0, 1]), array([22890, 23]))\nroot - Up(array([0, 1]), array([22890, 23])) - Down(array([0, 1]), array([3, 8]))\nroot - Up(array([0, 1]), array([22890, 23])) - Down(array([0, 1]), array([3, 8])) - Down(array([1]), array([8])), class=[1], items=8, rest=0, <pure> LEAF accuracy=1.00, belief=1.00 class=[1]\nroot - Up(array([0, 1]), array([22890, 23])) - Down(array([0, 1]), array([3, 8])) - Up(array([0]), array([3])), class=[0], items=3, rest=0, <pure> LEAF accuracy=1.00, belief=1.00 class=[0]\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15]))\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Down(array([0, 1]), array([2, 2]))\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Down(array([0, 1]), array([2, 2])) - Down(array([1]), array([2])), class=[1], items=2, rest=0, <pure> LEAF accuracy=1.00, belief=1.00 class=[1]\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Down(array([0, 1]), array([2, 2])) - Up(array([0]), array([2])), class=[0], items=2, rest=0, <pure> LEAF accuracy=1.00, belief=1.00 class=[0]\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Up(array([0, 1]), array([22885, 13]))\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Up(array([0, 1]), array([22885, 13])) - Down(array([0, 1]), array([1, 4]))\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Up(array([0, 1]), array([22885, 13])) - Down(array([0, 1]), array([1, 4])) - Down(array([1]), array([4])), class=[1], items=4, rest=0, <pure> LEAF accuracy=1.00, belief=1.00 class=[1]\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Up(array([0, 1]), array([22885, 13])) - Down(array([0, 1]), array([1, 4])) - Up(array([0]), array([1])), class=[0], items=1, rest=0, <pure> LEAF accuracy=1.00, belief=1.00 class=[0]\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Up(array([0, 1]), array([22885, 13])) - Up(array([0, 1]), array([22884, 9]))\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Up(array([0, 1]), array([22885, 13])) - Up(array([0, 1]), array([22884, 9])) - Down(array([1]), array([1])), class=[1], items=1, rest=0, <pure> LEAF accuracy=1.00, belief=1.00 class=[1]\nroot - Up(array([0, 1]), array([22890, 23])) - Up(array([0, 1]), array([22887, 15])) - Up(array([0, 1]), array([22885, 13])) - Up(array([0, 1]), array([22884, 9])) - Up(array([0, 1]), array([22884, 8])), classes=[0 1], items<0>=22884, items<1>=8, <couldn't go any further> LEAF accuracy=1.00, belief=2860.50 class=[0]\n\n"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"print(clf)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"name": "stdout",
|
||||||
|
"text": "22884 8\n"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"a=[22884, 8]\n",
|
||||||
|
"b=max(a)\n",
|
||||||
|
"c=min(a)\n",
|
||||||
|
"print(b,c)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "execute_result",
|
||||||
|
"data": {
|
||||||
|
"text/plain": "0.9996505329372707"
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"execution_count": 9
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"b/(b+c)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 10,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "execute_result",
|
||||||
|
"data": {
|
||||||
|
"text/plain": "(23000, 1)"
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"execution_count": 10
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"y.shape"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"k=y[:4,:]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 15,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "execute_result",
|
||||||
|
"data": {
|
||||||
|
"text/plain": "(4, 1)"
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"execution_count": 15
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"k.shape"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 17,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "execute_result",
|
||||||
|
"data": {
|
||||||
|
"text/plain": "(4,)"
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"execution_count": 17
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"k.ravel().ravel().shape"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 20,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "execute_result",
|
||||||
|
"data": {
|
||||||
|
"text/plain": "array([[0],\n [0]])"
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"execution_count": 20
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"k[[True, False, True, False]]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 21,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "execute_result",
|
||||||
|
"data": {
|
||||||
|
"text/plain": "(23000, 28)"
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"execution_count": 21
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"X.shape"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 28,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"k = X[(y==1).ravel()]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 29,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "execute_result",
|
||||||
|
"data": {
|
||||||
|
"text/plain": "(86, 28)"
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"execution_count": 29
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"k.shape"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 36,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"indices = np.random.random_integers(0, X.shape[0], 2000)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 39,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "execute_result",
|
||||||
|
"data": {
|
||||||
|
"text/plain": "(2000, 28)"
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"execution_count": 39
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"X[indices].shape"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 42,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import random\n",
|
||||||
|
"k=random.shuffle(X)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 43,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"k"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 45,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "execute_result",
|
||||||
|
"data": {
|
||||||
|
"text/plain": "[4, 9, 8, 6, 5, 1]"
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"execution_count": 45
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"random.sample(range(10), 6)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"metadata": {
|
"metadata": {
|
||||||
|
@@ -26,20 +26,24 @@ class Snode_test(unittest.TestCase):
|
|||||||
"""Check if the attributes in leaves have correct values so they form a predictor
|
"""Check if the attributes in leaves have correct values so they form a predictor
|
||||||
"""
|
"""
|
||||||
def check_leave(node: Snode):
|
def check_leave(node: Snode):
|
||||||
if node.is_leaf():
|
if not node.is_leaf():
|
||||||
# Check Belief
|
check_leave(node.get_down())
|
||||||
classes, card = np.unique(node._y, return_counts=True)
|
check_leave(node.get_up())
|
||||||
max_card = max(card)
|
|
||||||
min_card = min(card)
|
|
||||||
try:
|
|
||||||
accuracy = max_card / min_card
|
|
||||||
except:
|
|
||||||
accuracy = 0
|
|
||||||
self.assertEqual(accuracy, node._belief)
|
|
||||||
# Check Class
|
|
||||||
class_computed = classes[card == max_card]
|
|
||||||
self.assertEqual(class_computed, node._class)
|
|
||||||
return
|
return
|
||||||
check_leave(node.get_down())
|
# Check Belief in leave
|
||||||
check_leave(node.get_up())
|
classes, card = np.unique(node._y, return_counts=True)
|
||||||
|
max_card = max(card)
|
||||||
|
min_card = min(card)
|
||||||
|
if len(classes) > 1:
|
||||||
|
try:
|
||||||
|
belief = max_card / (max_card + min_card)
|
||||||
|
except:
|
||||||
|
belief = 0.
|
||||||
|
else:
|
||||||
|
belief = 1
|
||||||
|
self.assertEqual(belief, node._belief)
|
||||||
|
# Check Class
|
||||||
|
class_computed = classes[card == max_card]
|
||||||
|
self.assertEqual(class_computed, node._class)
|
||||||
|
|
||||||
check_leave(self._clf._tree)
|
check_leave(self._clf._tree)
|
||||||
|
@@ -14,9 +14,9 @@ class Snode:
|
|||||||
def __init__(self, clf: LinearSVC, X: np.ndarray, y: np.ndarray, title: str):
|
def __init__(self, clf: LinearSVC, X: np.ndarray, y: np.ndarray, title: str):
|
||||||
self._clf = clf
|
self._clf = clf
|
||||||
self._vector = None if clf is None else clf.coef_
|
self._vector = None if clf is None else clf.coef_
|
||||||
self._interceptor = 0 if clf is None else clf.intercept_
|
self._interceptor = 0. if clf is None else clf.intercept_
|
||||||
self._title = title
|
self._title = title
|
||||||
self._belief = 0 # belief of the prediction in a leaf node based on samples
|
self._belief = 0. # belief of the prediction in a leaf node based on samples
|
||||||
self._X = X
|
self._X = X
|
||||||
self._y = y
|
self._y = y
|
||||||
self._down = None
|
self._down = None
|
||||||
@@ -45,21 +45,20 @@ class Snode:
|
|||||||
if not self.is_leaf():
|
if not self.is_leaf():
|
||||||
return
|
return
|
||||||
classes, card = np.unique(self._y, return_counts=True)
|
classes, card = np.unique(self._y, return_counts=True)
|
||||||
max_card = max(card)
|
if len(classes) > 1:
|
||||||
min_card = min(card)
|
max_card = max(card)
|
||||||
try:
|
min_card = min(card)
|
||||||
self._belief = max_card / min_card
|
try:
|
||||||
except:
|
self._belief = max_card / (max_card + min_card)
|
||||||
self._belief = 0
|
except:
|
||||||
self._class = classes[card == max_card]
|
self._belief = 0.
|
||||||
|
self._class = classes[card == max_card][0]
|
||||||
|
else:
|
||||||
|
self._belief = 1
|
||||||
|
self._class = classes[0]
|
||||||
|
|
||||||
def __str__(self) -> str:
|
def __str__(self) -> str:
|
||||||
if self.is_leaf():
|
if self.is_leaf():
|
||||||
num = 0
|
return f"Leaf class={self._class} belief={self._belief:.6f} counts={np.unique(self._y, return_counts=True)}\n"
|
||||||
for i in np.unique(self._y):
|
|
||||||
num = max(num, self._y[self._y == i].shape[0])
|
|
||||||
den = self._y.shape[0]
|
|
||||||
accuracy = num / den if den != 0 else 1
|
|
||||||
return f"{self._title} LEAF accuracy={accuracy:.2f}, belief={self._belief:.2f} class={self._class}\n"
|
|
||||||
else:
|
else:
|
||||||
return f"{self._title}\n"
|
return f"{self._title}\n"
|
||||||
|
@@ -18,8 +18,9 @@ class Stree:
|
|||||||
"""
|
"""
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, max_iter: int = 1000, random_state: int = 0, use_predictions: bool = False):
|
def __init__(self, C=1.0, max_iter: int = 1000, random_state: int = 0, use_predictions: bool = False):
|
||||||
self._max_iter = max_iter
|
self._max_iter = max_iter
|
||||||
|
self._C = C
|
||||||
self._random_state = random_state
|
self._random_state = random_state
|
||||||
self._outcomes = None
|
self._outcomes = None
|
||||||
self._tree = None
|
self._tree = None
|
||||||
@@ -46,7 +47,7 @@ class Stree:
|
|||||||
return [X_up, y_up, X_down, y_down]
|
return [X_up, y_up, X_down, y_down]
|
||||||
|
|
||||||
def fit(self, X: np.ndarray, y: np.ndarray, title: str = 'root') -> 'Stree':
|
def fit(self, X: np.ndarray, y: np.ndarray, title: str = 'root') -> 'Stree':
|
||||||
self._tree = self.train(X, y, title)
|
self._tree = self.train(X, y.ravel(), title)
|
||||||
self._build_predictor()
|
self._build_predictor()
|
||||||
self.__trained = True
|
self.__trained = True
|
||||||
return self
|
return self
|
||||||
@@ -65,20 +66,18 @@ class Stree:
|
|||||||
def train(self, X: np.ndarray, y: np.ndarray, title: str = 'root') -> Snode:
|
def train(self, X: np.ndarray, y: np.ndarray, title: str = 'root') -> Snode:
|
||||||
if np.unique(y).shape[0] == 1:
|
if np.unique(y).shape[0] == 1:
|
||||||
# only 1 class => pure dataset
|
# only 1 class => pure dataset
|
||||||
return Snode(None, X, y, title + f', class={np.unique(y)}, items={y.shape[0]}, rest=0, <pure> ')
|
return Snode(None, X, y, title + ', <pure> ')
|
||||||
# Train the model
|
# Train the model
|
||||||
clf = LinearSVC(max_iter=self._max_iter,
|
clf = LinearSVC(max_iter=self._max_iter, C=self._C,
|
||||||
random_state=self._random_state)
|
random_state=self._random_state)
|
||||||
clf.fit(X, y)
|
clf.fit(X, y)
|
||||||
tree = Snode(clf, X, y, title)
|
tree = Snode(clf, X, y, title)
|
||||||
X_U, y_u, X_D, y_d = self._split_data(clf, X, y)
|
X_U, y_u, X_D, y_d = self._split_data(clf, X, y)
|
||||||
if X_U is None or X_D is None:
|
if X_U is None or X_D is None:
|
||||||
# didn't part anything
|
# didn't part anything
|
||||||
return Snode(clf, X, y, title + f', classes={np.unique(y)}, items<0>={y[y==0].shape[0]}, items<1>={y[y==1].shape[0]}, <couldn\'t go any further>')
|
return Snode(clf, X, y, title + ', <couldn\'t go any further>')
|
||||||
tree.set_up(self.train(X_U, y_u, title + ' - Up' +
|
tree.set_up(self.train(X_U, y_u, title + ' - Up'))
|
||||||
str(np.unique(y_u, return_counts=True))))
|
tree.set_down(self.train(X_D, y_d, title + ' - Down'))
|
||||||
tree.set_down(self.train(X_D, y_d, title + ' - Down' +
|
|
||||||
str(np.unique(y_d, return_counts=True))))
|
|
||||||
return tree
|
return tree
|
||||||
|
|
||||||
def predict(self, X: np.array) -> np.array:
|
def predict(self, X: np.array) -> np.array:
|
||||||
@@ -95,23 +94,36 @@ class Stree:
|
|||||||
return y
|
return y
|
||||||
|
|
||||||
def score(self, X: np.array, y: np.array, print_out=True) -> float:
|
def score(self, X: np.array, y: np.array, print_out=True) -> float:
|
||||||
self.fit(X, y)
|
if not self.__trained:
|
||||||
yp = self.predict(X)
|
self.fit(X, y)
|
||||||
|
yp = self.predict(X).reshape(y.shape)
|
||||||
right = (yp == y).astype(int)
|
right = (yp == y).astype(int)
|
||||||
accuracy = sum(right) / len(y)
|
accuracy = np.sum(right) / len(y)
|
||||||
if print_out:
|
if print_out:
|
||||||
print(f"Accuracy: {accuracy:.6f}")
|
print(f"Accuracy: {accuracy:.6f}")
|
||||||
return accuracy
|
return accuracy
|
||||||
|
|
||||||
def __str__(self):
|
def __print_tree(self, tree: Snode, only_leaves=False) -> str:
|
||||||
def print_tree(tree: Snode) -> str:
|
if not only_leaves:
|
||||||
output = str(tree)
|
output = str(tree)
|
||||||
if tree.is_leaf():
|
else:
|
||||||
return output
|
output = ''
|
||||||
output += print_tree(tree.get_down())
|
if tree.is_leaf():
|
||||||
output += print_tree(tree.get_up())
|
if only_leaves:
|
||||||
|
output = str(tree)
|
||||||
return output
|
return output
|
||||||
return print_tree(self._tree)
|
output += self.__print_tree(tree.get_down(), only_leaves)
|
||||||
|
output += self.__print_tree(tree.get_up(), only_leaves)
|
||||||
|
return output
|
||||||
|
|
||||||
|
def show_tree(self, only_leaves=False):
|
||||||
|
if only_leaves:
|
||||||
|
print(self.__print_tree(self._tree, only_leaves=True))
|
||||||
|
else:
|
||||||
|
print(self)
|
||||||
|
|
||||||
|
def __str__(self):
|
||||||
|
return self.__print_tree(self._tree)
|
||||||
|
|
||||||
def _save_datasets(self, tree: Snode, catalog: typing.TextIO, number: int):
|
def _save_datasets(self, tree: Snode, catalog: typing.TextIO, number: int):
|
||||||
"""Save the dataset of the node in a csv file
|
"""Save the dataset of the node in a csv file
|
||||||
|
Reference in New Issue
Block a user