mirror of
https://github.com/Doctorado-ML/STree.git
synced 2025-08-15 15:36:00 +00:00
Removed (for now) predict_proba. Created a Jupyter notebook. Added split_criteria parameter with min_distance and max_samples values. Refactored _distances, _split_criteria and _reorder_results.
78 lines
2.2 KiB
Python
78 lines
2.2 KiB
Python
import time
|
|
from sklearn.model_selection import train_test_split
|
|
from stree import Stree
|
|
|
|
# Seed handed to train_test_split and to the Stree constructor further down.
random_state = 1
|
|
|
|
|
|
def _print_class_balance(labels):
    """Print count and percentage of fraud (1) and valid (0) rows in *labels*."""
    total = labels.shape[0]
    for title, value in (("Fraud", 1), ("Valid", 0)):
        count = int((labels == value).sum())
        print("{0}: {1:.3f}% {2}".format(title, count * 100 / total, count))


def load_creditcard(n_examples=0):
    """Load the credit-card dataset and return a stratified 70/30 split.

    Reads ``data/creditcard.csv``, takes the ``Class`` column as the label
    and every remaining column except ``Time`` and ``Amount`` as features.
    The class balance is printed for the full file and again for the
    (possibly subsampled) data that is actually split.

    Parameters
    ----------
    n_examples : int, default 0
        * ``0``   -- use every row.
        * ``> 0`` -- use only the first ``n_examples`` rows.
        * ``< 0`` -- use every positive row (``Class == 1``) plus
          ``-n_examples`` rows drawn without replacement from the rows that
          are not positive.

    Returns
    -------
    tuple
        ``(Xtrain, Xtest, ytrain, ytest)``. Labels keep the column shape
        ``(n, 1)`` produced by ``np.expand_dims``.

    Raises
    ------
    ValueError
        If ``-n_examples`` exceeds the number of non-positive rows.
    """
    import random

    import numpy as np
    import pandas as pd

    df = pd.read_csv("data/creditcard.csv")
    y = np.expand_dims(df.Class.values, axis=1)
    _print_class_balance(y)
    X = df.drop(["Class", "Time", "Amount"], axis=1).values

    if n_examples > 0:
        # Take first n_examples samples
        X = X[:n_examples, :]
        y = y[:n_examples, :]
    elif n_examples < 0:
        # Take all the positive samples with a number of random negatives
        positive = (y == 1).ravel()
        # Draw only from the non-positive rows: sampling from every row
        # could pick positives again and append them a second time, so the
        # same row might land in both the train and the test partition.
        negative_rows = np.flatnonzero(~positive)
        wanted = -n_examples
        if wanted > negative_rows.size:
            raise ValueError(
                "Requested {0} negative samples but only {1} are "
                "available".format(wanted, negative_rows.size)
            )
        # A private generator seeded with the module-level random_state keeps
        # the draw repeatable without touching the global random module state.
        rng = random.Random(random_state)
        picked = rng.sample(negative_rows.tolist(), wanted)
        X = np.concatenate((X[positive], X[picked]), axis=0)
        y = np.concatenate((y[positive], y[picked]), axis=0)

    print("X.shape", X.shape, " y.shape", y.shape)
    _print_class_balance(y)

    Xtrain, Xtest, ytrain, ytest = train_test_split(
        X,
        y,
        train_size=0.7,
        shuffle=True,
        random_state=random_state,
        stratify=y,
    )
    return Xtrain, Xtest, ytrain, ytest
|
|
|
|
|
|
# Driver: load the dataset, train an Stree classifier and report the training
# time together with the train/test accuracy.
#
# Alternative ways of building the dataset:
# data = load_creditcard(-5000)  # Take all true samples + 5000 of the others
# data = load_creditcard(5000)  # Take the first 5000 samples
data = load_creditcard()  # Take all the samples
Xtrain, Xtest, ytrain, ytest = data

# perf_counter() is a monotonic clock meant for measuring intervals, so the
# reported duration cannot be skewed by system clock adjustments.
start = time.perf_counter()
clf = Stree(C=0.01, random_state=random_state)
clf.fit(Xtrain, ytrain)
print(f"Took {time.perf_counter() - start:.2f} seconds to train")
print(clf)
print(f"Classifier's accuracy (train): {clf.score(Xtrain, ytrain):.4f}")
print(f"Classifier's accuracy (test) : {clf.score(Xtest, ytest):.4f}")
|