Mirror of https://github.com/Doctorado-ML/STree.git (synced 2025-08-16 07:56:06 +00:00)

Commit: Implement split data with or without using predictions & some tests
.gitignore (vendored, +5)

@@ -127,3 +127,8 @@ dmypy.json
 # Pyre type checker
 .pyre/

+.idea
+data/*
+
+.vscode
.vscode/settings.json (vendored, deleted, -12)

@@ -1,12 +0,0 @@
-{
-    "python.testing.unittestArgs": [
-        "-v",
-        "-s",
-        "./tests",
-        "-p",
-        "*_test.py"
-    ],
-    "python.testing.pytestEnabled": false,
-    "python.testing.nosetestsEnabled": false,
-    "python.testing.unittestEnabled": true
-}
main.py (+1)

@@ -8,3 +8,4 @@ X, y = make_classification(n_samples=1500, n_features=3, n_informative=3,
 model = Stree(random_state=random_state)
 model.fit(X, y)
 model.show_outcomes()
+model.save_sub_datasets()
test.ipynb (new file, +128)

@@ -0,0 +1,128 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from sklearn.svm import LinearSVC\n",
    "from sklearn.datasets import make_classification\n",
    "\n",
    "random_state = 1\n",
    "X, y = make_classification(n_samples=1500, n_features=3, n_informative=3,\n",
    "                           n_redundant=0, n_repeated=0, n_classes=2, n_clusters_per_class=2,\n",
    "                           class_sep=1.5, flip_y=0, weights=[0.5, 0.5], random_state=random_state)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": "data/dataset1.csv - root\ndata/dataset2.csv - root - Down\ndata/dataset3.csv - root - Down - Down, classes=[0 1], items<0>=17, items<1>=691, <couldn't go any further> LEAF accuracy=0.98\ndata/dataset4.csv - root - Down - Up\ndata/dataset5.csv - root - Down - Up - Down, classes=[0 1], items<0>=1, items<1>=3, <couldn't go any further> LEAF accuracy=0.75\ndata/dataset6.csv - root - Down - Up - Up, class=[0], items=7, rest=0, <pure> LEAF accuracy=1.00\ndata/dataset3.csv - root - Up, classes=[0 1], items<0>=725, items<1>=56, <couldn't go any further> LEAF accuracy=0.93\n"
    }
   ],
   "source": [
    "!cat data/catalog.txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def readsub(name):\n",
    "    data = np.genfromtxt(name, delimiter=',')\n",
    "    data = np.array(data)\n",
    "    py = data[:, data.shape[1] - 1]\n",
    "    px = np.delete(data, data.shape[1] - 1, axis=1)\n",
    "    return px, py\n",
    "\n",
    "def localiza(X, px):\n",
    "    enc = False\n",
    "    for i in range(X.shape[0]):\n",
    "        if all(X[i, :] == px):\n",
    "            enc = True\n",
    "            print(f\" i={i} - X[{i}, :]={X[i, :]} - px={px} - y[{i}]={y[i]}\")\n",
    "    print(\"Encontrado:\", enc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "px, py = readsub('data/dataset5.csv')\n",
    "model = LinearSVC(random_state=1, max_iter=1000)\n",
    "model.fit(px, py)\n",
    "yp = model.predict(px)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": "[1. 1. 1. 1.]\n[1. 1. 0. 1.]\n"
    }
   ],
   "source": [
    "print(yp)\n",
    "print(py)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": "i=1132 - X[1132, :]=[-0.41453617 -0.38206564 0.54849331] - px=[-0.41453617 -0.38206564 0.54849331] - y[1132]=0\nEncontrado: True\n"
    }
   ],
   "source": [
    "localiza(X, px[2, :])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6-final"
  },
  "orig_nbformat": 2,
  "kernelspec": {
   "name": "python37664bitstreevenva9e4a4efdc1042b6b577bd15fbe145ee",
   "display_name": "Python 3.7.6 64-bit ('stree': venv)"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
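The notebook's `localiza` helper traces a sample back to the original dataset with a full linear scan per query, as the test helper `_find_out` below does too. A minimal alternative sketch (illustrative only, not part of the commit; `build_row_index` and `find_labels` are hypothetical names): indexing the original rows by their raw bytes makes each lookup constant-time, assuming the values survive the csv round-trip exactly, which is the same assumption the `==` comparison already makes.

import numpy as np

def build_row_index(X: np.ndarray, y: np.ndarray) -> dict:
    # Map the raw bytes of every original row to its label (exact match)
    return {X[i].tobytes(): y[i] for i in range(X.shape[0])}

def find_labels(px: np.ndarray, index: dict) -> np.ndarray:
    # Look up the original label of every row in px in O(1) per row
    return np.array([index[row.tobytes()] for row in px])

# usage sketch:
#   index = build_row_index(X, y)
#   y_original = find_labels(px, index)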
@@ -1,14 +1,111 @@
 import unittest
 
-from trees.Stree import Stree
+from sklearn.svm import LinearSVC
+from sklearn.datasets import make_classification
+import numpy as np
+import csv
+
+from trees.Stree import Stree, Snode
 
 
 class Stree_test(unittest.TestCase):
 
     def __init__(self, *args, **kwargs):
-        self.random_state = 17
-        self._model = Stree(random_state=self.random_state)
+        self._random_state = 1
+        self._model_tree = Stree(random_state=self._random_state, use_predictions=True)
+        self._model_tree.fit(*self._get_Xy())
+        self._model_svm = LinearSVC(random_state=self._random_state, max_iter=self._model_tree._max_iter)
         super(Stree_test, self).__init__(*args, **kwargs)
 
+    def _get_Xy(self):
+        X, y = make_classification(n_samples=1500, n_features=3, n_informative=3,
+                                   n_redundant=0, n_repeated=0, n_classes=2, n_clusters_per_class=2,
+                                   class_sep=1.5, flip_y=0, weights=[0.5, 0.5], random_state=self._random_state)
+        return X, y
+
     def test_split_data(self):
         self.assertTrue(True)
+
+    def _check_tree(self, node: Snode):
+        if node.is_leaf():
+            return
+        self._model_svm.fit(node._X, node._y)
+        y_prediction = self._model_svm.predict(node._X)
+        y_down = node.get_down()._y
+        y_up = node.get_up()._y
+        # Is it a correct partition in terms of cardinality?
+        # i.e. the partition algorithm didn't forget any sample
+        self.assertEqual(node._y.shape[0], y_down.shape[0] + y_up.shape[0])
+        unique_y, count_y = np.unique(node._y, return_counts=True)
+        _, count_d = np.unique(y_down, return_counts=True)
+        _, count_u = np.unique(y_up, return_counts=True)
+        for i in unique_y:
+            try:
+                number_down = count_d[i]
+            except IndexError:
+                number_down = 0
+            try:
+                number_up = count_u[i]
+            except IndexError:
+                number_up = 0
+            self.assertEqual(count_y[i], number_down + number_up)
+        # Is the partition the same as the prediction?
+        # (the node is not a leaf, so it has both subtrees)
+        unique_yp, count_yp = np.unique(y_prediction, return_counts=True)
+        self.assertEqual(count_yp[1], y_down.shape[0])
+        self.assertEqual(count_yp[0], y_up.shape[0])
+        self._check_tree(node.get_down())
+        self._check_tree(node.get_up())
+
+    def test_build_tree(self):
+        """Check that the tree is built the same way as the models' predictions
+        """
+        self._check_tree(self._model_tree._tree)
+
+    def _get_file_data(self, file_name: str) -> tuple:
+        """Return X, y from data; y is the last column in the array
+
+        Arguments:
+            file_name {str} -- the file name
+
+        Returns:
+            tuple -- tuple with samples, categories
+        """
+        data = np.genfromtxt(file_name, delimiter=',')
+        data = np.array(data)
+        column_y = data.shape[1] - 1
+        fy = data[:, column_y]
+        fx = np.delete(data, column_y, axis=1)
+        return fx, fy
+
+    def _find_out(self, px: np.array, x_original: np.array, y_original) -> list:
+        """Find the original values of y for a given array of samples
+
+        Arguments:
+            px {np.array} -- array of samples to search for
+            x_original {np.array} -- original dataset
+            y_original {np.array} -- original classes
+
+        Returns:
+            list -- classes of the given samples
+        """
+        res = []
+        for needle in px:
+            for row in range(x_original.shape[0]):
+                if all(x_original[row, :] == needle):
+                    res.append(y_original[row])
+        return res
+
+    def test_subdatasets(self):
+        """Check that the sub-dataset files hold the same labels as the original dataset
+        """
+        model = LinearSVC(random_state=self._random_state, max_iter=self._model_tree._max_iter)
+        X, y = self._get_Xy()
+        model.fit(X, y)
+        self._model_tree.save_sub_datasets()
+        with open(self._model_tree.get_catalog_name()) as cat_file:
+            catalog = csv.reader(cat_file, delimiter=',')
+            for row in catalog:
+                X, y = self._get_Xy()
+                x_file, y_file = self._get_file_data(row[0])
+                y_original = np.array(self._find_out(x_file, X, y), dtype=int)
+                self.assertTrue(np.array_equal(y_file, y_original))
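`_check_tree` asserts that the samples a node sends down are exactly those its LinearSVC predicts as class 1. A standalone sketch (not part of the commit) of why that holds for both modes of `_split_data`: for a binary problem with labels {0, 1}, `LinearSVC.predict` thresholds the decision function at zero, which is exactly what the manual dot product against the hyperplane reproduces, so the prediction-based split and the geometric split should select the same samples.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.svm import LinearSVC

X, y = make_classification(n_samples=1500, n_features=3, n_informative=3,
                           n_redundant=0, n_classes=2, random_state=1)
clf = LinearSVC(random_state=1, max_iter=1000).fit(X, y)

# Geometric test, as in the else branch of _split_data
res = X.dot(clf.coef_[0].reshape(-1, 1)) + clf.intercept_[0]
down_geometric = (res > 0).ravel()

# Prediction-based test, as in the use_predictions branch
down_predicted = clf.predict(X) == 1

assert np.array_equal(down_geometric, down_predicted)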
@@ -17,7 +17,7 @@ class Snode:
         self._y = y
         self._down = None
         self._up = None
-        self._class = None
+        self._class = None  # really needed?
 
     def set_down(self, son):
         self._down = son
@@ -28,13 +28,13 @@ class Snode:
     def is_leaf(self,) -> bool:
         return self._up is None and self._down is None
 
-    def get_down(self):
+    def get_down(self) -> 'Snode':
         return self._down
 
-    def get_up(self):
+    def get_up(self) -> 'Snode':
         return self._up
 
-    def __str__(self):
+    def __str__(self) -> str:
         if self.is_leaf():
             num = 0
             for i in np.unique(self._y):
@@ -8,6 +8,7 @@ Uses LinearSVC
 '''
 
 import numpy as np
+import typing
 from sklearn.svm import LinearSVC
 
 from trees.Snode import Snode
@@ -15,45 +16,50 @@ from trees.Snode import Snode
 class Stree:
     """
     """
-    def __init__(self, max_iter: int=1000, random_state: int=0):
+    def __init__(self, max_iter: int=1000, random_state: int=0, use_predictions: bool=False):
         self._max_iter = max_iter
         self._random_state = random_state
         self._outcomes = None
         self._tree = None
         self.__folder = 'data/'
+        self.__use_predictions = use_predictions
 
     def _split_data(self, clf: LinearSVC, X: np.ndarray, y: np.ndarray) -> list:
-        # doesn't work with multiclass as each sample has to do an inner product with its own coefficients
-        # computes the position of every sample w.r.t. the hyperplane
-        coef = clf.coef_[0, :].reshape(-1, X.shape[1])
-        intercept = clf.intercept_[0]
-        res = X.dot(coef.T) + intercept
-        down = res > 0
+        if self.__use_predictions:
+            yp = clf.predict(X)
+            down = (yp == 1).reshape(-1, 1)
+        else:
+            # doesn't work with multiclass as each sample has to do an inner product with its own coefficients
+            # computes the position of every sample w.r.t. the hyperplane
+            coef = clf.coef_[0, :].reshape(-1, X.shape[1])
+            intercept = clf.intercept_[0]
+            res = X.dot(coef.T) + intercept
+            down = res > 0
         up = ~down
         X_down = X[down[:, 0]] if any(down) else None
        y_down = y[down[:, 0]] if any(down) else None
         X_up = X[up[:, 0]] if any(up) else None
         y_up = y[up[:, 0]] if any(up) else None
-        return X_up, y_up, X_down, y_down
+        return [X_up, y_up, X_down, y_down]
 
-    def fit(self, X: np.ndarray, y: np.ndarray, title: str = 'root') -> list:
+    def fit(self, X: np.ndarray, y: np.ndarray, title: str = 'root') -> 'Stree':
         self._tree = self.train(X, y, title)
         return self
 
-    def train(self: Snode, X: np.ndarray, y: np.ndarray, title: str='') -> list:
+    def train(self, X: np.ndarray, y: np.ndarray, title: str = 'root') -> Snode:
         if np.unique(y).shape[0] == 1:
-            # only 1 class => pure dataset
-            return Snode(np.array([]), 0, X, y, title + f', <pure> class={np.unique(y)} items={y.shape[0]}')
+            # only 1 class => pure dataset
+            return Snode(np.array([]), 0, X, y, title + f', class={np.unique(y)}, items={y.shape[0]}, rest=0, <pure> ')
         # Train the model
         clf = LinearSVC(max_iter=self._max_iter, random_state=self._random_state)
         clf.fit(X, y)
         tree = Snode(clf.coef_, clf.intercept_, X, y, title)
         # plot_hyperplane(clf, X, y, title)
-        X_T, y_t, X_O, y_o = self._split_data(clf, X, y)
-        if X_T is None or X_O is None:
+        X_U, y_u, X_D, y_d = self._split_data(clf, X, y)
+        if X_U is None or X_D is None:
             # didn't split anything
-            return Snode(clf.coef_, clf.intercept_, X, y, title + f', <couldn\'t go any further> classes={np.unique(y)} items<0>={y[y==0].shape[0]} items<1>={y[y==1].shape[0]}')
-        tree.set_up(self.train(X_T, y_t, title + ' - Up'))
-        tree.set_down(self.train(X_O, y_o, title + ' - Down'))
+            return Snode(clf.coef_, clf.intercept_, X, y, title + f', classes={np.unique(y)}, items<0>={y[y==0].shape[0]}, items<1>={y[y==1].shape[0]}, <couldn\'t go any further>')
+        tree.set_up(self.train(X_U, y_u, title + ' - Up' + str(np.unique(y_u, return_counts=True))))
+        tree.set_down(self.train(X_D, y_d, title + ' - Down' + str(np.unique(y_d, return_counts=True))))
         return tree
 
     def _print_tree(self, tree: Snode):
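The comment in `_split_data` notes that the geometric branch is binary-only, since it uses a single row of `coef_`. For reference, a hedged sketch of what the decision looks like with a multiclass LinearSVC (illustration only; the commit does not implement this): with n_classes > 2, `coef_` has shape (n_classes, n_features) and `intercept_` has shape (n_classes,), one one-vs-rest hyperplane per class, so the predicted class is an argmax over per-class scores and a single side-of-the-hyperplane test no longer partitions the data in two.

import numpy as np
from sklearn.svm import LinearSVC

def multiclass_predict(clf: LinearSVC, X: np.ndarray) -> np.ndarray:
    # One score per class and sample (one-vs-rest), then take the argmax;
    # valid for n_classes > 2, where coef_ is (n_classes, n_features)
    scores = X.dot(clf.coef_.T) + clf.intercept_
    return clf.classes_[scores.argmax(axis=1)]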
@@ -67,4 +73,30 @@
         pointer = self._tree
         self._print_tree(pointer)
 
+    def _save_datasets(self, tree: Snode, catalog: typing.TextIO, number: int):
+        """Save the dataset of the node in a csv file
+
+        Arguments:
+            tree {Snode} -- node with data to save
+            catalog {typing.TextIO} -- catalog file the dataset is registered in
+            number {int} -- a number to make file names different
+        """
+        data = np.append(tree._X, tree._y.reshape(-1, 1), axis=1)
+        name = f"{self.__folder}dataset{number}.csv"
+        np.savetxt(name, data, delimiter=",")
+        catalog.write(f"{name}, - {str(tree)}\n")
+        if tree.is_leaf():
+            return
+        self._save_datasets(tree.get_down(), catalog, number + 1)
+        self._save_datasets(tree.get_up(), catalog, number + 2)
+
+    def get_catalog_name(self):
+        return self.__folder + "catalog.txt"
+
+    def save_sub_datasets(self):
+        """Save every dataset stored in the tree to check with a manual classifier
+        """
+        pointer = self._tree
+        with open(self.get_catalog_name(), 'w', encoding='utf-8') as catalog:
+            self._save_datasets(pointer, catalog, 1)
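One caveat worth flagging in `_save_datasets`: numbering the recursive calls with `number + 1` and `number + 2` can reuse a file name once a subtree is more than one level deep, and the catalog printed in test.ipynb shows exactly that, with data/dataset3.csv listed twice (once for root - Down - Down and once for root - Up). A sketch of a collision-free variant (an alternative, not the commit's code) that threads the next free number through the recursion:

    def _save_datasets(self, tree: Snode, catalog: typing.TextIO, number: int) -> int:
        # Save this node, then let each subtree consume as many numbers as
        # it needs and report back the next free one
        data = np.append(tree._X, tree._y.reshape(-1, 1), axis=1)
        name = f"{self.__folder}dataset{number}.csv"
        np.savetxt(name, data, delimiter=",")
        catalog.write(f"{name}, - {str(tree)}\n")
        if tree.is_leaf():
            return number + 1
        number = self._save_datasets(tree.get_down(), catalog, number + 1)
        return self._save_datasets(tree.get_up(), catalog, number)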