Implement split data with or without using predictions & some tests

This commit is contained in:
2020-05-12 17:36:16 +02:00
parent b9ea24696f
commit 371257c121
7 changed files with 290 additions and 39 deletions

5
.gitignore vendored
View File

@@ -127,3 +127,8 @@ dmypy.json
# Pyre type checker # Pyre type checker
.pyre/ .pyre/
.idea
data/*
.vscode

12
.vscode/settings.json vendored
View File

@@ -1,12 +0,0 @@
{
"python.testing.unittestArgs": [
"-v",
"-s",
"./tests",
"-p",
"*_test.py"
],
"python.testing.pytestEnabled": false,
"python.testing.nosetestsEnabled": false,
"python.testing.unittestEnabled": true
}

View File

@@ -8,3 +8,4 @@ X, y = make_classification(n_samples=1500, n_features=3, n_informative=3,
model = Stree(random_state=random_state) model = Stree(random_state=random_state)
model.fit(X, y) model.fit(X, y)
model.show_outcomes() model.show_outcomes()
model.save_sub_datasets()

128
test.ipynb Normal file
View File

@@ -0,0 +1,128 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np \n",
"from sklearn.svm import LinearSVC\n",
"from sklearn.datasets import make_classification\n",
"\n",
"random_state = 1\n",
"X, y = make_classification(n_samples=1500, n_features=3, n_informative=3, \n",
" n_redundant=0, n_repeated=0, n_classes=2, n_clusters_per_class=2,\n",
" class_sep=1.5, flip_y=0,weights=[0.5,0.5], random_state=random_state)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "data/dataset1.csv - root\ndata/dataset2.csv - root - Down\ndata/dataset3.csv - root - Down - Down, classes=[0 1], items<0>=17, items<1>=691, <couldn't go any further> LEAF accuracy=0.98\ndata/dataset4.csv - root - Down - Up\ndata/dataset5.csv - root - Down - Up - Down, classes=[0 1], items<0>=1, items<1>=3, <couldn't go any further> LEAF accuracy=0.75\ndata/dataset6.csv - root - Down - Up - Up, class=[0], items=7, rest=0, <pure> LEAF accuracy=1.00\ndata/dataset3.csv - root - Up, classes=[0 1], items<0>=725, items<1>=56, <couldn't go any further> LEAF accuracy=0.93\n"
}
],
"source": [
"!cat data/catalog.txt"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def readsub(name):\n",
" data = np.genfromtxt(name, delimiter=',')\n",
" data = np.array(data)\n",
" py = data[:, data.shape[1] - 1]\n",
" px = np.delete(data, data.shape[1] - 1, axis=1)\n",
" return px, py\n",
"def localiza(X, px):\n",
" enc = False\n",
" for i in range(X.shape[0]):\n",
" if all(X[i, :] == px):\n",
" enc = True\n",
" print(f\" i={i} - X[{i}, :]={X[i, :]} - px={px} - y[{i}]={y[i]}\")\n",
" print(\"Encontrado:\", enc)\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"px, py = readsub('data/dataset5.csv')\n",
"model = LinearSVC(random_state=1, max_iter=1000)\n",
"model.fit(px,py)\n",
"yp = model.predict(px)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "[1. 1. 1. 1.]\n[1. 1. 0. 1.]\n"
}
],
"source": [
"print(yp)\n",
"print(py)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "i=1132 - X[1132, :]=[-0.41453617 -0.38206564 0.54849331] - px=[-0.41453617 -0.38206564 0.54849331] - y[1132]=0\nEncontrado: True\n"
}
],
"source": [
"localiza(X, px[2, :])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6-final"
},
"orig_nbformat": 2,
"kernelspec": {
"name": "python37664bitstreevenva9e4a4efdc1042b6b577bd15fbe145ee",
"display_name": "Python 3.7.6 64-bit ('stree': venv)"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -1,14 +1,111 @@
import unittest import unittest
from trees.Stree import Stree from sklearn.svm import LinearSVC
from sklearn.datasets import make_classification
import numpy as np
import csv
from trees.Stree import Stree, Snode
class Stree_test(unittest.TestCase): class Stree_test(unittest.TestCase):
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
self.random_state = 17 self._random_state = 1
self._model = Stree(random_state=self.random_state) self._model_tree = Stree(random_state=self._random_state, use_predictions=True)
self._model_tree.fit(*self._get_Xy())
self._model_svm = LinearSVC(random_state=self._random_state, max_iter=self._model_tree._max_iter)
super(Stree_test, self).__init__(*args, **kwargs) super(Stree_test, self).__init__(*args, **kwargs)
def _get_Xy(self):
    """Build the synthetic two-class dataset used by the tests

    Returns:
        tuple -- samples and labels as produced by make_classification
    """
    settings = dict(
        n_samples=1500, n_features=3, n_informative=3,
        n_redundant=0, n_repeated=0, n_classes=2, n_clusters_per_class=2,
        class_sep=1.5, flip_y=0, weights=[0.5, 0.5],
        # Same seed as the model so every call yields the same data
        random_state=self._random_state,
    )
    samples, labels = make_classification(**settings)
    return samples, labels
def test_split_data(self): def test_split_data(self):
self.assertTrue(True) self.assertTrue(True)
def _check_tree(self, node: Snode):
    """Recursively check that every split of the tree matches a fresh SVM fit

    Leaves are skipped; for any other node the model is trained on the node
    data and the resulting partition is compared with the node's children.

    Arguments:
        node {Snode} -- root of the (sub)tree to verify
    """
    if node.is_leaf():
        return
    self._model_svm.fit(node._X, node._y)
    y_prediction = self._model_svm.predict(node._X)
    y_down = node.get_down()._y
    y_up = node.get_up()._y
    # Is a correct partition in terms of cardinality?
    # i.e. The partition algorithm didn't forget any sample
    self.assertEqual(node._y.shape[0], y_down.shape[0] + y_up.shape[0])
    # np.unique returns the counts positionally, so they are keyed here by
    # label: a child holding a single class cannot be indexed by class value
    totals = dict(zip(*np.unique(node._y, return_counts=True)))
    in_down = dict(zip(*np.unique(y_down, return_counts=True)))
    in_up = dict(zip(*np.unique(y_up, return_counts=True)))
    for label, total in totals.items():
        self.assertEqual(total, in_down.get(label, 0) + in_up.get(label, 0))
    # Is the partition made the same as the prediction?
    # as the node is not a leaf...
    # Samples predicted as class 1 go down, every other sample goes up
    self.assertEqual(np.count_nonzero(y_prediction == 1), y_down.shape[0])
    self.assertEqual(np.count_nonzero(y_prediction != 1), y_up.shape[0])
    self._check_tree(node.get_down())
    self._check_tree(node.get_up())
def test_build_tree(self):
    """Walk the fitted tree from its root checking each split against the model predictions
    """
    root = self._model_tree._tree
    self._check_tree(root)
def _get_file_data(self, file_name: str) -> tuple:
"""Return X, y from data, y is the last column in array
Arguments:
file_name {str} -- the file name
Returns:
tuple -- tuple with samples, categories
"""
data = np.genfromtxt(file_name, delimiter=',')
data = np.array(data)
column_y = data.shape[1] - 1
fy = data[:, column_y]
fx = np.delete(data, column_y, axis=1)
return fx, fy
def _find_out(self, px: np.array, x_original: np.array, y_original) -> list:
"""Find the original values of y for a given array of samples
Arguments:
px {np.array} -- array of samples to search for
x_original {np.array} -- original dataset
y_original {[type]} -- original classes
Returns:
np.array -- classes of the given samples
"""
res = []
for needle in px:
for row in range(x_original.shape[0]):
if all(x_original[row, :] == needle):
res.append(y_original[row])
return res
def test_subdatasets(self):
    """Check if the subdatasets files keep the classes of the original dataset
    """
    # The dataset is generated from a fixed seed, so it is built only once
    X, y = self._get_Xy()
    self._model_tree.save_sub_datasets()
    # The catalog is written as utf-8, read it back the same way
    with open(self._model_tree.get_catalog_name(), encoding='utf-8') as cat_file:
        catalog = csv.reader(cat_file, delimiter=',')
        for row in catalog:
            # First field of each catalog line is the dataset file name
            x_file, y_file = self._get_file_data(row[0])
            y_original = np.array(self._find_out(x_file, X, y), dtype=int)
            self.assertTrue(np.array_equal(y_file, y_original))

View File

@@ -17,7 +17,7 @@ class Snode:
self._y = y self._y = y
self._down = None self._down = None
self._up = None self._up = None
self._class = None self._class = None # really needed?
def set_down(self, son): def set_down(self, son):
self._down = son self._down = son
@@ -28,13 +28,13 @@ class Snode:
def is_leaf(self,) -> bool: def is_leaf(self,) -> bool:
return self._up is None and self._down is None return self._up is None and self._down is None
def get_down(self): def get_down(self) -> 'Snode':
return self._down return self._down
def get_up(self): def get_up(self) -> 'Snode':
return self._up return self._up
def __str__(self): def __str__(self) -> str:
if self.is_leaf(): if self.is_leaf():
num = 0 num = 0
for i in np.unique(self._y): for i in np.unique(self._y):

View File

@@ -8,6 +8,7 @@ Uses LinearSVC
''' '''
import numpy as np import numpy as np
import typing
from sklearn.svm import LinearSVC from sklearn.svm import LinearSVC
from trees.Snode import Snode from trees.Snode import Snode
@@ -15,45 +16,50 @@ from trees.Snode import Snode
class Stree: class Stree:
""" """
""" """
def __init__(self, max_iter: int=1000, random_state: int=0): def __init__(self, max_iter: int=1000, random_state: int=0, use_predictions: bool=False):
self._max_iter = max_iter self._max_iter = max_iter
self._random_state = random_state self._random_state = random_state
self._outcomes = None self._outcomes = None
self._tree = None self._tree = None
self.__folder = 'data/'
self.__use_predictions = use_predictions
def _split_data(self, clf: LinearSVC, X: np.ndarray, y: np.ndarray) -> list: def _split_data(self, clf: LinearSVC, X: np.ndarray, y: np.ndarray) -> list:
# doesn't work with multiclass as each sample has to do inner product with its own coeficients if self.__use_predictions:
# computes positition of every sample is w.r.t. the hyperplane yp = clf.predict(X)
coef = clf.coef_[0, :].reshape(-1, X.shape[1]) down = (yp == 1).reshape(-1, 1)
intercept = clf.intercept_[0] else:
res = X.dot(coef.T) + intercept # doesn't work with multiclass as each sample has to do inner product with its own coeficients
down = res > 0 # computes positition of every sample is w.r.t. the hyperplane
coef = clf.coef_[0, :].reshape(-1, X.shape[1])
intercept = clf.intercept_[0]
res = X.dot(coef.T) + intercept
down = res > 0
up = ~down up = ~down
X_down = X[down[:, 0]] if any(down) else None X_down = X[down[:, 0]] if any(down) else None
y_down = y[down[:, 0]] if any(down) else None y_down = y[down[:, 0]] if any(down) else None
X_up = X[up[:, 0]] if any(up) else None X_up = X[up[:, 0]] if any(up) else None
y_up = y[up[:, 0]] if any(up) else None y_up = y[up[:, 0]] if any(up) else None
return X_up, y_up, X_down, y_down return [X_up, y_up, X_down, y_down]
def fit(self, X: np.ndarray, y: np.ndarray, title: str = 'root') -> list: def fit(self, X: np.ndarray, y: np.ndarray, title: str = 'root') -> 'Stree':
self._tree = self.train(X, y, title) self._tree = self.train(X, y, title)
return self return self
def train(self: Snode, X: np.ndarray, y: np.ndarray, title: str='') -> list: def train(self, X: np.ndarray, y: np.ndarray, title: str = 'root') -> Snode:
if np.unique(y).shape[0] == 1: if np.unique(y).shape[0] == 1:
# onlyt 1 class => pure dataset # only 1 class => pure dataset
return Snode(np.array([]), 0, X, y, title + f', <pure> class={np.unique(y)} items={y.shape[0]}') return Snode(np.array([]), 0, X, y, title + f', class={np.unique(y)}, items={y.shape[0]}, rest=0, <pure> ')
# Train the model # Train the model
clf = LinearSVC(max_iter=self._max_iter, random_state=self._random_state) clf = LinearSVC(max_iter=self._max_iter, random_state=self._random_state)
clf.fit(X, y) clf.fit(X, y)
tree = Snode(clf.coef_, clf.intercept_, X, y, title) tree = Snode(clf.coef_, clf.intercept_, X, y, title)
#plot_hyperplane(clf, X, y, title) X_U, y_u, X_D, y_d = self._split_data(clf, X, y)
X_T, y_t, X_O, y_o = self._split_data(clf, X, y) if X_U is None or X_D is None:
if X_T is None or X_O is None:
# didn't part anything # didn't part anything
return Snode(clf.coef_, clf.intercept_, X, y, title + f', <couldn\'t go any further> classes={np.unique(y)} items<0>={y[y==0].shape[0]} items<1>={y[y==1].shape[0]}') return Snode(clf.coef_, clf.intercept_, X, y, title + f', classes={np.unique(y)}, items<0>={y[y==0].shape[0]}, items<1>={y[y==1].shape[0]}, <couldn\'t go any further>')
tree.set_up( self.train(X_T, y_t, title + ' - Up')) tree.set_up(self.train(X_U, y_u, title + ' - Up' + str(np.unique(y_u, return_counts=True))))
tree.set_down(self.train(X_O, y_o, title + ' - Down')) tree.set_down(self.train(X_D, y_d, title + ' - Down' + str(np.unique(y_d, return_counts=True))))
return tree return tree
def _print_tree(self, tree: Snode): def _print_tree(self, tree: Snode):
@@ -67,4 +73,30 @@ class Stree:
pointer = self._tree pointer = self._tree
self._print_tree(pointer) self._print_tree(pointer)
def _save_datasets(self, tree: Snode, catalog: typing.TextIO, number: int) -> int:
    """Save the dataset of the node and of all its descendants in csv files

    Arguments:
        tree {Snode} -- node with data to save
        catalog {typing.TextIO} -- open text file where a line per saved dataset is written
        number {int} -- a number to make different file names

    Returns:
        int -- the highest number used in this subtree
    """
    data = np.append(tree._X, tree._y.reshape(-1, 1), axis=1)
    name = f"{self.__folder}dataset{number}.csv"
    np.savetxt(name, data, delimiter=",")
    catalog.write(f"{name}, - {str(tree)}\n")
    if tree.is_leaf():
        return number
    # Continue numbering after the last file written by the down branch,
    # otherwise nodes of both branches end up sharing file names
    last_used = self._save_datasets(tree.get_down(), catalog, number + 1)
    return self._save_datasets(tree.get_up(), catalog, last_used + 1)
def get_catalog_name(self) -> str:
    """Return the name of the catalog file located in the datasets folder
    """
    return f"{self.__folder}catalog.txt"
def save_sub_datasets(self):
    """Save every dataset stored in the tree to check with a manual classifier

    A csv file is written per node along with a catalog file that lists them.
    """
    import os
    # Writing fails if the output folder is missing, so create it first
    os.makedirs(self.__folder, exist_ok=True)
    with open(self.get_catalog_name(), 'w', encoding='utf-8') as catalog:
        self._save_datasets(self._tree, catalog, 1)