From 371257c1211cdd59b8d5c95d65632f8d88f8c4a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?= Date: Tue, 12 May 2020 17:36:16 +0200 Subject: [PATCH] Implement split data with or without using predictions & some tests --- .gitignore | 5 ++ .vscode/settings.json | 12 ---- main.py | 1 + test.ipynb | 128 ++++++++++++++++++++++++++++++++++++++++++ tests/Stree_test.py | 105 ++++++++++++++++++++++++++++++++-- trees/Snode.py | 8 +-- trees/Stree.py | 70 ++++++++++++++++------- 7 files changed, 290 insertions(+), 39 deletions(-) delete mode 100644 .vscode/settings.json create mode 100644 test.ipynb diff --git a/.gitignore b/.gitignore index b6e4761..343d3fc 100644 --- a/.gitignore +++ b/.gitignore @@ -127,3 +127,8 @@ dmypy.json # Pyre type checker .pyre/ + +.idea +data/* + +.vscode \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json deleted file mode 100644 index e3694a1..0000000 --- a/.vscode/settings.json +++ /dev/null @@ -1,12 +0,0 @@ -{ - "python.testing.unittestArgs": [ - "-v", - "-s", - "./tests", - "-p", - "*_test.py" - ], - "python.testing.pytestEnabled": false, - "python.testing.nosetestsEnabled": false, - "python.testing.unittestEnabled": true -} \ No newline at end of file diff --git a/main.py b/main.py index 29f14dc..a88c6e5 100644 --- a/main.py +++ b/main.py @@ -8,3 +8,4 @@ X, y = make_classification(n_samples=1500, n_features=3, n_informative=3, model = Stree(random_state=random_state) model.fit(X, y) model.show_outcomes() +model.save_sub_datasets() \ No newline at end of file diff --git a/test.ipynb b/test.ipynb new file mode 100644 index 0000000..dc1d2c8 --- /dev/null +++ b/test.ipynb @@ -0,0 +1,128 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np \n", + "from sklearn.svm import LinearSVC\n", + "from sklearn.datasets import make_classification\n", + "\n", + "random_state = 1\n", + "X, y = make_classification(n_samples=1500, n_features=3, n_informative=3, \n", + " n_redundant=0, n_repeated=0, n_classes=2, n_clusters_per_class=2,\n", + " class_sep=1.5, flip_y=0,weights=[0.5,0.5], random_state=random_state)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "data/dataset1.csv - root\ndata/dataset2.csv - root - Down\ndata/dataset3.csv - root - Down - Down, classes=[0 1], items<0>=17, items<1>=691, LEAF accuracy=0.98\ndata/dataset4.csv - root - Down - Up\ndata/dataset5.csv - root - Down - Up - Down, classes=[0 1], items<0>=1, items<1>=3, LEAF accuracy=0.75\ndata/dataset6.csv - root - Down - Up - Up, class=[0], items=7, rest=0, LEAF accuracy=1.00\ndata/dataset3.csv - root - Up, classes=[0 1], items<0>=725, items<1>=56, LEAF accuracy=0.93\n" + } + ], + "source": [ + "!cat data/catalog.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "def readsub(name):\n", + " data = np.genfromtxt(name, delimiter=',')\n", + " data = np.array(data)\n", + " py = data[:, data.shape[1] - 1]\n", + " px = np.delete(data, data.shape[1] - 1, axis=1)\n", + " return px, py\n", + "def localiza(X, px):\n", + " enc = False\n", + " for i in range(X.shape[0]):\n", + " if all(X[i, :] == px):\n", + " enc = True\n", + " print(f\" i={i} - X[{i}, :]={X[i, :]} - px={px} - y[{i}]={y[i]}\")\n", + " print(\"Encontrado:\", enc)\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "px, py = readsub('data/dataset5.csv')\n", + "model = LinearSVC(random_state=1, max_iter=1000)\n", + "model.fit(px,py)\n", + "yp = model.predict(px)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "[1. 1. 1. 1.]\n[1. 1. 0. 1.]\n" + } + ], + "source": [ + "print(yp)\n", + "print(py)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "i=1132 - X[1132, :]=[-0.41453617 -0.38206564 0.54849331] - px=[-0.41453617 -0.38206564 0.54849331] - y[1132]=0\nEncontrado: True\n" + } + ], + "source": [ + "localiza(X, px[2, :])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6-final" + }, + "orig_nbformat": 2, + "kernelspec": { + "name": "python37664bitstreevenva9e4a4efdc1042b6b577bd15fbe145ee", + "display_name": "Python 3.7.6 64-bit ('stree': venv)" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/tests/Stree_test.py b/tests/Stree_test.py index c4210af..088d3fd 100644 --- a/tests/Stree_test.py +++ b/tests/Stree_test.py @@ -1,14 +1,111 @@ import unittest -from trees.Stree import Stree +from sklearn.svm import LinearSVC +from sklearn.datasets import make_classification +import numpy as np +import csv + +from trees.Stree import Stree, Snode class Stree_test(unittest.TestCase): - + def __init__(self, *args, **kwargs): - self.random_state = 17 - self._model = Stree(random_state=self.random_state) + self._random_state = 1 + self._model_tree = Stree(random_state=self._random_state, use_predictions=True) + self._model_tree.fit(*self._get_Xy()) + self._model_svm = LinearSVC(random_state=self._random_state, max_iter=self._model_tree._max_iter) super(Stree_test, self).__init__(*args, **kwargs) + + def _get_Xy(self): + X, y = make_classification(n_samples=1500, n_features=3, n_informative=3, + n_redundant=0, n_repeated=0, n_classes=2, n_clusters_per_class=2, + class_sep=1.5, flip_y=0,weights=[0.5,0.5], random_state=self._random_state) + return X, y def test_split_data(self): self.assertTrue(True) + def _check_tree(self, node: Snode): + if node.is_leaf(): + return + self._model_svm.fit(node._X, node._y) + y_prediction = self._model_svm.predict(node._X) + y_down = node.get_down()._y + y_up = node.get_up()._y + # Is a correct partition in terms of cadinality? + # i.e. The partition algorithm didn't forget any sample + self.assertEqual(node._y.shape[0], y_down.shape[0] + y_up.shape[0]) + unique_y, count_y = np.unique(node._y, return_counts=True) + _, count_d = np.unique(y_down, return_counts=True) + _, count_u = np.unique(y_up, return_counts=True) + for i in unique_y: + try: + number_down = count_d[i] + except: + number_down = 0 + try: + number_up = count_u[i] + except: + number_up = 0 + self.assertEqual(count_y[i], number_down + number_up) + # Is the partition made the same as the prediction? + # as the node is not a leaf... + unique_yp, count_yp = np.unique(y_prediction, return_counts=True) + self.assertEqual(count_yp[1], y_down.shape[0]) + self.assertEqual(count_yp[0], y_up.shape[0]) + self._check_tree(node.get_down()) + self._check_tree(node.get_up()) + + def test_build_tree(self): + """Check if the tree is built the same way as predictions of models + """ + self._check_tree(self._model_tree._tree) + + def _get_file_data(self, file_name: str) -> tuple: + """Return X, y from data, y is the last column in array + + Arguments: + file_name {str} -- the file name + + Returns: + tuple -- tuple with samples, categories + """ + data = np.genfromtxt(file_name, delimiter=',') + data = np.array(data) + column_y = data.shape[1] - 1 + fy = data[:, column_y] + fx = np.delete(data, column_y, axis=1) + return fx, fy + + def _find_out(self, px: np.array, x_original: np.array, y_original) -> list: + """Find the original values of y for a given array of samples + + Arguments: + px {np.array} -- array of samples to search for + x_original {np.array} -- original dataset + y_original {[type]} -- original classes + + Returns: + np.array -- classes of the given samples + """ + res = [] + for needle in px: + for row in range(x_original.shape[0]): + if all(x_original[row, :] == needle): + res.append(y_original[row]) + return res + + def test_subdatasets(self): + """Check if the subdatasets files have the same predictions as the tree itself + """ + model = LinearSVC(random_state=self._random_state, max_iter=self._model_tree._max_iter) + X, y = self._get_Xy() + model.fit(X, y) + self._model_tree.save_sub_datasets() + with open(self._model_tree.get_catalog_name()) as cat_file: + catalog = csv.reader(cat_file, delimiter=',') + for row in catalog: + X, y = self._get_Xy() + x_file, y_file = self._get_file_data(row[0]) + y_original = np.array(self._find_out(x_file, X, y), dtype=int) + self.assertTrue(np.array_equal(y_file, y_original)) diff --git a/trees/Snode.py b/trees/Snode.py index bb5ab35..2591c26 100644 --- a/trees/Snode.py +++ b/trees/Snode.py @@ -17,7 +17,7 @@ class Snode: self._y = y self._down = None self._up = None - self._class = None + self._class = None # really needed? def set_down(self, son): self._down = son @@ -28,13 +28,13 @@ class Snode: def is_leaf(self,) -> bool: return self._up is None and self._down is None - def get_down(self): + def get_down(self) -> 'Snode': return self._down - def get_up(self): + def get_up(self) -> 'Snode': return self._up - def __str__(self): + def __str__(self) -> str: if self.is_leaf(): num = 0 for i in np.unique(self._y): diff --git a/trees/Stree.py b/trees/Stree.py index 50b6efb..ab623eb 100644 --- a/trees/Stree.py +++ b/trees/Stree.py @@ -8,6 +8,7 @@ Uses LinearSVC ''' import numpy as np +import typing from sklearn.svm import LinearSVC from trees.Snode import Snode @@ -15,45 +16,50 @@ from trees.Snode import Snode class Stree: """ """ - def __init__(self, max_iter: int=1000, random_state: int=0): + def __init__(self, max_iter: int=1000, random_state: int=0, use_predictions: bool=False): self._max_iter = max_iter self._random_state = random_state self._outcomes = None self._tree = None + self.__folder = 'data/' + self.__use_predictions = use_predictions def _split_data(self, clf: LinearSVC, X: np.ndarray, y: np.ndarray) -> list: - # doesn't work with multiclass as each sample has to do inner product with its own coeficients - # computes positition of every sample is w.r.t. the hyperplane - coef = clf.coef_[0, :].reshape(-1, X.shape[1]) - intercept = clf.intercept_[0] - res = X.dot(coef.T) + intercept - down = res > 0 + if self.__use_predictions: + yp = clf.predict(X) + down = (yp == 1).reshape(-1, 1) + else: + # doesn't work with multiclass as each sample has to do inner product with its own coeficients + # computes positition of every sample is w.r.t. the hyperplane + coef = clf.coef_[0, :].reshape(-1, X.shape[1]) + intercept = clf.intercept_[0] + res = X.dot(coef.T) + intercept + down = res > 0 up = ~down X_down = X[down[:, 0]] if any(down) else None y_down = y[down[:, 0]] if any(down) else None X_up = X[up[:, 0]] if any(up) else None y_up = y[up[:, 0]] if any(up) else None - return X_up, y_up, X_down, y_down + return [X_up, y_up, X_down, y_down] - def fit(self, X: np.ndarray, y: np.ndarray, title: str = 'root') -> list: + def fit(self, X: np.ndarray, y: np.ndarray, title: str = 'root') -> 'Stree': self._tree = self.train(X, y, title) return self - - def train(self: Snode, X: np.ndarray, y: np.ndarray, title: str='') -> list: + + def train(self, X: np.ndarray, y: np.ndarray, title: str = 'root') -> Snode: if np.unique(y).shape[0] == 1: - # onlyt 1 class => pure dataset - return Snode(np.array([]), 0, X, y, title + f', class={np.unique(y)} items={y.shape[0]}') + # only 1 class => pure dataset + return Snode(np.array([]), 0, X, y, title + f', class={np.unique(y)}, items={y.shape[0]}, rest=0, ') # Train the model clf = LinearSVC(max_iter=self._max_iter, random_state=self._random_state) clf.fit(X, y) tree = Snode(clf.coef_, clf.intercept_, X, y, title) - #plot_hyperplane(clf, X, y, title) - X_T, y_t, X_O, y_o = self._split_data(clf, X, y) - if X_T is None or X_O is None: + X_U, y_u, X_D, y_d = self._split_data(clf, X, y) + if X_U is None or X_D is None: # didn't part anything - return Snode(clf.coef_, clf.intercept_, X, y, title + f', classes={np.unique(y)} items<0>={y[y==0].shape[0]} items<1>={y[y==1].shape[0]}') - tree.set_up( self.train(X_T, y_t, title + ' - Up')) - tree.set_down(self.train(X_O, y_o, title + ' - Down')) + return Snode(clf.coef_, clf.intercept_, X, y, title + f', classes={np.unique(y)}, items<0>={y[y==0].shape[0]}, items<1>={y[y==1].shape[0]}, ') + tree.set_up(self.train(X_U, y_u, title + ' - Up' + str(np.unique(y_u, return_counts=True)))) + tree.set_down(self.train(X_D, y_d, title + ' - Down' + str(np.unique(y_d, return_counts=True)))) return tree def _print_tree(self, tree: Snode): @@ -67,4 +73,30 @@ class Stree: pointer = self._tree self._print_tree(pointer) + def _save_datasets(self, tree: Snode, catalog: typing.TextIO, number: int): + """Save the dataset of the node in a csv file + + Arguments: + tree {Snode} -- node with data to save + number {int} -- a number to make different file names + """ + data = np.append(tree._X, tree._y.reshape(-1,1), axis=1) + name = f"{self.__folder}dataset{number}.csv" + np.savetxt(name, data, delimiter=",") + catalog.write(f"{name}, - {str(tree)}\n") + if tree.is_leaf(): + return + self._save_datasets(tree.get_down(), catalog, number + 1) + self._save_datasets(tree.get_up(), catalog, number + 2) + + def get_catalog_name(self): + return self.__folder + "catalog.txt" + + def save_sub_datasets(self): + """Save the every dataset stored in the tree to check with manual classifier + """ + pointer = self._tree + with open(self.get_catalog_name(), 'w', encoding = 'utf-8') as catalog: + self._save_datasets(pointer, catalog, 1) +