diff --git a/main.py b/main.py index d0f0944..82d227d 100644 --- a/main.py +++ b/main.py @@ -9,3 +9,4 @@ model = Stree(random_state=random_state) model.fit(X, y) print(model) model.save_sub_datasets() +print(f"Prediciting [{y[0]}] we have {model.predict(X[0, :].reshape(-1, X.shape[1]))}") diff --git a/test.ipynb b/test.ipynb index abcde1f..b19b4c2 100644 --- a/test.ipynb +++ b/test.ipynb @@ -9,6 +9,7 @@ "import numpy as np \n", "from sklearn.svm import LinearSVC\n", "from sklearn.datasets import make_classification\n", + "from trees.Stree import Stree\n", "\n", "random_state = 1\n", "X, y = make_classification(n_samples=1500, n_features=3, n_informative=3, \n", @@ -18,215 +19,20 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": {}, "outputs": [ { "output_type": "stream", "name": "stdout", - "text": "data/dataset1.csv - root\ndata/dataset2.csv - root - Down\ndata/dataset3.csv - root - Down - Down, classes=[0 1], items<0>=17, items<1>=691, LEAF accuracy=0.98\ndata/dataset4.csv - root - Down - Up\ndata/dataset5.csv - root - Down - Up - Down, classes=[0 1], items<0>=1, items<1>=3, LEAF accuracy=0.75\ndata/dataset6.csv - root - Down - Up - Up, class=[0], items=7, rest=0, LEAF accuracy=1.00\ndata/dataset3.csv - root - Up, classes=[0 1], items<0>=725, items<1>=56, LEAF accuracy=0.93\n" + "text": "Accuracy: 0.950667\n" } ], "source": [ - "!cat data/catalog.txt" + "clf = Stree(random_state=random_state, use_predictions=False)\n", + "clf.fit(X, y)\n", + "accuracy = clf.score(X, y)" ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "def readsub(name):\n", - " data = np.genfromtxt(name, delimiter=',')\n", - " data = np.array(data)\n", - " py = data[:, data.shape[1] - 1]\n", - " px = np.delete(data, data.shape[1] - 1, axis=1)\n", - " return px, py\n", - "def localiza(X, px):\n", - " enc = False\n", - " for i in range(X.shape[0]):\n", - " if all(X[i, :] == px):\n", - " enc = True\n", - " print(f\" i={i} - X[{i}, :]={X[i, :]} - px={px} - y[{i}]={y[i]}\")\n", - " print(\"Encontrado:\", enc)\n", - " " - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "px, py = readsub('data/dataset5.csv')\n", - "model = LinearSVC(random_state=1, max_iter=1000)\n", - "model.fit(px,py)\n", - "yp = model.predict(px)" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": "[1. 1. 1. 1.]\n[1. 1. 0. 1.]\n" - } - ], - "source": [ - "print(yp)\n", - "print(py)" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": "i=1132 - X[1132, :]=[-0.41453617 -0.38206564 0.54849331] - px=[-0.41453617 -0.38206564 0.54849331] - y[1132]=0\nEncontrado: True\n" - } - ], - "source": [ - "localiza(X, px[2, :])" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": "[LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n verbose=0), LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n verbose=0), LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n verbose=0), LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n verbose=0), LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n verbose=0)]\n" - } - ], - "source": [ - "from sklearn.svm import LinearSVC\n", - "\n", - "data = []\n", - "for i in range(5):\n", - " model = LinearSVC()\n", - " data.append(model)\n", - "\n", - "print(data)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": "4\n" - }, - { - "output_type": "error", - "ename": "NameError", - "evalue": "name 'gato' is not defined", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpato\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 7\u001b[0;31m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mgato\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m4\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;31mNameError\u001b[0m: name 'gato' is not defined" - ] - } - ], - "source": [ - "def pato(k):\n", - " def gato(m, u):\n", - " return m * u\n", - " return gato(k, k)\n", - "\n", - "print(pato(2))\n", - "print(gato(3,4))" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": "7\n" - } - ], - "source": [ - "try:\n", - " a= max(5,3)/min(0,1)\n", - "except:\n", - " a=7\n", - "print(a)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "output_type": "error", - "ename": "SyntaxError", - "evalue": "invalid syntax (, line 1)", - "traceback": [ - "\u001b[0;36m File \u001b[0;32m\"\"\u001b[0;36m, line \u001b[0;32m1\u001b[0m\n\u001b[0;31m max([2 5])\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m invalid syntax\n" - ] - } - ], - "source": [ - "max([2 5])" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "y=[1,2,4,5,5,5,5,3,3,3,2,]" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "a,b = np.unique(y, return_counts=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": "11" - }, - "metadata": {}, - "execution_count": 12 - } - ], - "source": [ - "np.count_nonzero(y)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [] } ], "metadata": { diff --git a/tests/Snode_test.py b/tests/Snode_test.py index 34c1620..3e57830 100644 --- a/tests/Snode_test.py +++ b/tests/Snode_test.py @@ -11,9 +11,9 @@ class Snode_test(unittest.TestCase): def __init__(self, *args, **kwargs): self._random_state = 1 - self._model = Stree(random_state=self._random_state, + self._clf = Stree(random_state=self._random_state, use_predictions=True) - self._model.fit(*self._get_Xy()) + self._clf.fit(*self._get_Xy()) super(Snode_test, self).__init__(*args, **kwargs) def _get_Xy(self): @@ -42,4 +42,4 @@ class Snode_test(unittest.TestCase): return check_leave(node.get_down()) check_leave(node.get_up()) - check_leave(self._model._tree) + check_leave(self._clf._tree) diff --git a/tests/Stree_test.py b/tests/Stree_test.py index e56bb9a..489487e 100644 --- a/tests/Stree_test.py +++ b/tests/Stree_test.py @@ -11,9 +11,9 @@ class Stree_test(unittest.TestCase): def __init__(self, *args, **kwargs): self._random_state = 1 - self._model = Stree(random_state=self._random_state, - use_predictions=True) - self._model.fit(*self._get_Xy()) + self._clf = Stree(random_state=self._random_state, + use_predictions=False) + self._clf.fit(*self._get_Xy()) super(Stree_test, self).__init__(*args, **kwargs) def _get_Xy(self): @@ -25,7 +25,7 @@ class Stree_test(unittest.TestCase): def _check_tree(self, node: Snode): if node.is_leaf(): return - y_prediction = node._model.predict(node._X) + y_prediction = node._clf.predict(node._X) y_down = node.get_down()._y y_up = node.get_up()._y # Is a correct partition in terms of cadinality? @@ -55,7 +55,7 @@ class Stree_test(unittest.TestCase): def test_build_tree(self): """Check if the tree is built the same way as predictions of models """ - self._check_tree(self._model._tree) + self._check_tree(self._clf._tree) def _get_file_data(self, file_name: str) -> tuple: """Return X, y from data, y is the last column in array @@ -94,14 +94,32 @@ class Stree_test(unittest.TestCase): def test_subdatasets(self): """Check if the subdatasets files have the same predictions as the tree itself """ - model = self._model._tree._model + model = self._clf._tree._clf X, y = self._get_Xy() model.fit(X, y) - self._model.save_sub_datasets() - with open(self._model.get_catalog_name()) as cat_file: + self._clf.save_sub_datasets() + with open(self._clf.get_catalog_name()) as cat_file: catalog = csv.reader(cat_file, delimiter=',') for row in catalog: X, y = self._get_Xy() x_file, y_file = self._get_file_data(row[0]) y_original = np.array(self._find_out(x_file, X, y), dtype=int) self.assertTrue(np.array_equal(y_file, y_original)) + + def test_single_prediction(self): + X, y = self._get_Xy() + yp = self._clf.predict((X[0, :].reshape(-1, X.shape[1]))) + self.assertEqual(yp[0], y[0]) + + def test_multiple_prediction(self): + X, y = self._get_Xy() + yp = self._clf.predict(X[:23, :]) + self.assertListEqual(y[:23].tolist(), yp.tolist()) + + def test_score(self): + X, y = self._get_Xy() + accuracy_score = self._clf.score(X, y, print_out=False) + yp = self._clf.predict(X) + right = (yp == y).astype(int) + accuracy_computed = sum(right) / len(y) + self.assertEqual(accuracy_score, accuracy_computed) diff --git a/trees/Snode.py b/trees/Snode.py index 70f0070..38b7a81 100644 --- a/trees/Snode.py +++ b/trees/Snode.py @@ -2,7 +2,7 @@ __author__ = "Ricardo Montañana Gómez" __copyright__ = "Copyright 2020, Ricardo Montañana Gómez" __license__ = "MIT" -__version__ = "1.0" +__version__ = "0.9" Node of the Stree (binary tree) ''' @@ -11,10 +11,10 @@ from sklearn.svm import LinearSVC class Snode: - def __init__(self, model: LinearSVC, X: np.ndarray, y: np.ndarray, title: str): - self._model = model - self._vector = None if model is None else model.coef_ - self._interceptor = 0 if model is None else model.intercept_ + def __init__(self, clf: LinearSVC, X: np.ndarray, y: np.ndarray, title: str): + self._clf = clf + self._vector = None if clf is None else clf.coef_ + self._interceptor = 0 if clf is None else clf.intercept_ self._title = title self._belief = 0 # belief of the prediction in a leaf node based on samples self._X = X @@ -60,6 +60,6 @@ class Snode: num = max(num, self._y[self._y == i].shape[0]) den = self._y.shape[0] accuracy = num / den if den != 0 else 1 - return f"{self._title} LEAF accuracy={accuracy:.2f}\n" + return f"{self._title} LEAF accuracy={accuracy:.2f}, belief={self._belief:.2f} class={self._class}\n" else: return f"{self._title}\n" diff --git a/trees/Stree.py b/trees/Stree.py index da40707..1b18802 100644 --- a/trees/Stree.py +++ b/trees/Stree.py @@ -2,8 +2,8 @@ __author__ = "Ricardo Montañana Gómez" __copyright__ = "Copyright 2020, Ricardo Montañana Gómez" __license__ = "MIT" -__version__ = "1.0" -Create a oblique tree classifier based on SVM Trees +__version__ = "0.9" +Build an oblique tree classifier based on SVM Trees Uses LinearSVC ''' @@ -25,6 +25,7 @@ class Stree: self._tree = None self.__folder = 'data/' self.__use_predictions = use_predictions + self.__trained = False def _split_data(self, clf: LinearSVC, X: np.ndarray, y: np.ndarray) -> list: if self.__use_predictions: @@ -46,10 +47,11 @@ class Stree: def fit(self, X: np.ndarray, y: np.ndarray, title: str = 'root') -> 'Stree': self._tree = self.train(X, y, title) - self._predictor() + self._build_predictor() + self.__trained = True return self - def _predictor(self): + def _build_predictor(self): """Process the leaves to make them predictors """ def run_tree(node: Snode): @@ -79,6 +81,28 @@ class Stree: str(np.unique(y_d, return_counts=True)))) return tree + def predict(self, X: np.array) -> np.array: + def predict_class(xp: np.array, tree: Snode) -> np.array: + if tree.is_leaf(): + return tree._class + coef = tree._vector[0, :].reshape(-1, xp.shape[1]) + if xp.dot(coef.T) + tree._interceptor[0] > 0: + return predict_class(xp, tree.get_down()) + return predict_class(xp, tree.get_up()) + y = np.array([], dtype=int) + for xp in X: + y = np.append(y, predict_class(xp.reshape(-1, X.shape[1]), self._tree)) + return y + + def score(self, X: np.array, y: np.array, print_out=True) -> float: + self.fit(X, y) + yp = self.predict(X) + right = (yp == y).astype(int) + accuracy = sum(right) / len(y) + if print_out: + print(f"Accuracy: {accuracy:.6f}") + return accuracy + def __str__(self): def print_tree(tree: Snode) -> str: output = str(tree)