Implement predict and score methods & tests

This commit is contained in:
2020-05-13 12:42:09 +02:00
parent c4de782a3f
commit 8f71eeb316
6 changed files with 70 additions and 221 deletions

View File

@@ -9,3 +9,4 @@ model = Stree(random_state=random_state)
model.fit(X, y)
print(model)
model.save_sub_datasets()
# Sanity check: predict the first sample alone and show it next to its true label.
print(f"Predicting [{y[0]}] we have {model.predict(X[0, :].reshape(-1, X.shape[1]))}")

View File

@@ -9,6 +9,7 @@
"import numpy as np \n",
"from sklearn.svm import LinearSVC\n",
"from sklearn.datasets import make_classification\n",
"from trees.Stree import Stree\n",
"\n",
"random_state = 1\n",
"X, y = make_classification(n_samples=1500, n_features=3, n_informative=3, \n",
@@ -18,215 +19,20 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 2,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "data/dataset1.csv - root\ndata/dataset2.csv - root - Down\ndata/dataset3.csv - root - Down - Down, classes=[0 1], items<0>=17, items<1>=691, <couldn't go any further> LEAF accuracy=0.98\ndata/dataset4.csv - root - Down - Up\ndata/dataset5.csv - root - Down - Up - Down, classes=[0 1], items<0>=1, items<1>=3, <couldn't go any further> LEAF accuracy=0.75\ndata/dataset6.csv - root - Down - Up - Up, class=[0], items=7, rest=0, <pure> LEAF accuracy=1.00\ndata/dataset3.csv - root - Up, classes=[0 1], items<0>=725, items<1>=56, <couldn't go any further> LEAF accuracy=0.93\n"
"text": "Accuracy: 0.950667\n"
}
],
"source": [
"!cat data/catalog.txt"
"clf = Stree(random_state=random_state, use_predictions=False)\n",
"clf.fit(X, y)\n",
"accuracy = clf.score(X, y)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def readsub(name):\n",
" data = np.genfromtxt(name, delimiter=',')\n",
" data = np.array(data)\n",
" py = data[:, data.shape[1] - 1]\n",
" px = np.delete(data, data.shape[1] - 1, axis=1)\n",
" return px, py\n",
"def localiza(X, px):\n",
" enc = False\n",
" for i in range(X.shape[0]):\n",
" if all(X[i, :] == px):\n",
" enc = True\n",
" print(f\" i={i} - X[{i}, :]={X[i, :]} - px={px} - y[{i}]={y[i]}\")\n",
" print(\"Encontrado:\", enc)\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"px, py = readsub('data/dataset5.csv')\n",
"model = LinearSVC(random_state=1, max_iter=1000)\n",
"model.fit(px,py)\n",
"yp = model.predict(px)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "[1. 1. 1. 1.]\n[1. 1. 0. 1.]\n"
}
],
"source": [
"print(yp)\n",
"print(py)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "i=1132 - X[1132, :]=[-0.41453617 -0.38206564 0.54849331] - px=[-0.41453617 -0.38206564 0.54849331] - y[1132]=0\nEncontrado: True\n"
}
],
"source": [
"localiza(X, px[2, :])"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "[LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n verbose=0), LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n verbose=0), LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n verbose=0), LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n verbose=0), LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n verbose=0)]\n"
}
],
"source": [
"from sklearn.svm import LinearSVC\n",
"\n",
"data = []\n",
"for i in range(5):\n",
" model = LinearSVC()\n",
" data.append(model)\n",
"\n",
"print(data)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "4\n"
},
{
"output_type": "error",
"ename": "NameError",
"evalue": "name 'gato' is not defined",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-3-04351d05a6f0>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpato\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 7\u001b[0;31m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mgato\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m4\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;31mNameError\u001b[0m: name 'gato' is not defined"
]
}
],
"source": [
"def pato(k):\n",
" def gato(m, u):\n",
" return m * u\n",
" return gato(k, k)\n",
"\n",
"print(pato(2))\n",
"print(gato(3,4))"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "7\n"
}
],
"source": [
"try:\n",
" a= max(5,3)/min(0,1)\n",
"except:\n",
" a=7\n",
"print(a)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"output_type": "error",
"ename": "SyntaxError",
"evalue": "invalid syntax (<ipython-input-6-65e24c447a24>, line 1)",
"traceback": [
"\u001b[0;36m File \u001b[0;32m\"<ipython-input-6-65e24c447a24>\"\u001b[0;36m, line \u001b[0;32m1\u001b[0m\n\u001b[0;31m max([2 5])\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m invalid syntax\n"
]
}
],
"source": [
"max([2 5])"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"y=[1,2,4,5,5,5,5,3,3,3,2,]"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"a,b = np.unique(y, return_counts=True)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "11"
},
"metadata": {},
"execution_count": 12
}
],
"source": [
"np.count_nonzero(y)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": []
}
],
"metadata": {

View File

@@ -11,9 +11,9 @@ class Snode_test(unittest.TestCase):
def __init__(self, *args, **kwargs):
self._random_state = 1
self._model = Stree(random_state=self._random_state,
self._clf = Stree(random_state=self._random_state,
use_predictions=True)
self._model.fit(*self._get_Xy())
self._clf.fit(*self._get_Xy())
super(Snode_test, self).__init__(*args, **kwargs)
def _get_Xy(self):
@@ -42,4 +42,4 @@ class Snode_test(unittest.TestCase):
return
check_leave(node.get_down())
check_leave(node.get_up())
check_leave(self._model._tree)
check_leave(self._clf._tree)

View File

@@ -11,9 +11,9 @@ class Stree_test(unittest.TestCase):
def __init__(self, *args, **kwargs):
self._random_state = 1
self._model = Stree(random_state=self._random_state,
use_predictions=True)
self._model.fit(*self._get_Xy())
self._clf = Stree(random_state=self._random_state,
use_predictions=False)
self._clf.fit(*self._get_Xy())
super(Stree_test, self).__init__(*args, **kwargs)
def _get_Xy(self):
@@ -25,7 +25,7 @@ class Stree_test(unittest.TestCase):
def _check_tree(self, node: Snode):
if node.is_leaf():
return
y_prediction = node._model.predict(node._X)
y_prediction = node._clf.predict(node._X)
y_down = node.get_down()._y
y_up = node.get_up()._y
# Is a correct partition in terms of cardinality?
@@ -55,7 +55,7 @@ class Stree_test(unittest.TestCase):
def test_build_tree(self):
"""Check if the tree is built the same way as predictions of models
"""
self._check_tree(self._model._tree)
self._check_tree(self._clf._tree)
def _get_file_data(self, file_name: str) -> tuple:
"""Return X, y from data, y is the last column in array
@@ -94,14 +94,32 @@ class Stree_test(unittest.TestCase):
def test_subdatasets(self):
"""Check if the subdatasets files have the same predictions as the tree itself
"""
model = self._model._tree._model
model = self._clf._tree._clf
X, y = self._get_Xy()
model.fit(X, y)
self._model.save_sub_datasets()
with open(self._model.get_catalog_name()) as cat_file:
self._clf.save_sub_datasets()
with open(self._clf.get_catalog_name()) as cat_file:
catalog = csv.reader(cat_file, delimiter=',')
for row in catalog:
X, y = self._get_Xy()
x_file, y_file = self._get_file_data(row[0])
y_original = np.array(self._find_out(x_file, X, y), dtype=int)
self.assertTrue(np.array_equal(y_file, y_original))
def test_single_prediction(self):
    """Check that predicting only the first sample yields its own label
    """
    samples, labels = self._get_Xy()
    # predict() is fed a 2D array, so the single row is reshaped to (1, features)
    first_sample = samples[0, :].reshape(-1, samples.shape[1])
    predicted = self._clf.predict(first_sample)
    self.assertEqual(predicted[0], labels[0])
def test_multiple_prediction(self):
    """Check that the first 23 samples are predicted with their own labels
    """
    samples, labels = self._get_Xy()
    predicted = self._clf.predict(samples[:23, :])
    expected = labels[:23].tolist()
    self.assertListEqual(expected, predicted.tolist())
def test_score(self):
    """Check that score() reports the same accuracy as a manual hit count
    """
    samples, labels = self._get_Xy()
    # score() is called before predict(), keeping the original call order
    reported = self._clf.score(samples, labels, print_out=False)
    predicted = self._clf.predict(samples)
    hits = (predicted == labels).astype(int)
    expected = sum(hits) / len(labels)
    self.assertEqual(reported, expected)

View File

@@ -2,7 +2,7 @@
__author__ = "Ricardo Montañana Gómez"
__copyright__ = "Copyright 2020, Ricardo Montañana Gómez"
__license__ = "MIT"
__version__ = "1.0"
__version__ = "0.9"
Node of the Stree (binary tree)
'''
@@ -11,10 +11,10 @@ from sklearn.svm import LinearSVC
class Snode:
def __init__(self, model: LinearSVC, X: np.ndarray, y: np.ndarray, title: str):
self._model = model
self._vector = None if model is None else model.coef_
self._interceptor = 0 if model is None else model.intercept_
def __init__(self, clf: LinearSVC, X: np.ndarray, y: np.ndarray, title: str):
self._clf = clf
self._vector = None if clf is None else clf.coef_
self._interceptor = 0 if clf is None else clf.intercept_
self._title = title
self._belief = 0 # belief of the prediction in a leaf node based on samples
self._X = X
@@ -60,6 +60,6 @@ class Snode:
num = max(num, self._y[self._y == i].shape[0])
den = self._y.shape[0]
accuracy = num / den if den != 0 else 1
return f"{self._title} LEAF accuracy={accuracy:.2f}\n"
return f"{self._title} LEAF accuracy={accuracy:.2f}, belief={self._belief:.2f} class={self._class}\n"
else:
return f"{self._title}\n"

View File

@@ -2,8 +2,8 @@
__author__ = "Ricardo Montañana Gómez"
__copyright__ = "Copyright 2020, Ricardo Montañana Gómez"
__license__ = "MIT"
__version__ = "1.0"
Create a oblique tree classifier based on SVM Trees
__version__ = "0.9"
Build an oblique tree classifier based on SVM Trees
Uses LinearSVC
'''
@@ -25,6 +25,7 @@ class Stree:
self._tree = None
self.__folder = 'data/'
self.__use_predictions = use_predictions
self.__trained = False
def _split_data(self, clf: LinearSVC, X: np.ndarray, y: np.ndarray) -> list:
if self.__use_predictions:
@@ -46,10 +47,11 @@ class Stree:
def fit(self, X: np.ndarray, y: np.ndarray, title: str = 'root') -> 'Stree':
self._tree = self.train(X, y, title)
self._predictor()
self._build_predictor()
self.__trained = True
return self
def _predictor(self):
def _build_predictor(self):
"""Process the leaves to make them predictors
"""
def run_tree(node: Snode):
@@ -79,6 +81,28 @@ class Stree:
str(np.unique(y_d, return_counts=True))))
return tree
def predict(self, X: np.ndarray) -> np.ndarray:
    """Predict the class of every sample in X by walking the tree

    At each internal node the sample is evaluated against the node's
    hyperplane (_vector and _interceptor): a positive value follows the
    "down" child, anything else follows the "up" child, until a leaf is
    reached and its _class is taken as the prediction.

    :param X: samples to classify, one row per sample
    :return: one predicted label per row of X, in the same order
    """
    def predict_class(xp: np.ndarray):
        # Iterative descent from the root, no recursion needed
        node = self._tree
        while not node.is_leaf():
            coef = node._vector[0, :].reshape(-1, xp.shape[1])
            if xp.dot(coef.T) + node._interceptor[0] > 0:
                node = node.get_down()
            else:
                node = node.get_up()
        return node._class

    labels = [predict_class(xp.reshape(-1, X.shape[1])) for xp in X]
    y = np.array([], dtype=int)
    # np.append copies the whole array on every call, so append once for
    # all samples instead of once per sample; with no samples the empty
    # integer array is returned untouched
    return np.append(y, labels) if labels else y
def score(self, X: np.ndarray, y: np.ndarray, print_out=True) -> float:
    """Return the accuracy of the tree predictions for X against y

    The tree is fitted with (X, y) only when no tree has been built yet.
    An already fitted tree is evaluated as it is, so scoring never
    replaces a trained model nor retrains it on the evaluation data.

    :param X: samples to evaluate, one row per sample
    :param y: expected label of every sample in X
    :param print_out: print the accuracy to stdout if True
    :return: fraction of samples whose prediction matches y
    """
    # _tree stays None until fit() builds the tree
    if self._tree is None:
        self.fit(X, y)
    yp = self.predict(X)
    accuracy = np.count_nonzero(yp == y) / len(y)
    if print_out:
        print(f"Accuracy: {accuracy:.6f}")
    return accuracy
def __str__(self):
def print_tree(tree: Snode) -> str:
output = str(tree)