stree/tests/Stree_test.py

import unittest

from sklearn.datasets import make_classification
import os
import numpy as np
import csv

from trees.Stree import Stree, Snode


class Stree_test(unittest.TestCase):

    def __init__(self, *args, **kwargs):
        os.environ['TESTING'] = '1'
        self._random_state = 1
        self._clf = Stree(random_state=self._random_state,
                            use_predictions=False)
        self._clf.fit(*self._get_Xy())
        super(Stree_test, self).__init__(*args, **kwargs)

    @classmethod
    def tearDownClass(cls):
        try:
            os.environ.pop('TESTING')
        except:
            pass

    def _get_Xy(self):
        X, y = make_classification(n_samples=1500, n_features=3, n_informative=3,
                                   n_redundant=0, n_repeated=0, n_classes=2, n_clusters_per_class=2,
                                   class_sep=1.5, flip_y=0, weights=[0.5, 0.5], random_state=self._random_state)
        return X, y

    def _check_tree(self, node: Snode):
        if node.is_leaf():
            return
        y_prediction = node._clf.predict(node._X)
        y_down = node.get_down()._y
        y_up = node.get_up()._y
        # Is a correct partition in terms of cadinality?
        # i.e. The partition algorithm didn't forget any sample
        self.assertEqual(node._y.shape[0], y_down.shape[0] + y_up.shape[0])
        unique_y, count_y = np.unique(node._y, return_counts=True)
        _, count_d = np.unique(y_down, return_counts=True)
        _, count_u = np.unique(y_up, return_counts=True)
        for i in unique_y:
            try:
                number_down = count_d[i]
            except:
                number_down = 0
            try:
                number_up = count_u[i]
            except:
                number_up = 0
            self.assertEqual(count_y[i], number_down + number_up)
        # Is the partition made the same as the prediction?
        # as the node is not a leaf...
        unique_yp, count_yp = np.unique(y_prediction, return_counts=True)
        self.assertEqual(count_yp[1], y_down.shape[0])
        self.assertEqual(count_yp[0], y_up.shape[0])
        self._check_tree(node.get_down())
        self._check_tree(node.get_up())

    def test_build_tree(self):
        """Check if the tree is built the same way as predictions of models
        """
        self._check_tree(self._clf._tree)

    def _get_file_data(self, file_name: str) -> tuple:
        """Return X, y from data, y is the last column in array

        Arguments:
            file_name {str} -- the file name

        Returns:
            tuple -- tuple with samples, categories
        """
        data = np.genfromtxt(file_name, delimiter=',')
        data = np.array(data)
        column_y = data.shape[1] - 1
        fy = data[:, column_y]
        fx = np.delete(data, column_y, axis=1)
        return fx, fy

    def _find_out(self, px: np.array, x_original: np.array, y_original) -> list:
        """Find the original values of y for a given array of samples

        Arguments:
            px {np.array} -- array of samples to search for
            x_original {np.array} -- original dataset
            y_original {[type]} -- original classes

        Returns:
            np.array -- classes of the given samples
        """
        res = []
        for needle in px:
            for row in range(x_original.shape[0]):
                if all(x_original[row, :] == needle):
                    res.append(y_original[row])
        return res

    def test_subdatasets(self):
        """Check if the subdatasets files have the same predictions as the tree itself
        """
        model = self._clf._tree._clf
        X, y = self._get_Xy()
        model.fit(X, y)
        self._clf.save_sub_datasets()
        with open(self._clf.get_catalog_name()) as cat_file:
            catalog = csv.reader(cat_file, delimiter=',')
            for row in catalog:
                X, y = self._get_Xy()
                x_file, y_file = self._get_file_data(row[0])
                y_original = np.array(self._find_out(x_file, X, y), dtype=int)
                self.assertTrue(np.array_equal(y_file, y_original))

    def test_single_prediction(self):
        X, y = self._get_Xy()
        yp = self._clf.predict((X[0, :].reshape(-1, X.shape[1])))
        self.assertEqual(yp[0], y[0])

    def test_multiple_prediction(self):
        # First 27 elements the predictions are the same as the truth
        num = 27
        X, y = self._get_Xy()
        yp = self._clf.predict(X[:num, :])
        self.assertListEqual(y[:num].tolist(), yp.tolist())

    def test_score(self):
        X, y = self._get_Xy()
        accuracy_score = self._clf.score(X, y, print_out=False)
        yp = self._clf.predict(X)
        right = (yp == y).astype(int)
        accuracy_computed = sum(right) / len(y)
        self.assertEqual(accuracy_score, accuracy_computed)

    def test_single_predict_proba(self):
        # Element 28 has a different prediction than the truth
        X, y = self._get_Xy()
        yp = self._clf.predict_proba(X[28, :].reshape(-1, X.shape[1]))
        self.assertEqual(0, yp[0:, 0])
        self.assertEqual(0.9282970550576184, yp[0:, 1])

    def test_multiple_predict_proba(self):
        # First 27 elements the predictions are the same as the truth
        num = 27
        X, y = self._get_Xy()
        yp = self._clf.predict_proba(X[:num, :])
        self.assertListEqual(y[:num].tolist(), yp[:, 0].tolist())
        expected_proba = [0.9759887,  0.92829706, 0.9759887,  0.92829706, 0.92829706, 0.9759887,
                        0.92829706, 0.9759887,  0.9759887,  0.9759887,  0.9759887,  0.92829706,
                        0.92829706, 0.9759887,  0.92829706, 0.92829706, 0.92829706, 0.92829706,
                        0.9759887,  0.92829706, 0.9759887,  0.92829706, 0.92829706, 0.92829706,
                        0.92829706, 0.92829706, 0.9759887 ]
        self.assertListEqual(expected_proba, np.round(yp[:, 1], decimals=8).tolist())