Fix _min_distance: return each sample's own lowest-magnitude distance (np.take without an axis indexed the flattened array)

Remove grapher (moved to another repo)
This commit is contained in:
2020-06-12 00:50:25 +02:00
parent 647d21bdb5
commit 1bfe273a70
11 changed files with 147 additions and 846 deletions

File diff suppressed because one or more lines are too long

View File

@@ -1,5 +1,4 @@
numpy
scikit-learn
pandas
matplotlib
ipympl

View File

@@ -30,7 +30,7 @@ setuptools.setup(
"Topic :: Scientific/Engineering :: Artificial Intelligence",
"Intended Audience :: Science/Research",
],
install_requires=["scikit-learn>=0.23.0", "numpy", "matplotlib", "ipympl"],
install_requires=["scikit-learn>=0.23.0", "numpy", "ipympl"],
test_suite="stree.tests",
zip_safe=False,
)

View File

@@ -96,9 +96,6 @@ class Siterator:
self._stack = []
self._push(tree)
def __iter__(self):
return self
def _push(self, node: Snode):
if node is not None:
self._stack.append(node)
@@ -184,7 +181,9 @@ class Stree(BaseEstimator, ClassifierMixin):
def _min_distance(self, data: np.array, _) -> np.array:
    """Pick, for every sample, the distance with the smallest magnitude.

    :param data: 2-D array of signed distances, one row per sample
    :type data: np.array
    :param _: unused positional argument
    :return: 1-D array holding one (signed) value per row of ``data``
    :rtype: np.array
    """
    # column of the lowest absolute distance in every row
    indices = np.argmin(np.abs(data), axis=1)
    # Pair every row with its own column. ``np.take(data, indices)`` without
    # an axis indexes the flattened array, i.e. it only ever reads row 0.
    return data[np.arange(data.shape[0]), indices]
def _max_samples(self, data: np.array, y: np.array) -> np.array:
# select the class with max number of samples

View File

@@ -1,205 +0,0 @@
"""
__author__ = "Ricardo Montañana Gómez"
__copyright__ = "Copyright 2020, Ricardo Montañana Gómez"
__license__ = "MIT"
__version__ = "0.9"
Plot 3D views of nodes in Stree
"""
import os
import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
from .Strees import Stree, Snode, Siterator
class Snode_graph(Snode):
    """Snode able to draw its samples and its splitting hyperplane in 3D."""

    def __init__(self, node: Snode):
        """Create a plottable duplicate of a node.

        :param node: node to mirror; it is duplicated through ``Snode.copy``
        :type node: Snode
        """
        self._plot_size = (8, 8)
        self._xlimits = (None, None)
        self._ylimits = (None, None)
        self._zlimits = (None, None)
        n = Snode.copy(node)
        super().__init__(n._clf, n._X, n._y, n._title)

    def set_plot_size(self, size: tuple):
        """Store the figure size used by the plotting methods."""
        self._plot_size = size

    def get_plot_size(self) -> tuple:
        """Return the figure size used by the plotting methods."""
        return self._plot_size

    def _is_pure(self) -> bool:
        """is considered pure a leaf node with one label
        """
        return bool(self.is_leaf()) and self._belief == 1.0

    def set_axis_limits(self, limits: tuple):
        """Store the x, y and z limits.

        :param limits: three items, unpacked as (x limits, y limits, z limits)
        :type limits: tuple
        """
        self._xlimits, self._ylimits, self._zlimits = limits

    def get_axis_limits(self) -> tuple:
        """Return the stored limits as (x limits, y limits, z limits)."""
        return self._xlimits, self._ylimits, self._zlimits

    def _set_graphics_axis(self, ax: Axes3D):
        """Apply the stored limits to the given 3D axes."""
        ax.set_xlim(self._xlimits)
        ax.set_ylim(self._ylimits)
        ax.set_zlim(self._zlimits)

    def save_hyperplane(
        self, save_folder: str = "./", save_prefix: str = "", save_seq: int = 1
    ):
        """Plot the node and save the figure as a png file.

        The file is written to
        ``<save_folder>/<save_prefix>STnode<save_seq>.png`` and the figure is
        closed afterwards.
        """
        _, fig = self.plot_hyperplane()
        name = os.path.join(save_folder, f"{save_prefix}STnode{save_seq}.png")
        fig.savefig(name, bbox_inches="tight")
        plt.close(fig)

    def _get_cmap(self):
        """Return the colormap name: "jet_r" for pure nodes of class 1,
        "jet" otherwise.
        """
        return "jet_r" if self._is_pure() and self._class == 1 else "jet"

    def _graph_title(self):
        """Return the node title followed by its labels and their counts."""
        n_class, card = np.unique(self._y, return_counts=True)
        return f"{self._title} {n_class} {card}"

    def plot_hyperplane(self, plot_distribution: bool = True):
        """Plot the splitting hyperplane of the node.

        :param plot_distribution: also scatter the samples of the node,
            defaults to True
        :type plot_distribution: bool, optional
        :return: the axes and the figure used, in that order
        :rtype: tuple
        """
        fig = plt.figure(figsize=self._plot_size)
        ax = fig.add_subplot(1, 1, 1, projection="3d")
        if not self._is_pure():
            # Can't plot the hyperplane of leaves with one label because they
            # have no classifier
            tmpx = np.linspace(self._X[:, 0].min(), self._X[:, 0].max())
            tmpy = np.linspace(self._X[:, 1].min(), self._X[:, 1].max())
            xx, yy = np.meshgrid(tmpx, tmpy)
            # solve intercept + c0 * x + c1 * y + c2 * z = 0 for z
            coef = self._clf.coef_[0]
            zz = (
                -self._clf.intercept_ - coef[0] * xx - coef[1] * yy
            ) / coef[2]
            ax.plot_surface(
                xx,
                yy,
                zz,
                alpha=0.5,
                antialiased=True,
                rstride=1,
                cstride=1,
                cmap="seismic",
            )
            self._set_graphics_axis(ax)
        if plot_distribution:
            self.plot_distribution(ax)
        else:
            plt.title(self._graph_title())
            plt.show()
        return ax, fig

    def plot_distribution(self, ax: Axes3D = None):
        """Scatter the samples of the node coloured by label.

        :param ax: axes to draw on; a new figure is created when it is None
        :type ax: Axes3D, optional
        """
        if ax is None:
            fig = plt.figure(figsize=self._plot_size)
            ax = fig.add_subplot(1, 1, 1, projection="3d")
        plt.title(self._graph_title())
        cmap = self._get_cmap()
        ax.scatter(
            self._X[:, 0], self._X[:, 1], self._X[:, 2], c=self._y, cmap=cmap
        )
        ax.set_xlabel("X0")
        ax.set_ylabel("X1")
        ax.set_zlabel("X2")
        plt.show()
class Stree_grapher(Stree):
    """Build 3d graphs of any dataset, if it's more than 3 features PCA shall
    make its magic
    """

    def __init__(self, params: dict):
        """Create the grapher.

        :param params: keyword arguments forwarded to ``Stree``
        :type params: dict
        """
        self._plot_size = (8, 8)
        self._tree_gr = None
        # make Snode store X's
        os.environ["TESTING"] = "1"
        self._fitted = False
        self._pca = None
        super().__init__(**params)

    def __del__(self):
        # the variable may already be gone, that is not an error
        os.environ.pop("TESTING", None)

    def _copy_tree(self, node: Snode) -> Snode_graph:
        """Recursively mirror a tree of Snode as a tree of Snode_graph.

        :param node: root of the (sub)tree to copy
        :type node: Snode
        :return: root of the mirrored (sub)tree
        :rtype: Snode_graph
        """
        mirror = Snode_graph(node)
        # clone node
        mirror._class = node._class
        mirror._belief = node._belief
        if node.get_down() is not None:
            mirror.set_down(self._copy_tree(node.get_down()))
        if node.get_up() is not None:
            mirror.set_up(self._copy_tree(node.get_up()))
        return mirror

    def fit(
        self, X: np.array, y: np.array, sample_weight: np.array = None
    ) -> "Stree_grapher":
        """Fit the Stree and copy the tree in a Snode_graph tree

        :param X: Dataset
        :type X: np.array
        :param y: Labels
        :type y: np.array
        :param sample_weight: forwarded unchanged to ``Stree.fit``
        :type sample_weight: np.array, optional
        :return: the fitted grapher itself
        :rtype: Stree_grapher
        """
        if X.shape[1] != 3:
            # project the dataset onto 3 components so it can be plotted
            self._pca = PCA(n_components=3)
            X = self._pca.fit_transform(X)
        super().fit(X, y, sample_weight=sample_weight)
        self._tree_gr = self._copy_tree(self.tree_)
        self._fitted = True
        return self

    def score(self, X: np.array, y: np.array) -> float:
        """Return the score of the model on X, applying the fitted PCA first
        when X does not have exactly 3 features.
        """
        self._check_fitted()
        if X.shape[1] != 3:
            X = self._pca.transform(X)
        return super().score(X, y)

    def _check_fitted(self):
        """Raise an Exception when ``fit`` has not been called yet."""
        if not self._fitted:
            raise Exception("Have to fit the grapher first!")

    def save_all(self, save_folder: str = "./", save_prefix: str = ""):
        """Save all the node plots in png format, each with a sequence number

        :param save_folder: folder where the plots are saved, defaults to './'
        :type save_folder: str, optional
        :param save_prefix: text prepended to every file name, defaults to ''
        :type save_prefix: str, optional
        """
        self._check_fitted()
        # also creates missing parent folders and tolerates an existing one
        os.makedirs(save_folder, exist_ok=True)
        for seq, node in enumerate(self, start=1):
            node.save_hyperplane(
                save_folder=save_folder, save_prefix=save_prefix, save_seq=seq
            )

    def plot_all(self):
        """Plots all the nodes
        """
        self._check_fitted()
        for node in self:
            node.plot_hyperplane()

    def __iter__(self):
        """Iterate over the nodes of the mirrored (Snode_graph) tree."""
        return Siterator(self._tree_gr)

View File

@@ -1,4 +1,3 @@
from .Strees import Stree, Snode, Siterator
from .Strees_grapher import Stree_grapher, Snode_graph
__all__ = ["Stree", "Snode", "Siterator", "Stree_grapher", "Snode_graph"]
__all__ = ["Stree", "Snode", "Siterator"]

91
stree/tests/Snode_test.py Normal file
View File

@@ -0,0 +1,91 @@
import os
import unittest
import numpy as np
from stree import Stree, Snode
from .utils import get_dataset
class Snode_test(unittest.TestCase):
    """Tests of Snode, partly run over the nodes of a fitted Stree."""

    def __init__(self, *args, **kwargs):
        seed = 1
        self._random_state = seed
        self._clf = Stree(random_state=seed)
        self._clf.fit(*get_dataset(seed))
        super().__init__(*args, **kwargs)

    @classmethod
    def setUp(cls):
        os.environ["TESTING"] = "1"

    def test_attributes_in_leaves(self):
        """Check if the attributes in leaves have correct values so they form a
        predictor
        """
        # preorder walk (down branch first) with an explicit stack
        pending = [self._clf.tree_]
        while pending:
            node = pending.pop()
            if not node.is_leaf():
                pending.append(node.get_up())
                pending.append(node.get_down())
                continue
            # Check Belief in leaf
            labels, counts = np.unique(node._y, return_counts=True)
            largest = max(counts)
            smallest = min(counts)
            if len(labels) > 1:
                try:
                    belief = largest / (largest + smallest)
                except ZeroDivisionError:
                    belief = 0.0
            else:
                belief = 1
            self.assertEqual(belief, node._belief)
            # Check Class
            self.assertEqual(labels[counts == largest], node._class)

    def test_nodes_coefs(self):
        """Check if the nodes of the tree have the right attributes filled
        """
        pending = [self._clf.tree_]
        while pending:
            node = pending.pop()
            if node._belief < 1:
                # only exclude pure leaves
                self.assertIsNotNone(node._clf)
                self.assertIsNotNone(node._clf.coef_)
            if not node.is_leaf():
                pending.append(node.get_up())
                pending.append(node.get_down())

    def test_make_predictor_on_leaf(self):
        """A leaf gets the majority label and its share as belief."""
        leaf = Snode(None, [1, 2, 3, 4], [1, 0, 1, 1], "test")
        leaf.make_predictor()
        self.assertEqual(1, leaf._class)
        self.assertEqual(0.75, leaf._belief)

    def test_make_predictor_on_not_leaf(self):
        """A node with a child gets no class and zero belief."""
        parent = Snode(None, [1, 2, 3, 4], [1, 0, 1, 1], "test")
        parent.set_up(Snode(None, [1], [1], "another_test"))
        parent.make_predictor()
        self.assertIsNone(parent._class)
        self.assertEqual(0, parent._belief)

    def test_make_predictor_on_leaf_bogus_data(self):
        """A leaf without labels gets no class."""
        leaf = Snode(None, [1, 2, 3, 4], [], "test")
        leaf.make_predictor()
        self.assertIsNone(leaf._class)

    def test_copy_node(self):
        """Snode.copy keeps samples, labels, title and classifier type."""
        samples = [1, 2, 3, 4]
        labels = [1]
        duplicate = Snode.copy(Snode(Stree(), samples, labels, "test"))
        self.assertListEqual(duplicate._X, samples)
        self.assertListEqual(duplicate._y, labels)
        self.assertEqual("test", duplicate._title)
        self.assertIsInstance(duplicate._clf, Stree)

View File

@@ -2,25 +2,10 @@ import os
import unittest
import numpy as np
from sklearn.datasets import make_classification, load_iris
from sklearn.datasets import load_iris
from stree import Stree, Snode
def get_dataset(random_state=0, n_classes=2):
    """Generate the synthetic classification dataset used by the tests.

    :param random_state: seed forwarded to ``make_classification``
    :param n_classes: number of classes to generate, defaults to 2
    :return: samples and labels as returned by ``make_classification``
    """
    settings = dict(
        n_samples=1500,
        n_features=3,
        n_informative=3,
        n_redundant=0,
        n_repeated=0,
        n_classes=n_classes,
        n_clusters_per_class=2,
        class_sep=1.5,
        flip_y=0,
        random_state=random_state,
    )
    samples, labels = make_classification(**settings)
    return samples, labels
from .utils import get_dataset
class Stree_test(unittest.TestCase):
@@ -280,76 +265,33 @@ class Stree_test(unittest.TestCase):
outcome = outcomes[name][f"{criteria} {kernel}"]
self.assertAlmostEqual(outcome, clf.score(px, py))
def test_min_distance(self):
    """_min_distance keeps, per row, the value with the lowest magnitude."""
    distances = np.array(
        [
            [-0.1, 0.2, -0.3],
            [0.7, 0.01, -0.1],
            [0.7, -0.9, 0.5],
            [0.1, 0.2, 0.3],
        ]
    )
    wanted = np.array([-0.1, 0.01, 0.5, 0.1])
    got = Stree()._min_distance(distances, None)
    self.assertEqual((4,), got.shape)
    self.assertListEqual(wanted.tolist(), got.tolist())
class Snode_test(unittest.TestCase):
    """Tests of Snode, partly run over the nodes of a fitted Stree."""

    def __init__(self, *args, **kwargs):
        seed = 1
        self._random_state = seed
        self._clf = Stree(random_state=seed)
        self._clf.fit(*get_dataset(seed))
        super().__init__(*args, **kwargs)

    @classmethod
    def setUp(cls):
        os.environ["TESTING"] = "1"

    def test_attributes_in_leaves(self):
        """Check if the attributes in leaves have correct values so they form a
        predictor
        """

        def verify(node: Snode):
            if node.is_leaf():
                # Check Belief in leaf
                labels, counts = np.unique(node._y, return_counts=True)
                largest = max(counts)
                smallest = min(counts)
                if len(labels) > 1:
                    try:
                        belief = largest / (largest + smallest)
                    except ZeroDivisionError:
                        belief = 0.0
                else:
                    belief = 1
                self.assertEqual(belief, node._belief)
                # Check Class
                self.assertEqual(labels[counts == largest], node._class)
            else:
                verify(node.get_down())
                verify(node.get_up())

        verify(self._clf.tree_)

    def test_nodes_coefs(self):
        """Check if the nodes of the tree have the right attributes filled
        """

        def visit(node: Snode):
            if node._belief < 1:
                # only exclude pure leaves
                self.assertIsNotNone(node._clf)
                self.assertIsNotNone(node._clf.coef_)
            if not node.is_leaf():
                visit(node.get_down())
                visit(node.get_up())

        visit(self._clf.tree_)

    def test_make_predictor_on_leaf(self):
        """A leaf gets the majority label and its share as belief."""
        leaf = Snode(None, [1, 2, 3, 4], [1, 0, 1, 1], "test")
        leaf.make_predictor()
        self.assertEqual(1, leaf._class)
        self.assertEqual(0.75, leaf._belief)

    def test_make_predictor_on_not_leaf(self):
        """A node with a child gets no class and zero belief."""
        parent = Snode(None, [1, 2, 3, 4], [1, 0, 1, 1], "test")
        parent.set_up(Snode(None, [1], [1], "another_test"))
        parent.make_predictor()
        self.assertIsNone(parent._class)
        self.assertEqual(0, parent._belief)

    def test_make_predictor_on_leaf_bogus_data(self):
        """A leaf without labels gets no class."""
        leaf = Snode(None, [1, 2, 3, 4], [], "test")
        leaf.make_predictor()
        self.assertIsNone(leaf._class)
def test_max_samples(self):
    """_max_samples returns one value per row for the given labels."""
    distances = np.array(
        [
            [-0.1, 0.2, -0.3],
            [0.7, 0.01, -0.1],
            [0.7, -0.9, 0.5],
            [0.1, 0.2, 0.3],
        ]
    )
    labels = [1, 2, 1, 0]
    wanted = np.array([0.2, 0.01, -0.9, 0.2])
    got = Stree()._max_samples(distances, labels)
    self.assertEqual((4,), got.shape)
    self.assertListEqual(wanted.tolist(), got.tolist())

View File

@@ -1,226 +0,0 @@
import os
import imghdr
import unittest
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import warnings
from sklearn.datasets import make_classification
from stree import Stree_grapher, Snode_graph, Snode
def get_dataset(random_state=0, n_features=3):
    """Generate the synthetic two-class dataset used by the grapher tests.

    :param random_state: seed forwarded to ``make_classification``
    :param n_features: number of features to generate, defaults to 3
    :return: samples and labels as returned by ``make_classification``
    """
    settings = dict(
        n_samples=1500,
        n_features=n_features,
        n_informative=3,
        n_redundant=0,
        n_repeated=0,
        n_classes=2,
        n_clusters_per_class=2,
        class_sep=1.5,
        flip_y=0,
        weights=[0.5, 0.5],
        random_state=random_state,
    )
    samples, labels = make_classification(**settings)
    return samples, labels
class Stree_grapher_test(unittest.TestCase):
    """Tests of Stree_grapher fitted on a 4-feature dataset."""

    def __init__(self, *args, **kwargs):
        seed = 1
        self._random_state = seed
        self._clf = Stree_grapher(dict(random_state=seed))
        self._clf.fit(*get_dataset(seed, n_features=4))
        super().__init__(*args, **kwargs)

    @classmethod
    def setUp(cls):
        os.environ["TESTING"] = "1"

    def test_iterator(self):
        """Check preorder iterator
        """
        expected = [
            "root",
            "root - Down",
            "root - Down - Down, <cgaf> - Leaf class=1 belief= 0.976023 counts"
            "=(array([0, 1]), array([ 17, 692]))",
            "root - Down - Up",
            "root - Down - Up - Down, <cgaf> - Leaf class=0 belief= 0.500000 "
            "counts=(array([0, 1]), array([1, 1]))",
            "root - Down - Up - Up, <cgaf> - Leaf class=0 belief= 0.888889 "
            "counts=(array([0, 1]), array([8, 1]))",
            "root - Up, <cgaf> - Leaf class=0 belief= 0.928205 counts=(array("
            "[0, 1]), array([724, 56]))",
        ]
        computed = [str(node) for node in self._clf]
        self.assertListEqual(expected, computed)

    def test_score(self):
        """score agrees with the accuracy computed from predict."""
        X, y = get_dataset(self._random_state)
        reported = self._clf.score(X, y)
        predicted = self._clf.predict(X)
        self.assertEqual(reported, np.mean(predicted == y))
        self.assertGreater(reported, 0.86)

    def test_score_4dims(self):
        """score on the 4-feature dataset matches the recorded value."""
        X, y = get_dataset(self._random_state, n_features=4)
        self.assertEqual(self._clf.score(X, y), 0.95)

    def test_save_all(self):
        """save_all writes one png per node (7) into the folder."""
        folder_name = os.path.join(os.sep, "tmp", "stree")
        if os.path.isdir(folder_name):
            os.rmdir(folder_name)
        file_names = []
        for i in range(1, 8):
            file_names.append(os.path.join(folder_name, f"STnode{i}.png"))
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            matplotlib.use("Agg")
            self._clf.save_all(save_folder=folder_name)
        for file_name in file_names:
            self.assertTrue(os.path.exists(file_name))
            self.assertEqual("png", imghdr.what(file_name))
            os.remove(file_name)
        os.rmdir(folder_name)

    def test_plot_all(self):
        """plot_all opens one figure per node (7)."""
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            matplotlib.use("Agg")
            before = plt.gcf().number
            self._clf.plot_all()
            after = plt.gcf().number
        self.assertEqual(7, after - before)
class Snode_graph_test(unittest.TestCase):
    """Tests of Snode_graph over the tree of a fitted Stree_grapher."""

    def __init__(self, *args, **kwargs):
        seed = 1
        self._random_state = seed
        self._clf = Stree_grapher(dict(random_state=seed))
        self._clf.fit(*get_dataset(seed))
        super().__init__(*args, **kwargs)

    @classmethod
    def setUp(cls):
        os.environ["TESTING"] = "1"

    def test_plot_size(self):
        """set_plot_size / get_plot_size round trip, then restore."""
        root = self._clf._tree_gr
        default = root.get_plot_size()
        expected = (17, 3)
        root.set_plot_size(expected)
        self.assertEqual(expected, self._clf._tree_gr.get_plot_size())
        self._clf._tree_gr.set_plot_size(default)
        self.assertEqual(default, self._clf._tree_gr.get_plot_size())

    def test_attributes_in_leaves_graph(self):
        """Check if the attributes in leaves have correct values so they form a
        predictor
        """

        def verify(node: Snode_graph):
            if node.is_leaf():
                # Check Belief in leaf
                labels, counts = np.unique(node._y, return_counts=True)
                largest = max(counts)
                smallest = min(counts)
                if len(labels) > 1:
                    try:
                        belief = largest / (largest + smallest)
                    except ZeroDivisionError:
                        belief = 0.0
                else:
                    belief = 1
                self.assertEqual(belief, node._belief)
                # Check Class
                self.assertEqual(labels[counts == largest], node._class)
            else:
                verify(node.get_down())
                verify(node.get_up())

        verify(self._clf._tree_gr)

    def test_nodes_graph_coefs(self):
        """Check if the nodes of the tree have the right attributes filled
        """

        def visit(node: Snode_graph):
            if node._belief < 1:
                # only exclude pure leaves
                self.assertIsNotNone(node._clf)
                self.assertIsNotNone(node._clf.coef_)
            if not node.is_leaf():
                visit(node.get_down())
                visit(node.get_up())

        visit(self._clf._tree_gr)

    def test_save_hyperplane(self):
        """save_hyperplane writes a png named STnode1.png."""
        folder_name = "/tmp/"
        file_name = os.path.join(folder_name, "STnode1.png")
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            matplotlib.use("Agg")
            self._clf._tree_gr.save_hyperplane(folder_name)
        self.assertTrue(os.path.exists(file_name))
        self.assertEqual("png", imghdr.what(file_name))
        os.remove(file_name)

    def test_plot_hyperplane_with_distribution(self):
        """plot_hyperplane with distribution opens exactly one figure."""
        plt.close()
        # select a pure node
        node = self._clf._tree_gr.get_down().get_up().get_up()
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            matplotlib.use("Agg")
            before = plt.gcf().number
            node.plot_hyperplane(plot_distribution=True)
            after = plt.gcf().number
        self.assertEqual(1, after - before)

    def test_plot_hyperplane_without_distribution(self):
        """plot_hyperplane without distribution opens exactly one figure."""
        plt.close()
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            matplotlib.use("Agg")
            before = plt.gcf().number
            self._clf._tree_gr.plot_hyperplane(plot_distribution=False)
            after = plt.gcf().number
        self.assertEqual(1, after - before)

    def test_plot_distribution(self):
        """plot_distribution without axes opens exactly one figure."""
        plt.close()
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            matplotlib.use("Agg")
            before = plt.gcf().number
            self._clf._tree_gr.plot_distribution()
            after = plt.gcf().number
        self.assertEqual(1, after - before)

    def test_set_axis_limits(self):
        """set_axis_limits / get_axis_limits round trip."""
        node = Snode_graph(Snode(None, None, None, "test"))
        limits = (-2, 2), (-3, 3), (-4, 4)
        node.set_axis_limits(limits)
        want_x, want_y, want_z = limits
        got_x, got_y, got_z = node.get_axis_limits()
        self.assertEqual(want_x, got_x)
        self.assertEqual(want_y, got_y)
        self.assertEqual(want_z, got_z)

    def test_cmap_change(self):
        """The colormap switches to "jet_r" once the node is pure, class 1."""
        node = Snode_graph(Snode(None, None, None, "test"))
        self.assertEqual("jet", node._get_cmap())
        # make node pure
        node._belief = 1.0
        node._class = 1
        self.assertEqual("jet_r", node._get_cmap())

View File

@@ -1,9 +1,4 @@
from .Strees_test import Stree_test, Snode_test
from .Strees_grapher_test import Stree_grapher_test, Snode_graph_test
from .Stree_test import Stree_test
from .Snode_test import Snode_test
__all__ = [
"Stree_test",
"Snode_test",
"Stree_grapher_test",
"Snode_graph_test",
]
__all__ = ["Stree_test", "Snode_test"]

17
stree/tests/utils.py Normal file
View File

@@ -0,0 +1,17 @@
from sklearn.datasets import make_classification
def get_dataset(random_state=0, n_classes=2):
    """Generate the synthetic classification dataset shared by the tests.

    :param random_state: seed forwarded to ``make_classification``
    :param n_classes: number of classes to generate, defaults to 2
    :return: samples and labels as returned by ``make_classification``
    """
    settings = dict(
        n_samples=1500,
        n_features=3,
        n_informative=3,
        n_redundant=0,
        n_repeated=0,
        n_classes=n_classes,
        n_clusters_per_class=2,
        class_sep=1.5,
        flip_y=0,
        random_state=random_state,
    )
    samples, labels = make_classification(**settings)
    return samples, labels