Files
fimdlp/sample.py
2022-12-08 20:16:00 +01:00

158 lines
4.1 KiB
Python

from sklearn.datasets import load_iris
from fimdlp.mdlp import FImdlp
from fimdlp.cppfimdlp import CFImdlp
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import time
from math import log2
from scipy.io import arff
import pandas as pd
def entropy(y: np.array) -> float:
"""Compute entropy of a labels set
Parameters
----------
y : np.array
set of labels
Returns
-------
float
entropy
"""
n_labels = len(y)
if n_labels <= 1:
return 0
counts = np.bincount(y)
proportions = counts / n_labels
n_classes = np.count_nonzero(proportions)
if n_classes <= 1:
return 0
entropy = 0.0
# Compute standard entropy.
for prop in proportions:
if prop != 0.0:
entropy -= prop * log2(prop, 2)
return entropy
def information_gain(
labels: np.array, labels_up: np.array, labels_dn: np.array
) -> float:
imp_prev = entropy(labels)
card_up = card_dn = imp_up = imp_dn = 0
if labels_up is not None:
card_up = labels_up.shape[0]
imp_up = entropy(labels_up)
if labels_dn is not None:
card_dn = labels_dn.shape[0] if labels_dn is not None else 0
imp_dn = entropy(labels_dn)
samples = card_up + card_dn
if samples == 0:
return 0.0
else:
result = (
imp_prev
- (card_up / samples) * imp_up
- (card_dn / samples) * imp_dn
)
return result
class_name = "speaker"
file_name = "kdd_JapaneseVowels.arff"
data = arff.loadarff(file_name)
df = pd.DataFrame(data[0])
df.dropna(axis=0, how="any", inplace=True)
dataset = df
X = df.drop(class_name, axis=1)
features = X.columns
class_name = class_name
y, _ = pd.factorize(df[class_name])
X = X.to_numpy()
# data = load_iris()
# X = data.data
# y = data.target
# features = data.feature_names
test = FImdlp()
now = time.time()
test.fit(X, y, features=[i for i in (range(3, 14))])
fit_time = time.time()
print("Fitting: ", fit_time - now)
now = time.time()
Xt = test.transform(X)
print("Transforming: ", time.time() - now)
print(test.get_cut_points())
clf = RandomForestClassifier(random_state=0)
print(clf.fit(Xt, y).score(Xt, y))
# for proposal in [True, False]:
# X = data.data
# y = data.target
# print("*** Proposal: ", proposal)
# test = CFImdlp(debug=True, proposal=proposal)
# test.fit(X[:, 0], y)
# result = test.get_cut_points()
# for item in result:
# print(
# f"Class={item['classNumber']} - ({item['start']:3d}, "
# f"{item['end']:3d}) -> ({item['fromValue']:3.1f}, "
# f"{item['toValue']:3.1f}]"
# )
# print(test.get_discretized_values())
# print("+" * 40)
# X = np.array(
# [
# [5.1, 3.5, 1.4, 0.2],
# [5.2, 3.0, 1.4, 0.2],
# [5.3, 3.2, 1.3, 0.2],
# [5.4, 3.1, 1.5, 0.2],
# ]
# )
# y = np.array([0, 0, 0, 1])
# print(test.fit(X[:, 0], y).transform(X[:, 0]))
# result = test.get_cut_points()
# for item in result:
# print(
# f"Class={item['classNumber']} - ({item['start']:3d}, {item['end']:3d})"
# f" -> ({item['fromValue']:3.1f}, {item['toValue']:3.1f}]"
# )
# print("*" * 40)
# # print(Xs, ys)
# # print("**********************")
# # test = [(0, 3), (4, 4), (5, 5), (6, 8), (9, 9)]
# # print(ys)
# # for start, end in test:
# # print("Testing ", start, end, ys[:end], ys[end:])
# # print("Information gain: ", information_gain(ys, ys[:end], ys[end:]))
# # print(test.transform(X))
# # print(X)
# # print(indices)
# # print(np.array(X)[indices])
# # # k = test.cut_points(X[:, 0], y)
# # # print(k)
# # # k = test.cut_points_ant(X[:, 0], y)
# # # print(k)
# # # test.debug_points(X[:, 0], y)
# # X = [5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9]
# # y = [1, 1, 1, 1, 1, 2, 2, 2, 2, 2]
# # indices = [4, 3, 6, 8, 2, 1, 5, 0, 9, 7]
# # clf = CFImdlp(debug=True, proposal=False)
# # clf.fit(X, y)
# # print(clf.get_cut_points())
# # y = [1, 1, 1, 1, 1, 2, 2, 2, 2, 2]
# # # To check
# # indices2 = np.argsort(X)
# # Xs = np.array(X)[indices2]
# # ys = np.array(y)[indices2]
# kdd_JapaneseVowels