import time
from math import log2

import numpy as np
import pandas as pd
from scipy.io import arff
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier

from fimdlp.cppfimdlp import CFImdlp
from fimdlp.mdlp import FImdlp


def entropy(y: np.ndarray) -> float:
    """Compute entropy of a labels set

    Parameters
    ----------
    y : np.ndarray
        set of labels; passed to ``np.bincount`` to count each class

    Returns
    -------
    float
        base-2 entropy of the label distribution; 0.0 when there is at
        most one sample or at most one class present
    """
    n_labels = len(y)
    if n_labels <= 1:
        return 0.0
    counts = np.bincount(y)
    proportions = counts / n_labels
    n_classes = np.count_nonzero(proportions)
    if n_classes <= 1:
        return 0.0
    # Compute standard entropy. Classes with zero proportion are skipped
    # because log2(0) is undefined. math.log2 accepts exactly one argument
    # (the base is implied by the function name).
    return float(
        -sum(prop * log2(prop) for prop in proportions if prop != 0.0)
    )


def information_gain(
    labels: np.ndarray, labels_up: np.ndarray, labels_dn: np.ndarray
) -> float:
    """Compute the information gain of splitting a labels set in two

    Parameters
    ----------
    labels : np.ndarray
        labels before the split
    labels_up : np.ndarray
        labels of the first partition; may be None, in which case it
        contributes no samples
    labels_dn : np.ndarray
        labels of the second partition; may be None, in which case it
        contributes no samples

    Returns
    -------
    float
        entropy of ``labels`` minus the entropies of both partitions, each
        weighted by its share of the samples; 0.0 when both partitions
        are empty or None
    """
    imp_prev = entropy(labels)
    card_up = card_dn = imp_up = imp_dn = 0
    if labels_up is not None:
        card_up = labels_up.shape[0]
        imp_up = entropy(labels_up)
    if labels_dn is not None:
        card_dn = labels_dn.shape[0]
        imp_dn = entropy(labels_dn)
    samples = card_up + card_dn
    if samples == 0:
        # Nothing was split, so there is no gain (and no division by zero).
        return 0.0
    return (
        imp_prev
        - (card_up / samples) * imp_up
        - (card_dn / samples) * imp_dn
    )


# Load the ARFF dataset, drop incomplete rows and separate the features
# from the class column (encoded as integer codes by pd.factorize).
class_name = "speaker"
file_name = "kdd_JapaneseVowels.arff"
data = arff.loadarff(file_name)
df = pd.DataFrame(data[0])
df.dropna(axis=0, how="any", inplace=True)
dataset = df
X = df.drop(class_name, axis=1)
features = X.columns
y, _ = pd.factorize(df[class_name])
X = X.to_numpy()

# data = load_iris()
# X = data.data
# y = data.target
# features = data.feature_names

# Time the discretizer's fit and transform steps on columns 3..13.
test = FImdlp()
now = time.time()
test.fit(X, y, features=list(range(3, 14)))
fit_time = time.time()
print("Fitting: ", fit_time - now)
now = time.time()
Xt = test.transform(X)
print("Transforming: ", time.time() - now)
print(test.get_cut_points())

# Score a random forest on the discretized data (training-set accuracy).
clf = RandomForestClassifier(random_state=0)
print(clf.fit(Xt, y).score(Xt, y))

# for proposal in [True, False]:
#     X = data.data
#     y = data.target
#     print("*** Proposal: ", proposal)
#     test = CFImdlp(debug=True, proposal=proposal)
#     test.fit(X[:, 0], y)
#     result = test.get_cut_points()
#     for item in result:
#         print(
#             f"Class={item['classNumber']} - ({item['start']:3d}, "
#             f"{item['end']:3d}) -> ({item['fromValue']:3.1f}, "
#             f"{item['toValue']:3.1f}]"
#         )
#     print(test.get_discretized_values())
#     print("+" * 40)
#     X = np.array(
#         [
#             [5.1, 3.5, 1.4, 0.2],
#             [5.2, 3.0, 1.4, 0.2],
#             [5.3, 3.2, 1.3, 0.2],
#             [5.4, 3.1, 1.5, 0.2],
#         ]
#     )
#     y = np.array([0, 0, 0, 1])
#     print(test.fit(X[:, 0], y).transform(X[:, 0]))
#     result = test.get_cut_points()
#     for item in result:
#         print(
#             f"Class={item['classNumber']} - ({item['start']:3d}, {item['end']:3d})"
#             f" -> ({item['fromValue']:3.1f}, {item['toValue']:3.1f}]"
#         )
#     print("*" * 40)
# # print(Xs, ys)
# # print("**********************")
# # test = [(0, 3), (4, 4), (5, 5), (6, 8), (9, 9)]
# # print(ys)
# # for start, end in test:
# #     print("Testing ", start, end, ys[:end], ys[end:])
# #     print("Information gain: ", information_gain(ys, ys[:end], ys[end:]))
# # print(test.transform(X))
# # print(X)
# # print(indices)
# # print(np.array(X)[indices])
# # # k = test.cut_points(X[:, 0], y)
# # # print(k)
# # # k = test.cut_points_ant(X[:, 0], y)
# # # print(k)
# # # test.debug_points(X[:, 0], y)
# # X = [5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9]
# # y = [1, 1, 1, 1, 1, 2, 2, 2, 2, 2]
# # indices = [4, 3, 6, 8, 2, 1, 5, 0, 9, 7]
# # clf = CFImdlp(debug=True, proposal=False)
# # clf.fit(X, y)
# # print(clf.get_cut_points())
# # y = [1, 1, 1, 1, 1, 2, 2, 2, 2, 2]
# # # To check
# # indices2 = np.argsort(X)
# # Xs = np.array(X)[indices2]
# # ys = np.array(y)[indices2]

# kdd_JapaneseVowels