diff --git a/k.py b/k.py
index c8bd820..1c76ea2 100644
--- a/k.py
+++ b/k.py
@@ -21,13 +21,15 @@ mfsd = MFS(discrete=True)

 # X = data[:, -1:]
 # y = data[:, -1]
-
-data, meta = arff.loadarff(
-    "/Users/rmontanana/Code/stree_datasets/data/tanveer/balance-scale/balance-scale.arff"
+filename = (
+    "/Users/rmontanana/Code/stree_datasets/data/tanveer/conn-bench-sonar-min"
+    "es-rocks/conn-bench-sonar-mines-rocks.arff"
 )
-train = np.array([data["f1"], data["f2"], data["f3"], data["f4"]])
-y = data["clase"]
+data, meta = arff.loadarff(filename)
+train = np.array([data[i] for i in meta])
 X = train.T
+X = X[:, :-1].astype("float64")
+y = data["clase"]


 for c in range(X.shape[1]):
@@ -68,10 +70,10 @@ clf = Stree(random_state=0)
 subf = fcfb_f
 print("fcfb", clf.fit(X[:, subf], y).score(X[:, subf], y))

-for c in range(X.shape[1]):
-    for k in range(X.shape[1]):
-        ac = 0
-        for v in range(X[:, c].shape[0]):
-            if X[v, c] == X[v, k]:
-                ac += 1
-        print(f"{c} {k} {ac}")
+# for c in range(X.shape[1]):
+#     for k in range(X.shape[1]):
+#         ac = 0
+#         for v in range(X[:, c].shape[0]):
+#             if X[v, c] == X[v, k]:
+#                 ac += 1
+#         print(f"{c} {k} {ac}")
diff --git a/mfs/Metrics.py b/mfs/Metrics.py
index c91f705..13ecf24 100755
--- a/mfs/Metrics.py
+++ b/mfs/Metrics.py
@@ -27,6 +27,90 @@ class Metrics:
         """
         return Metrics._compute_mi_cd(x, y, n_neighbors=3)

+    @staticmethod
+    def information_gain_cont_features(xa, xb):
+        """Measures the reduction in uncertainty about the value of xb when
+        the value of the continuous variable xa is known (mutual information)
+        (https://www.sciencedirect.com/science/article/pii/S0020025519303603)
+
+        Parameters
+        ----------
+        xa : np.array
+            values of the continuous variable
+        xb : np.array
+            values of the continuous variable
+
+        Returns
+        -------
+        float
+            Information gained
+        """
+        return Metrics._compute_mi_cc(xa, xb, n_neighbors=3)
+
+    @staticmethod
+    def _compute_mi_cc(x, y, n_neighbors):
+        """Compute mutual information between two continuous variables.
+
+        Parameters
+        ----------
+        x, y : ndarray, shape (n_samples,)
+            Samples of two continuous random variables, must have an identical
+            shape.
+
+        n_neighbors : int
+            Number of nearest neighbors to search for each point, see [1]_.
+
+        Returns
+        -------
+        mi : float
+            Estimated mutual information. If it turns out to be negative it
+            is replaced by 0.
+
+        Notes
+        -----
+        True mutual information can't be negative. If its estimate by a
+        numerical method is negative, it means (provided the method is
+        adequate) that the mutual information is close to 0 and replacing it
+        by 0 is a reasonable strategy.
+
+        References
+        ----------
+        .. [1] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual
+               information". Phys. Rev. E 69, 2004.
+        """
+
+        n_samples = x.size
+
+        x = x.reshape((-1, 1))
+        y = y.reshape((-1, 1))
+        xy = np.hstack((x, y))
+
+        # Here we rely on NearestNeighbors to select the fastest algorithm.
+        nn = NearestNeighbors(metric="chebyshev", n_neighbors=n_neighbors)
+
+        nn.fit(xy)
+        radius = nn.kneighbors()[0]
+        radius = np.nextafter(radius[:, -1], 0)
+
+        # KDTree is explicitly fit to allow for the querying of number of
+        # neighbors within a specified radius
+        kd = KDTree(x, metric="chebyshev")
+        nx = kd.query_radius(x, radius, count_only=True, return_distance=False)
+        nx = np.array(nx) - 1.0
+
+        kd = KDTree(y, metric="chebyshev")
+        ny = kd.query_radius(y, radius, count_only=True, return_distance=False)
+        ny = np.array(ny) - 1.0
+
+        mi = (
+            digamma(n_samples)
+            + digamma(n_neighbors)
+            - np.mean(digamma(nx + 1))
+            - np.mean(digamma(ny + 1))
+        )
+
+        return max(0, mi)
+
     @staticmethod
     def _compute_mi_cd(c, d, n_neighbors):
         """Compute mutual information between continuous and discrete
@@ -189,6 +273,33 @@ class Metrics:
             )
         )

+    @staticmethod
+    def symmetrical_unc_continuous_features(x, y):
+        """Compute symmetrical uncertainty. Using Greg Ver Steeg's npeet
+        https://github.com/gregversteeg/NPEET
+
+        Parameters
+        ----------
+        x : np.array
+            values of the continuous variable
+        y : np.array
+            array of labels
+
+        Returns
+        -------
+        float
+            symmetrical uncertainty
+        """
+
+        return (
+            2.0
+            * Metrics.information_gain_cont_features(x, y)
+            / (
+                Metrics.differential_entropy(x, k=len(x) - 1)
+                + Metrics.entropy(y)
+            )
+        )
+
     @staticmethod
     def symmetrical_uncertainty(x, y):
         """Compute symmetrical uncertainty. Normalize* information gain (mutual
diff --git a/mfs/Selection.py b/mfs/Selection.py
index 539fea2..f979153 100755
--- a/mfs/Selection.py
+++ b/mfs/Selection.py
@@ -33,6 +33,11 @@ class MFS:
             if discrete
             else Metrics.symmetrical_unc_continuous
         )
+        self.symmetrical_uncertainty_features = (
+            Metrics.symmetrical_uncertainty
+            if discrete
+            else Metrics.symmetrical_unc_continuous_features
+        )
         self._fitted = False

     def _initialize(self, X, y):
@@ -93,7 +98,7 @@
         if (feature_a, feature_b) not in self._su_features:
             self._su_features[
                 (feature_a, feature_b)
-            ] = self.symmetrical_uncertainty(
+            ] = self.symmetrical_uncertainty_features(
                 self.X_[:, feature_a], self.X_[:, feature_b]
             )
         return self._su_features[(feature_a, feature_b)]
@@ -148,7 +153,7 @@
         candidates.append(first_candidate)
         self._scores.append(s_list[first_candidate])
         while continue_condition:
-            merit = float_info.min
+            merit = -float_info.min
             id_selected = None
             for idx, feature in enumerate(feature_order):
                 candidates.append(feature)
@@ -157,9 +162,6 @@
                     id_selected = idx
                     merit = merit_new
                 candidates.pop()
-            if id_selected is None:
-                # Every merit computed is 0
-                break
             candidates.append(feature_order[id_selected])
             self._scores.append(merit)
             del feature_order[id_selected]
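
The `_compute_mi_cc` added above is the Kraskov k-nearest-neighbour estimator of mutual information between two continuous variables. Below is a minimal, self-contained sketch of the same computation, sanity-checked against the closed-form value for a correlated bivariate Gaussian, MI = -0.5 * ln(1 - rho^2); the helper name `mi_cc` and the sample size and correlation used are illustrative only, not part of the patch.

```python
import numpy as np
from scipy.special import digamma
from sklearn.neighbors import KDTree, NearestNeighbors


def mi_cc(x, y, n_neighbors=3):
    """Kraskov k-NN mutual information between two continuous 1-D samples."""
    n_samples = x.size
    x = x.reshape((-1, 1))
    y = y.reshape((-1, 1))
    xy = np.hstack((x, y))

    # Chebyshev distance to the k-th neighbour in the joint (x, y) space.
    nn = NearestNeighbors(metric="chebyshev", n_neighbors=n_neighbors)
    nn.fit(xy)
    radius = np.nextafter(nn.kneighbors()[0][:, -1], 0)

    # Count neighbours within that radius in each marginal space.
    nx = KDTree(x, metric="chebyshev").query_radius(x, radius, count_only=True) - 1.0
    ny = KDTree(y, metric="chebyshev").query_radius(y, radius, count_only=True) - 1.0

    mi = (
        digamma(n_samples)
        + digamma(n_neighbors)
        - np.mean(digamma(nx + 1))
        - np.mean(digamma(ny + 1))
    )
    return max(0.0, mi)


if __name__ == "__main__":
    # For a bivariate Gaussian with correlation rho, MI = -0.5 * ln(1 - rho^2).
    rng = np.random.default_rng(0)
    rho = 0.8
    sample = rng.multivariate_normal([0.0, 0.0], [[1.0, rho], [rho, 1.0]], 5000)
    print("estimated :", mi_cc(sample[:, 0], sample[:, 1]))
    print("analytical:", -0.5 * np.log(1.0 - rho**2))
```

With 5000 samples and k=3 the estimate should land close to the analytical 0.511 nats, which gives a quick way to validate the estimator before wiring it into `symmetrical_unc_continuous_features`.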