Change arff library in sample.py

This commit is contained in:
2023-04-14 11:20:48 +02:00
parent 25d341aee5
commit 878cd379ee

View File

@@ -5,6 +5,7 @@ from scipy.io import arff
import pandas as pd import pandas as pd
from sklearn.ensemble import RandomForestClassifier from sklearn.ensemble import RandomForestClassifier
from fimdlp.mdlp import FImdlp from fimdlp.mdlp import FImdlp
from fimdlp.cppfimdlp import CArffFiles
datasets = { datasets = {
"mfeat-factors": True, "mfeat-factors": True,
@@ -29,13 +30,15 @@ relative = "" if os.path.isdir("src") else ".."
file_name = os.path.join( file_name = os.path.join(
relative, "src", "cppmdlp", "tests", "datasets", args.dataset relative, "src", "cppmdlp", "tests", "datasets", args.dataset
) )
data = arff.loadarff(file_name + ".arff") arff = CArffFiles()
df = pd.DataFrame(data[0]) arff.load(bytes(f"{file_name}.arff", "utf-8"))
class_column = -1 if datasets[args.dataset] else 0 X = arff.get_X()
class_name = df.columns.to_list()[class_column] y = arff.get_y()
X = df.drop(class_name, axis=1) attributes = arff.get_attributes()
y, _ = pd.factorize(df[class_name]) attributes = [x[0].decode() for x in attributes]
X = X.to_numpy() df = pd.DataFrame(X, columns=attributes)
class_name = arff.get_class_name().decode()
df[class_name] = y
test = FImdlp( test = FImdlp(
min_length=args.min_length, min_length=args.min_length,
max_depth=args.max_depth, max_depth=args.max_depth,
@@ -48,7 +51,13 @@ print(f"Fitting ....: {fit_time - now:7.5f} seconds")
now = time.time() now = time.time()
Xt = test.transform(X) Xt = test.transform(X)
print(f"Transforming: {time.time() - now:7.5f} seconds") print(f"Transforming: {time.time() - now:7.5f} seconds")
print(test.get_cut_points()) cut_points = test.get_cut_points()
for i, cuts in enumerate(cut_points):
print(f"Cut points for feature {attributes[i]}: {cuts}")
print(f"Min: {min(X[:, i]):6.4f} Max: {max(X[:, i]):6.4f}")
num_cuts = sum([len(x) for x in cut_points])
print(f"Total cut points ...: {num_cuts}")
print(f"Total feature states: {num_cuts + len(attributes)}")
clf = RandomForestClassifier(random_state=0) clf = RandomForestClassifier(random_state=0)
print( print(
"Random Forest score with discretized data: ", clf.fit(Xt, y).score(Xt, y) "Random Forest score with discretized data: ", clf.fit(Xt, y).score(Xt, y)