Files
fimdlp/samples/sample.py
2023-04-25 17:11:40 +02:00

74 lines
2.0 KiB
Python

import time
import argparse
import os
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from fimdlp.mdlp import FImdlp
from fimdlp.cppfimdlp import CArffFiles
datasets = {
"mfeat-factors": True,
"iris": True,
"glass": True,
"liver-disorders": True,
"letter": True,
"kdd_JapaneseVowels": False,
}
ap = argparse.ArgumentParser()
ap.add_argument(
"-n",
"--min_length",
type=int,
default=3,
help="Minimum length of interval",
)
ap.add_argument(
"-m", "--max_depth", type=int, default=9999, help="Maximum depth"
)
ap.add_argument(
"-c",
"--max_cuts",
type=float,
default=0,
help="Maximum number of cut points",
)
ap.add_argument("dataset", type=str, choices=datasets.keys())
args = ap.parse_args()
relative = "" if os.path.isdir("src") else ".."
file_name = os.path.join(
relative, "src", "cppmdlp", "tests", "datasets", args.dataset
)
arff = CArffFiles()
arff.load(bytes(f"{file_name}.arff", "utf-8"))
X = arff.get_X()
y = arff.get_y()
attributes = arff.get_attributes()
attributes = [x[0].decode() for x in attributes]
df = pd.DataFrame(X, columns=attributes)
class_name = arff.get_class_name().decode()
df[class_name] = y
test = FImdlp(
min_length=args.min_length,
max_depth=args.max_depth,
max_cuts=args.max_cuts,
)
now = time.time()
test.fit(X, y)
fit_time = time.time()
print(f"Fitting ....: {fit_time - now:7.5f} seconds")
now = time.time()
Xt = test.transform(X)
print(f"Transforming: {time.time() - now:7.5f} seconds")
cut_points = test.get_cut_points()
for i, cuts in enumerate(cut_points):
print(f"Cut points for feature {attributes[i]}: {cuts}")
print(f"Min: {min(X[:, i]):6.4f} Max: {max(X[:, i]):6.4f}")
num_cuts = sum([len(x) for x in cut_points])
print(f"Total cut points ...: {num_cuts}")
print(f"Total feature states: {num_cuts + len(attributes)}")
clf = RandomForestClassifier(random_state=0)
print(
"Random Forest score with discretized data: ", clf.fit(Xt, y).score(Xt, y)
)