Mirror of https://github.com/Doctorado-ML/Stree_datasets.git, synced 2025-08-18 08:56:01 +00:00
Initial Commit
experimentation/Database.py (new file, 396 lines)
@@ -0,0 +1,396 @@
import os
import sqlite3
from datetime import datetime
from abc import ABC
from typing import List

from .Models import ModelBase
from .Utils import TextColor, MySQL


class BD(ABC):
    _folder = "./data/results"
    _con = None

    def __init__(self, host: str, model: ModelBase) -> None:
        self._model = model
        self._host = host
        self._database = os.path.join(
            self._folder, f"{host}_{model}_experiments.sqlite3"
        )
        self._con = sqlite3.connect(self._database)
        # return dict as a result of select
        self._con.row_factory = sqlite3.Row
        self._check_build()
        # accumulators used in reports
        self._best = self._worse = self._equal = 0

    def _check_build(self) -> None:
        """Check that the tables exist and create them if they don't"""
        commands = [
            'create table if not exists "outcomes" ("dataset" varchar NOT NULL'
            ',"date" datetime NOT NULL DEFAULT NULL,"fit_time" num NOT NULL '
            'DEFAULT NULL, "fit_time_std" num, "score_time" num NOT NULL '
            'DEFAULT NULL, "score_time_std" num, "train_score" num NOT NULL '
            'DEFAULT NULL, "train_score_std" num, "test_score" num NOT NULL '
            'DEFAULT NULL, "test_score_std" num, "parameters" text DEFAULT '
            'NULL, "normalize" int NOT NULL DEFAULT 0, "standardize" int NOT '
            "NULL DEFAULT 0, PRIMARY KEY (dataset, date));",
            'create table if not exists "hyperparameters" ("dataset" varchar '
            'NOT NULL,"date" datetime NOT NULL DEFAULT NULL,"fit_time" num NOT'
            ' NULL DEFAULT NULL, "fit_time_std" num, "score_time" num NOT NULL'
            ' DEFAULT NULL, "score_time_std" num, "train_score" num NOT NULL '
            'DEFAULT NULL, "train_score_std" num, "test_score" num NOT NULL '
            'DEFAULT NULL, "test_score_std" num, "parameters" text DEFAULT '
            'NULL, "normalize" int NOT NULL DEFAULT 0, "standardize" int NOT '
            "NULL DEFAULT 0, PRIMARY KEY (dataset, normalize, standardize));",
            'create table if not exists "reference" ("dataset" varchar NOT '
            'NULL,"score" num NOT NULL, PRIMARY KEY (dataset));',
            "INSERT or replace INTO reference (dataset, score) VALUES "
            "('balance-scale', '0.904628'),"
            "('balloons', '0.6625'),"
            "('breast-cancer-wisc-diag', '0.974345'),"
            "('breast-cancer-wisc-prog', '0.79934'),"
            "('breast-cancer-wisc', '0.970256'),"
            "('breast-cancer', '0.73824'),"
            "('cardiotocography-10clases', '0.827761'),"
            "('cardiotocography-3clases', '0.920134'),"
            "('conn-bench-sonar-mines-rocks', '0.833654'),"
            "('cylinder-bands', '0.769141'),"
            "('dermatology', '0.973278'),"
            "('echocardiogram', '0.848527'),"
            "('fertility', '0.884'),"
            "('haberman-survival', '0.739254'),"
            "('heart-hungarian', '0.820475'),"
            "('hepatitis', '0.823203'),"
            "('ilpd-indian-liver', '0.715028'),"
            "('ionosphere', '0.944215'),"
            "('iris', '0.978656'),"
            "('led-display', '0.7102'),"
            "('libras', '0.891111'),"
            "('low-res-spect', '0.90282'),"
            "('lymphography', '0.855405'),"
            "('mammographic', '0.827472'),"
            "('molec-biol-promoter', '0.818269'),"
            "('musk-1', '0.876471'),"
            "('oocytes_merluccius_nucleus_4d', '0.839963'),"
            "('oocytes_merluccius_states_2f', '0.929963'),"
            "('oocytes_trisopterus_nucleus_2f', '0.833333'),"
            "('oocytes_trisopterus_states_5b', '0.931579'),"
            "('parkinsons', '0.920221'),"
            "('pima', '0.767188'),"
            "('pittsburg-bridges-MATERIAL', '0.864286'),"
            "('pittsburg-bridges-REL-L', '0.695929'),"
            "('pittsburg-bridges-SPAN', '0.68913'),"
            "('pittsburg-bridges-T-OR-D', '0.87437'),"
            "('planning', '0.725579'),"
            "('post-operative', '0.711742'),"
            "('seeds', '0.956303'),"
            "('statlog-australian-credit', '0.678281'),"
            "('statlog-german-credit', '0.7562'),"
            "('statlog-heart', '0.842299'),"
            "('statlog-image', '0.976194'),"
            "('statlog-vehicle', '0.800673'),"
            "('synthetic-control', '0.990333'),"
            "('tic-tac-toe', '0.985385'),"
            "('vertebral-column-2clases', '0.849153'),"
            "('wine', '0.993281'),"
            "('zoo', '0.960385')",
        ]
        for command in commands:
            self.execute(command)

    def mirror(
        self, exp_type, dataset, normalize, standardize, accuracy, parameters
    ) -> None:
        """Create a record in the MySQL mirror database

        :param exp_type: experiment type ("crossval", "gridsearch" or
            "gridbest")
        :param dataset: dataset name
        :param normalize: whether the data was normalized
        :param standardize: whether the data was standardized
        :param accuracy: accuracy obtained on test
        :param parameters: hyperparameters used, as a JSON string
        """
        database = MySQL.get_connection()
        command_insert = (
            "replace into results (date, time, type, accuracy, "
            "dataset, classifier, norm, stand, parameters) values (%s, %s, "
            "%s, %s, %s, %s, %s, %s, %s)"
        )
        now = datetime.now()
        date = now.strftime("%Y-%m-%d")
        time = now.strftime("%H:%M:%S")
        values = (
            date,
            time,
            exp_type,
            accuracy,
            dataset,
            self._model,
            normalize,
            standardize,
            parameters,
        )
        cursor = database.cursor()
        cursor.execute(command_insert, values)
        database.commit()

    def execute(self, command: str) -> None:
        c = self._con.cursor()
        c.execute(command)
        c.close()
        self._con.commit()

    def header(
        self,
        title: str,
        lengths: List[int],
        fields: List[str],
        exclude_params,
    ) -> str:
        length = 148 if exclude_params else 170
        title += f" -- {self._model} in {self._host} --"
        output = "\n" + "*" * length + "\n"
        num = (length - len(title) - 2) // 2
        num2 = length - len(title) - 2 - 2 * num
        output += "*" + " " * num + title + " " * (num + num2) + "*\n"
        output += "*" * length + "\n\n"
        for field, length in zip(fields, lengths):
            output += ("{0:" + str(length) + "} ").format(field)
        output += "\n"
        for length in lengths:
            output += "=" * length + " "
        return output

    def check_result(self, test, reference) -> str:
        if test > reference:
            self._best += 1
            result = "+"
        elif test < reference:
            self._worse += 1
            result = "-"
        else:
            self._equal += 1
            result = "="
        return result

    def report_line(self, data, exclude_params):
        data = list(data)
        dataset = data.pop(0)
        reference = data.pop()
        _ = data.pop()  # remove the dataset name added by the inner join
        exec_date = data.pop(0)
        standardize = data.pop()
        normalize = data.pop()
        parameters = data.pop()
        result = self.check_result(data[6], reference)
        if exclude_params:
            parameters = ""
        output = ""
        index = 0
        for item in data:
            if index % 2:
                fact = f" (+/- {item * 2:6.2f}) "
            else:
                if index > 3:
                    fact = f"{item:7.4f}"
                else:
                    fact = f"{item:7.2f}"
            output += fact
            index += 1
        return (
            f"{dataset:30s} {exec_date:10s} {normalize} {standardize} {output}"
            f"{reference:1.5f} {result} {parameters}"
        )

    def report_header(self, title, exclude_params):
        lengths = [30, 19, 1, 1, 20, 20, 20, 20, 9, 21]
        fields = [
            "Dataset",
            "Date",
            "N",
            "S",
            "Fit Time (sec)",
            "Score Time (sec)",
            "Score on Train",
            "Score on Test",
            "Reference",
            "Parameters",
        ]
        if exclude_params:
            fields.pop()
            lengths.pop()
        return self.header(title, lengths, fields, exclude_params)

    def report_footer(self):
        print(
            TextColor.GREEN
            + f"{self._model} has better results {self._best:2d} times"
        )
        print(
            TextColor.RED
            + f"{self._model} has worse results {self._worse:2d} times"
        )
        print(
            TextColor.MAGENTA
            + f"{self._model} has equal results {self._equal:2d} times"
        )


class Outcomes(BD):
    def __init__(self, host: str, model):
        self._table = "outcomes"
        super().__init__(host=host, model=model)

    def store(self, dataset, normalize, standardize, parameters, results):
        outcomes = ["fit_time", "score_time", "train_score", "test_score"]
        data = ""
        for index in outcomes:
            data += ", " + str(results[index].mean()) + ", "
            data += str(results[index].std())
        command = (
            f"insert or replace into {self._table} ('dataset', 'parameters', "
            "'date', 'normalize', 'standardize'"
        )
        for field in outcomes:
            command += f",'{field}', '{field}_std'"
        command += f") values('{dataset}', '{parameters}', DateTime('now', "
        command += f"'localtime'), '{int(normalize)}', '{int(standardize)}'"
        command += data + ")"
        command = command.replace("nan", "null")
        self.execute(command)
        self.mirror(
            "crossval",
            dataset,
            normalize,
            standardize,
            float(results["test_score"].mean()),
            parameters,
        )

    def report(self, dataset, exclude_params):
        cursor = self._con.cursor()
        # the filter has to go after the join clause
        suffix = "" if dataset == "all" else f"WHERE o.dataset='{dataset}'"
        cursor.execute(
            f"SELECT * FROM {self._table} o inner join reference r on"
            f" o.dataset=r.dataset {suffix} order by dataset, date desc;"
        )
        records = cursor.fetchall()
        num_records = len(records)
        title = f"5 Folds Cross Validation: {dataset} - {num_records} records"
        print(
            TextColor.HEADER
            + self.report_header(title, exclude_params)
            + TextColor.ENDC
        )
        color = TextColor.LINE2
        for record in records:
            color = (
                TextColor.LINE1
                if color == TextColor.LINE2
                else TextColor.LINE2
            )
            print(
                color
                + self.report_line(record, exclude_params)
                + TextColor.ENDC
            )
        if records == []:
            print(
                TextColor.WARNING
                + " No records yet"
                + TextColor.ENDC
            )
        else:
            self.report_footer()
        cursor.close()


class Hyperparameters(BD):
    def __init__(self, host: str, model):
        self._table = "hyperparameters"
        super().__init__(host=host, model=model)

    def store(
        self,
        dataset,
        time,
        grid,
        parameters,
        normalize,
        standardize,
        grid_type,
    ):
        rosetta = [
            ("mean_fit_time", "fit_time"),
            ("std_fit_time", "fit_time_std"),
            ("mean_score_time", "score_time"),
            ("std_score_time", "score_time_std"),
            ("mean_test_score", "test_score"),
            ("std_test_score", "test_score_std"),
            ("mean_train_score", "train_score"),
            ("std_train_score", "train_score_std"),
            ("params", "parameters"),
        ]
        # load outcomes vector
        outcomes = {}
        for item, bd_item in rosetta:
            outcomes[bd_item] = grid.cv_results_[item][grid.best_index_]
        outcomes["parameters"] = parameters
        outcomes["normalize"] = int(normalize)
        outcomes["standardize"] = int(standardize)
        rosetta.append(("_", "normalize"))
        rosetta.append(("_", "standardize"))
        command = f"insert or replace into {self._table} ('dataset', 'date'"
        command_values = f"values ('{dataset}', DateTime('now', 'localtime')"
        for _, item in rosetta:
            command += f", '{item}'"
            command_values += (
                f", {outcomes[item]}"
                if item != "parameters"
                else f", '{outcomes[item]}'"
            )
        command += ") "
        command_values += ")"
        self.execute(command + command_values)
        accuracy = float(outcomes["test_score"])
        self.mirror(
            grid_type, dataset, normalize, standardize, accuracy, parameters
        )

    def report(self, dataset, exclude_params):
        cursor = self._con.cursor()
        cursor.execute(
            f"SELECT * FROM {self._table} h inner join reference r on "
            "r.dataset=h.dataset order by dataset, date desc;"
        )
        records = cursor.fetchall()
        num_records = len(records)
        title = f"Grid Searches done so far - {num_records} records"
        print(
            TextColor.HEADER
            + self.report_header(title, exclude_params)
            + TextColor.ENDC
        )
        color = TextColor.LINE2
        for record in records:
            color = (
                TextColor.LINE1
                if color == TextColor.LINE2
                else TextColor.LINE2
            )
            print(
                color
                + self.report_line(record, exclude_params)
                + TextColor.ENDC
            )
        cursor.close()
        self.report_footer()

    def get_params(self, dataset):
        cursor = self._con.cursor()
        cursor.execute(
            f"SELECT parameters, normalize, standardize FROM {self._table} "
            f"where dataset='{dataset}' order by test_score desc;"
        )
        record = cursor.fetchone()
        if record is None:
            raise ValueError(f"parameters not found for dataset {dataset}")
        return record["parameters"], record["normalize"], record["standardize"]
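As a rough usage sketch of the classes above (the host name and hyperparameters are placeholders; it assumes the ./data/results folder exists and that mirror() can reach MySQL through the .myconfig file read by Utils.MySQL), Outcomes.store expects a results dict shaped like the return value of sklearn's cross_validate:

import numpy as np
from experimentation.Database import Outcomes

outcomes = Outcomes(host="myhost", model="stree")  # "myhost" is a placeholder
# results mimics sklearn.model_selection.cross_validate: one value per fold
results = {
    "fit_time": np.array([0.10, 0.12, 0.11, 0.09, 0.10]),
    "score_time": np.array([0.01, 0.01, 0.02, 0.01, 0.01]),
    "train_score": np.array([0.99, 0.98, 0.99, 0.97, 0.98]),
    "test_score": np.array([0.95, 0.94, 0.96, 0.93, 0.95]),
}
outcomes.store(
    dataset="iris",
    normalize=False,
    standardize=False,
    parameters='{"C": 1.0}',
    results=results,
)
# print every stored record against the reference scores
outcomes.report(dataset="all", exclude_params=True)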
experimentation/Experiments.py (new file, 138 lines)
@@ -0,0 +1,138 @@
import json
import os
import time
import warnings

from sklearn.model_selection import GridSearchCV, cross_validate

from . import Models
from .Database import Hyperparameters, Outcomes
from .Sets import Datasets


class Experiment:
    def __init__(
        self, random_state: int, model: str, host: str, set_of_files: str
    ) -> None:
        self._random_state = random_state
        self._model_name = model
        self._set_of_files = set_of_files
        self._type = getattr(
            Models,
            f"Model{model[0].upper() + model[1:]}",
        )
        self._clf = self._type(random_state=self._random_state)
        self._host = host
        # used in gridsearch with ensembles to take the best hyperparams of
        # the base class or gridsearch these hyperparams as well
        self._base_params = "any"

    def set_base_params(self, base_params: str) -> None:
        self._base_params = base_params

    def cross_validation(self, dataset: str) -> None:
        hyperparams = Hyperparameters(host=self._host, model=self._model_name)
        try:
            parameters, normalize, standardize = hyperparams.get_params(
                dataset
            )
        except ValueError:
            print(f"*** {dataset} not trained")
            return
        datasets = Datasets(
            normalize=normalize,
            standardize=standardize,
            set_of_files=self._set_of_files,
        )
        parameters = json.loads(parameters)
        X, y = datasets.load(dataset)
        # re-create the classifier in case of consecutive experiments
        self._clf = self._type(random_state=self._random_state)
        model = self._clf.get_model().set_params(**parameters)
        self._num_warnings = 0
        warnings.warn = self._warn
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore")
            # Also affects subprocesses
            os.environ["PYTHONWARNINGS"] = "ignore"
            results = cross_validate(
                model, X, y, return_train_score=True, n_jobs=-1
            )
        outcomes = Outcomes(host=self._host, model=self._model_name)
        parameters = json.dumps(parameters, sort_keys=True)
        outcomes.store(dataset, normalize, standardize, parameters, results)
        if self._num_warnings > 0:
            print(f"{self._num_warnings} warnings have happened")

    def grid_search(
        self, dataset: str, normalize: bool, standardize: bool
    ) -> None:
        """If the model is an ensemble, first look up the best hyperparams
        found in gridsearch for the base model and override normalize and
        standardize with the stored values
        """
        hyperparams = Hyperparameters(host=self._host, model=self._model_name)
        model = self._clf.get_model()
        hyperparameters = self._clf.get_parameters()
        grid_type = "gridsearch"
        if (
            isinstance(self._clf, Models.Ensemble)
            and self._base_params == "best"
        ):
            hyperparams_base = Hyperparameters(
                host=self._host, model=self._clf._base_model.get_model_name()
            )
            try:
                # Get best hyperparameters obtained in gridsearch for base clf
                (
                    base_hyperparams,
                    normalize,
                    standardize,
                ) = hyperparams_base.get_params(dataset)
                # Merge hyperparameters with the ensemble ones
                base_hyperparams = json.loads(base_hyperparams)
                hyperparameters = self._clf.merge_parameters(base_hyperparams)
                grid_type = "gridbest"
            except ValueError:
                pass
        dt = Datasets(
            normalize=normalize,
            standardize=standardize,
            set_of_files=self._set_of_files,
        )
        X, y = dt.load(dataset)
        self._num_warnings = 0
        warnings.warn = self._warn
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore")
            # Also affects subprocesses
            os.environ["PYTHONWARNINGS"] = "ignore"
            grid_search = GridSearchCV(
                model,
                return_train_score=True,
                param_grid=hyperparameters,
                n_jobs=-1,
            )
            start_time = time.time()
            grid_search.fit(X, y)
            time_spent = time.time() - start_time
        parameters = json.dumps(
            self._clf.modified_parameters(
                grid_search.best_estimator_.get_params()
            ),
            sort_keys=True,
        )
        hyperparams.store(
            dataset,
            time_spent,
            grid_search,
            parameters,
            normalize,
            standardize,
            grid_type,
        )
        if self._num_warnings > 0:
            print(f"{self._num_warnings} warnings have happened")

    def _warn(self, *args, **kwargs) -> None:
        self._num_warnings += 1
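A minimal driving sketch for the class above, assuming the package is importable and the tanveer data files are in place; the host name is a placeholder. grid_search persists the best hyperparameters, which cross_validation then reads back:

from experimentation import Experiment

exp = Experiment(
    random_state=0, model="stree", host="myhost", set_of_files="tanveer"
)
# search and store the best hyperparameters for one dataset ...
exp.grid_search(dataset="iris", normalize=False, standardize=False)
# ... then run a cross validation with the stored parameters
exp.cross_validation(dataset="iris")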
experimentation/Models.py (new file, 188 lines)
@@ -0,0 +1,188 @@
from stree import Stree
from typing import Union, Optional, List
from abc import ABC
from sklearn.ensemble import (
    AdaBoostClassifier,  # type: ignore
    BaggingClassifier,  # type: ignore
)
from sklearn.ensemble import BaseEnsemble  # type: ignore
from sklearn.base import BaseEstimator  # type: ignore
from sklearn.svm import LinearSVC  # type: ignore
from sklearn.tree import DecisionTreeClassifier  # type: ignore
from odte import Odte


class ModelBase(ABC):
    def __init__(self, random_state: Optional[int]):
        self._random_state = random_state

    def get_model_name(self) -> str:
        return self._model_name

    def get_model(self) -> Union[BaseEnsemble, BaseEstimator]:
        return self._clf

    def get_parameters(self) -> dict:
        return self._param_grid

    def modified_parameters(self, optimum_parameters) -> dict:
        result = dict()
        # useful for ensembles
        excluded = ["base_estimator"]
        default_parameters = type(self._clf)().get_params()
        for key, data in optimum_parameters.items():
            if (
                key not in default_parameters
                or default_parameters[key] != data
            ) and key not in excluded:
                result[key] = data
        return result


class ModelStree(ModelBase):
    def __init__(self, random_state: Optional[int] = None) -> None:
        self._clf = Stree()
        super().__init__(random_state)
        self._model_name = "stree"
        C = [0.05, 0.2, 0.55, 7, 55, 1e4]
        max_iter = [1e4, 1e5, 1e6]
        gamma = [1e-1, 1, 1e1]
        max_features = [None, "auto"]
        split_criteria = ["impurity", "max_samples"]
        self._param_grid = [
            {
                "random_state": [self._random_state],
                "C": C,
                "max_iter": max_iter,
                "split_criteria": split_criteria,
                "max_features": max_features,
            },
            {
                "random_state": [self._random_state],
                "kernel": ["rbf"],
                "C": C,
                "gamma": gamma,
                "max_iter": max_iter,
                "split_criteria": split_criteria,
                "max_features": max_features,
            },
            {
                "random_state": [self._random_state],
                "kernel": ["poly"],
                "degree": [3, 5],
                "C": C,
                "gamma": gamma,
                "max_iter": max_iter,
                "split_criteria": split_criteria,
                "max_features": max_features,
            },
        ]


class ModelSVC(ModelBase):
    def __init__(self, random_state: Optional[int] = None) -> None:
        super().__init__(random_state)
        self._clf = LinearSVC()
        self._model_name = "svc"
        max_iter = [1e4, 1e5, 1e6]
        self._param_grid = [
            {
                "random_state": [self._random_state],
                "C": [1, 55, 1e4],
                "max_iter": max_iter,
            },
        ]


class ModelDecisionTree(ModelBase):
    def __init__(self, random_state: Optional[int] = None) -> None:
        super().__init__(random_state)
        self._clf = DecisionTreeClassifier()
        self._model_name = "dtree"
        self._param_grid = [
            {
                "random_state": [self._random_state],
                "max_features": [None, "log2", "auto"],
            },
        ]


class Ensemble(ModelBase):
    def __init__(
        self,
        random_state: Optional[int] = 0,
        base_model: Union[BaseEnsemble, BaseEstimator] = None,
    ) -> None:
        super().__init__(random_state)
        self._base_model = base_model

    def merge_parameters(self, params: dict) -> dict:
        result = self._parameters.copy()
        for key, value in params.items():
            result[f"base_estimator__{key}"] = (
                value if isinstance(value, list) else [value]
            )
        return result

    def get_parameters(self) -> List[dict]:
        result = []
        for base_group in self._base_model.get_parameters():
            result.append(self.merge_parameters(base_group))
        return result


class ModelAdaBoost(Ensemble):
    def __init__(
        self, random_state: int, base_model: BaseEstimator = ModelStree
    ):
        # Build base_model
        super().__init__(
            random_state, base_model=base_model(random_state=random_state)
        )
        self._clf = AdaBoostClassifier(
            base_estimator=self._base_model.get_model(),
            random_state=random_state,
        )
        self._model_name = f"Adaboost_{self._base_model.__class__.__name__}"

    def get_parameters(self) -> List[dict]:
        self._parameters = {"n_estimators": [50], "algorithm": ["SAMME"]}
        return super().get_parameters()


class ModelBagging(Ensemble):
    def __init__(
        self, random_state: int, base_model: BaseEstimator = ModelStree
    ) -> None:
        super().__init__(random_state, base_model=base_model(random_state))
        self._clf = BaggingClassifier(
            base_estimator=self._base_model.get_model(),
            random_state=random_state,
        )
        self._model_name = f"Bagging_{self._base_model.__class__.__name__}"

    def get_parameters(self) -> List[dict]:
        self._parameters = {
            "max_samples": [0.2, 0.4, 0.8, 1.0],
            "n_estimators": [50, 100],
            "max_features": [0.2, 0.6],
            "n_jobs": [-1],
        }
        return super().get_parameters()


class ModelOdte(Ensemble):
    def __init__(self, random_state: int, base_model=ModelStree) -> None:
        super().__init__(random_state, base_model=base_model(random_state))
        self._clf = Odte(
            random_state=random_state,
        )
        self._model_name = f"Odte_{self._base_model.__class__.__name__}"

    def get_parameters(self) -> List[dict]:
        self._parameters = {
            "max_samples": [0.2, 0.4, 0.8, 1.0],
            "n_estimators": [50, 100],
            "max_features": [0.2, 0.6, 1.0],
        }
        return super().get_parameters()
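To illustrate Ensemble.merge_parameters, a small sketch (assuming stree and odte are installed): the base model's grid keys come back prefixed with base_estimator__, which is sklearn's convention for addressing nested estimator parameters in a grid search:

from experimentation.Models import ModelAdaBoost

clf = ModelAdaBoost(random_state=0)
grid = clf.get_parameters()
# every group mixes ensemble keys with prefixed base-model keys
print(sorted(grid[0]))
# ['algorithm', 'base_estimator__C', 'base_estimator__max_features',
#  'base_estimator__max_iter', 'base_estimator__random_state',
#  'base_estimator__split_criteria', 'n_estimators']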
experimentation/Sets.py (new file, 293 lines)
@@ -0,0 +1,293 @@
from __future__ import annotations
import os
from typing import Tuple, List
import numpy as np  # type: ignore
import pandas as pd  # type: ignore
from .Utils import TextColor

tsets = List[Tuple[str, str, str, Tuple[int, int]]]
tdataset = Tuple[np.array, np.array]


class Diterator:
    def __init__(self, data: tsets):
        self._stack: tsets = data.copy()

    def __next__(self):
        if len(self._stack) == 0:
            raise StopIteration()
        return self._stack.pop(0)


class Dataset_Base:
    def __init__(self, normalize: bool, standardize: bool) -> None:
        self._data_folder = "data"
        self._normalize = normalize
        self._standardize = standardize

    def load(self, name: str) -> tdataset:
        """Datasets have to implement this

        :param name: dataset name
        :type name: str
        :return: X, y np.arrays
        :rtype: tdataset
        """
        pass

    def normalize(self, data: np.array) -> np.array:
        min_data = data.min()
        return (data - min_data) / (data.max() - min_data)

    def standardize(self, data: np.array) -> np.array:
        return (data - data.mean()) / data.std()

    def get_params(self) -> str:
        return f"normalize={self._normalize}, standardize={self._standardize}"

    def post_process(self, X: np.array, y: np.array) -> tdataset:
        if self._standardize and self._normalize:
            X = self.standardize(self.normalize(X))
        elif self._standardize:
            X = self.standardize(X)
        elif self._normalize:
            X = self.normalize(X)
        return X, y

    def __iter__(self) -> Diterator:
        return Diterator(self.data_sets)

    def __str__(self) -> str:
        out = ""
        for dataset in self.data_sets:
            out += f" {dataset[0]},"
        return out

    def report(self) -> None:
        print(
            TextColor.HEADER
            + "(#) Dataset                        Samples Feat. #Cl. y typ "
            + "X type  f_type"
        )
        print(
            "=== ============================= ======== ===== ==== ===== "
            "======= ======" + TextColor.ENDC
        )
        color = TextColor.LINE2
        for number, dataset in enumerate(self.data_sets):
            X, y = self.load(dataset[0])  # type: ignore
            samples, features = X.shape
            classes = len(np.unique(y))
            color = (
                TextColor.LINE1
                if color == TextColor.LINE2
                else TextColor.LINE2
            )
            print(
                color + f"#{number + 1:02d} {dataset[0]:30s} {samples:7,d} "
                f"{features:5d} {classes:4d} {y.dtype} {str(X.dtype):7s} "
                f"{dataset[1]}" + TextColor.ENDC
            )
            # Check the dataset is Ok
            # data type
            if str(X.dtype) != dataset[2]:
                raise ValueError(
                    f"dataset {dataset[0]} has wrong data type. "
                    f"It should have {dataset[2]} but has {X.dtype}"
                )
            # dimensions
            if X.shape != dataset[3]:
                raise ValueError(
                    f"dataset {dataset[0]} has wrong X shape. "
                    f"It should have {dataset[3]} but has {X.shape}"
                )
            if y.shape != (X.shape[0],):
                raise ValueError(
                    f"dataset {dataset[0]} has wrong y shape. "
                    f"It should have {(X.shape[0],)} but has {y.shape}"
                )

        print(
            TextColor.SUCCESS
            + "* All Data types and shapes are Ok."
            + TextColor.ENDC
        )


class Datasets_Tanveer(Dataset_Base):
    def __init__(
        self, normalize: bool = False, standardize: bool = False
    ) -> None:
        super().__init__(normalize, standardize)
        self._folder = os.path.join(self._data_folder, "tanveer")
        self.data_sets: tsets = [
            # (name, file type, samples data type, shape)
            ("balance-scale", "Rdat", "float64", (625, 4)),
            ("balloons", "Rdat", "float64", (16, 4)),
            ("breast-cancer-wisc-diag", "Rdat", "float64", (569, 30)),
            ("breast-cancer-wisc-prog", "Rdat", "float64", (198, 33)),
            ("breast-cancer-wisc", "Rdat", "float64", (699, 9)),
            ("breast-cancer", "Rdat", "float64", (286, 9)),
            ("cardiotocography-10clases", "Rdat", "float64", (2126, 21)),
            ("cardiotocography-3clases", "Rdat", "float64", (2126, 21)),
            ("conn-bench-sonar-mines-rocks", "Rdat", "float64", (208, 60)),
            ("cylinder-bands", "Rdat", "float64", (512, 35)),
            ("dermatology", "Rdat", "float64", (366, 34)),
            ("echocardiogram", "Rdat", "float64", (131, 10)),
            ("fertility", "Rdat", "float64", (100, 9)),
            ("haberman-survival", "Rdat", "float64", (306, 3)),
            ("heart-hungarian", "Rdat", "float64", (294, 12)),
            ("hepatitis", "Rdat", "float64", (155, 19)),
            ("ilpd-indian-liver", "Rdat", "float64", (583, 9)),
            ("ionosphere", "Rdat", "float64", (351, 33)),
            ("iris", "Rdat", "float64", (150, 4)),
            ("led-display", "Rdat", "float64", (1000, 7)),
            ("libras", "Rdat", "float64", (360, 90)),
            ("low-res-spect", "Rdat", "float64", (531, 100)),
            ("lymphography", "Rdat", "float64", (148, 18)),
            ("mammographic", "Rdat", "float64", (961, 5)),
            ("molec-biol-promoter", "Rdat", "float64", (106, 57)),
            ("musk-1", "Rdat", "float64", (476, 166)),
            ("oocytes_merluccius_nucleus_4d", "Rdat", "float64", (1022, 41)),
            ("oocytes_merluccius_states_2f", "Rdat", "float64", (1022, 25)),
            ("oocytes_trisopterus_nucleus_2f", "Rdat", "float64", (912, 25)),
            ("oocytes_trisopterus_states_5b", "Rdat", "float64", (912, 32)),
            ("parkinsons", "Rdat", "float64", (195, 22)),
            ("pima", "Rdat", "float64", (768, 8)),
            ("pittsburg-bridges-MATERIAL", "Rdat", "float64", (106, 7)),
            ("pittsburg-bridges-REL-L", "Rdat", "float64", (103, 7)),
            ("pittsburg-bridges-SPAN", "Rdat", "float64", (92, 7)),
            ("pittsburg-bridges-T-OR-D", "Rdat", "float64", (102, 7)),
            ("planning", "Rdat", "float64", (182, 12)),
            ("post-operative", "Rdat", "float64", (90, 8)),
            ("seeds", "Rdat", "float64", (210, 7)),
            ("statlog-australian-credit", "Rdat", "float64", (690, 14)),
            ("statlog-german-credit", "Rdat", "float64", (1000, 24)),
            ("statlog-heart", "Rdat", "float64", (270, 13)),
            ("statlog-image", "Rdat", "float64", (2310, 18)),
            ("statlog-vehicle", "Rdat", "float64", (846, 18)),
            ("synthetic-control", "Rdat", "float64", (600, 60)),
            ("tic-tac-toe", "Rdat", "float64", (958, 9)),
            ("vertebral-column-2clases", "Rdat", "float64", (310, 6)),
            ("wine", "Rdat", "float64", (178, 13)),
            ("zoo", "Rdat", "float64", (101, 16)),
        ]

    def load(self, name: str) -> tdataset:
        data = pd.read_csv(
            os.path.join(self._folder, name, f"{name}_R.dat"),
            sep="\t",
            index_col=0,
        )
        X = data.drop("clase", axis=1).to_numpy()
        y = data["clase"].to_numpy()
        return X, y


class Datasets_AAAI(Dataset_Base):
    def __init__(
        self, normalize: bool = False, standardize: bool = False
    ) -> None:
        super().__init__(normalize, standardize)
        self._folder: str = os.path.join(self._data_folder, "aaai")
        self.data_sets: tsets = [
            # (name, file type, samples data type, shape)
            ("breast", "csv", "int16", (683, 9)),
            ("cardiotoc", "csv", "int16", (2126, 41)),
            ("cod-rna", "sparse", "float16", (331152, 8)),
            ("connect4", "sparse", "int16", (67557, 126)),
            ("covtype", "npz", "int16", (581012, 54)),
            ("diabetes", "csv", "float16", (768, 8)),
            ("dna", "csv", "float16", (3186, 180)),
            ("fourclass", "sparse", "int16", (862, 2)),
            ("glass", "csv", "float16", (214, 9)),
            ("heart", "csv", "float16", (270, 13)),
            ("ijcnn1", "sparse", "float16", (141691, 22)),
            ("iris", "csv", "float16", (150, 4)),
            ("letter", "npz", "int16", (20000, 16)),
            ("mnist", "npy", "int16", (70000, 784)),
            ("pendigits", "npy", "int16", (10992, 16)),
            ("protein", "sparse", "float16", (24387, 357)),
            ("satimage", "npy", "int16", (6435, 36)),
            ("segment", "sparse", "float16", (2310, 19)),
            ("shuttle", "npy", "int16", (58000, 9)),
            ("usps", "npz", "float16", (9298, 256)),
            ("vehicle", "sparse", "float16", (846, 18)),
            ("wine", "csv", "float16", (178, 13)),
        ]

    def load_dataset(
        self, name: str, file_type: str, data_type: str
    ) -> tdataset:
        return getattr(self, f"load_{file_type}_dataset")(name, data_type)

    def load(self, name: str) -> tdataset:
        for dataset in self.data_sets:
            if name == dataset[0]:
                return self.post_process(
                    *self.load_dataset(*dataset[:3])  # type: ignore
                )
        raise ValueError(
            f"{name} is not a valid dataset, has to be one of {str(self)}"
        )

    def load_csv_dataset(self, name: str, dtype: str) -> tdataset:
        data = np.genfromtxt(
            os.path.join(self._folder, f"{name}.csv"),
            delimiter=",",
            dtype=dtype,
        )
        features = data.shape[1]
        return data[:, : features - 1], data[:, -1].astype(np.int16)

    def load_npy_dataset(self, name: str, _: str) -> tdataset:
        data = np.load(os.path.join(self._folder, f"{name}.npy"))
        features = data.shape[1]
        return data[:, : features - 1], data[:, -1].astype(np.int16)

    def load_npz_dataset(self, name, _):
        data = np.load(os.path.join(self._folder, f"{name}.npz"))
        return data["arr_0"], data["arr_1"]

    def load_sparse_dataset(self, name: str, _: str) -> tdataset:
        X, y = np.load(
            os.path.join(self._folder, f"{name}.npy"), allow_pickle=True
        )
        if str(X.dtype) == "float16":
            # can't do todense with np.float16
            XX = X.astype(np.float64).todense().astype(np.float16)
        else:
            XX = X.todense()
        return XX, y


class Datasets:
    def __init__(
        self,
        normalize: bool = False,
        standardize: bool = False,
        set_of_files: str = "aaai",
    ) -> None:
        self._model = (
            Datasets_AAAI(normalize, standardize)
            if set_of_files == "aaai"
            else Datasets_Tanveer(normalize, standardize)
        )

    def load(self, name: str) -> tdataset:
        return self._model.load(name)

    def post_process(self, X: np.array, y: np.array) -> tdataset:
        return self._model.post_process(X, y)

    def report(self) -> None:
        return self._model.report()

    def get_params(self) -> str:
        return self._model.get_params()

    def __iter__(self) -> Diterator:
        return Diterator(self._model.data_sets)

    def __str__(self) -> str:
        return self._model.__str__()
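A short sketch of the Datasets facade above, assuming the data/tanveer files exist; iteration yields the (name, file type, data type, shape) tuples and report() validates every dataset against its declared dtype and shape:

from experimentation.Sets import Datasets

dt = Datasets(set_of_files="tanveer")
X, y = dt.load("wine")  # numpy arrays from data/tanveer/wine/wine_R.dat
for name, file_type, dtype, shape in dt:
    print(f"{name}: {shape}")
dt.report()  # colored summary table plus dtype/shape checks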
experimentation/Utils.py (new file, 32 lines)
@@ -0,0 +1,32 @@
import os
import mysql.connector


class TextColor:
    BLUE = "\033[94m"
    CYAN = "\033[96m"
    GREEN = "\033[92m"
    MAGENTA = "\033[95m"
    YELLOW = "\033[93m"
    RED = "\033[91m"
    HEADER = MAGENTA
    LINE1 = BLUE
    LINE2 = CYAN
    SUCCESS = GREEN
    WARNING = YELLOW
    FAIL = RED
    ENDC = "\033[0m"
    BOLD = "\033[1m"
    UNDERLINE = "\033[4m"


class MySQL:
    @staticmethod
    def get_connection():
        config = dict()
        dir_path = os.path.dirname(os.path.realpath(__file__))
        with open(os.path.join(dir_path, ".myconfig")) as f:
            for line in f.read().splitlines():
                key, value = line.split("=")
                config[key] = value
        return mysql.connector.connect(**config)
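MySQL.get_connection simply forwards each key=value line of .myconfig to mysql.connector.connect, so the file is expected to hold standard connector arguments, one per line; the values below are placeholders:

user=experimenter
password=secret
host=db.example.com
database=experiments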
experimentation/__init__.py (new file, 15 lines)
@@ -0,0 +1,15 @@
from .Sets import Datasets
from .Database import Outcomes, Hyperparameters
from .Experiments import Experiment
from .Models import ModelStree, ModelBagging, ModelAdaBoost, ModelOdte

__all__ = [
    "Datasets",
    "Outcomes",
    "Hyperparameters",
    "Experiment",
    "ModelStree",
    "ModelAdaBoost",
    "ModelBagging",
    "ModelOdte",
]