Initial Commit

2020-11-20 11:23:40 +01:00
commit 5611e5bc01
2914 changed files with 2625178 additions and 0 deletions

experimentation/Database.py Normal file (396 lines)

@@ -0,0 +1,396 @@
import os
import sqlite3
from datetime import datetime
from abc import ABC
from typing import List
from .Models import ModelBase
from .Utils import TextColor, MySQL
class BD(ABC):
_folder = "./data/results"
_con = None
def __init__(self, host: str, model: ModelBase) -> None:
self._model = model
self._host = host
self._database = os.path.join(
self._folder, f"{host}_{model}_experiments.sqlite3"
)
self._con = sqlite3.connect(self._database)
# return dict-like rows from SELECT queries
self._con.row_factory = sqlite3.Row
self._check_build()
# accumulators used in reports
self._best = self._worse = self._equal = 0
def _check_build(self) -> None:
"""Check if the tables are created and create them if they don't"""
commands = [
'create table if not exists "outcomes" ("dataset" varchar NOT NULL'
',"date" datetime NOT NULL DEFAULT NULL,"fit_time" num NOT NULL '
'DEFAULT NULL, "fit_time_std" num, "score_time" num NOT NULL '
'DEFAULT NULL, "score_time_std" num, "train_score" num NOT NULL '
'DEFAULT NULL, "train_score_std" num, "test_score" num NOT NULL '
'DEFAULT NULL, "test_score_std" num, "parameters" text DEFAULT '
'NULL, "normalize" int NOT NULL DEFAULT 0, "standardize" int NOT '
"NULL DEFAULT 0, PRIMARY KEY (dataset, date));",
'create table if not exists "hyperparameters" ("dataset" varchar '
'NOT NULL,"date" datetime NOT NULL DEFAULT NULL,"fit_time" num NOT'
' NULL DEFAULT NULL, "fit_time_std" num, "score_time" num NOT NULL'
' DEFAULT NULL, "score_time_std" num, "train_score" num NOT NULL '
'DEFAULT NULL, "train_score_std" num, "test_score" num NOT NULL '
'DEFAULT NULL, "test_score_std" num, "parameters" text DEFAULT '
'NULL, "normalize" int NOT NULL DEFAULT 0, "standardize" int NOT '
"NULL DEFAULT 0, PRIMARY KEY (dataset, normalize, standardize));",
'create table if not exists "reference" ("dataset" varchar NOT '
'NULL,"score" num NOT NULL, PRIMARY KEY (dataset));',
"INSERT or replace INTO reference (dataset, score) VALUES "
"('balance-scale', '0.904628'),"
"('balloons', '0.6625'),"
"('breast-cancer-wisc-diag', '0.974345'),"
"('breast-cancer-wisc-prog', '0.79934'),"
"('breast-cancer-wisc', '0.970256'),"
"('breast-cancer', '0.73824'),"
"('cardiotocography-10clases', '0.827761'),"
"('cardiotocography-3clases', '0.920134'),"
"('conn-bench-sonar-mines-rocks', '0.833654'),"
"('cylinder-bands', '0.769141'),"
"('dermatology', '0.973278'),"
"('echocardiogram', '0.848527'),"
"('fertility', '0.884'),"
"('haberman-survival', '0.739254'),"
"('heart-hungarian', '0.820475'),"
"('hepatitis', '0.823203'),"
"('ilpd-indian-liver', '0.715028'),"
"('ionosphere', '0.944215'),"
"('iris', '0.978656'),"
"('led-display', '0.7102'),"
"('libras', '0.891111'),"
"('low-res-spect', '0.90282'),"
"('lymphography', '0.855405'),"
"('mammographic', '0.827472'),"
"('molec-biol-promoter', '0.818269'),"
"('musk-1', '0.876471'),"
"('oocytes_merluccius_nucleus_4d', '0.839963'),"
"('oocytes_merluccius_states_2f', '0.929963'),"
"('oocytes_trisopterus_nucleus_2f', '0.833333'),"
"('oocytes_trisopterus_states_5b', '0.931579'),"
"('parkinsons', '0.920221'),"
"('pima', '0.767188'),"
"('pittsburg-bridges-MATERIAL', '0.864286'),"
"('pittsburg-bridges-REL-L', '0.695929'),"
"('pittsburg-bridges-SPAN', '0.68913'),"
"('pittsburg-bridges-T-OR-D', '0.87437'),"
"('planning', '0.725579'),"
"('post-operative', '0.711742'),"
"('seeds', '0.956303'),"
"('statlog-australian-credit', '0.678281'),"
"('statlog-german-credit', '0.7562'),"
"('statlog-heart', '0.842299'),"
"('statlog-image', '0.976194'),"
"('statlog-vehicle', '0.800673'),"
"('synthetic-control', '0.990333'),"
"('tic-tac-toe', '0.985385'),"
"('vertebral-column-2clases', '0.849153'),"
"('wine', '0.993281'),"
"('zoo', '0.960385')",
]
for command in commands:
self.execute(command)
def mirror(
self, exp_type, dataset, normalize, standardize, accuracy, parameters
) -> None:
"""Create a record in MySQL database
:param record: data to insert in database
:type record: dict
"""
database = MySQL.get_connection()
command_insert = (
"replace into results (date, time, type, accuracy, "
"dataset, classifier, norm, stand, parameters) values (%s, %s, "
"%s, %s, %s, %s, %s, %s, %s)"
)
now = datetime.now()
date = now.strftime("%Y-%m-%d")
time = now.strftime("%H:%M:%S")
values = (
date,
time,
exp_type,
accuracy,
dataset,
self._model,
normalize,
standardize,
parameters,
)
cursor = database.cursor()
cursor.execute(command_insert, values)
database.commit()
def execute(self, command: str) -> None:
c = self._con.cursor()
c.execute(command)
c.close()
self._con.commit()
def header(
self,
title: str,
lengths: List[int],
fields: List[str],
exclude_params,
) -> str:
length = 148 if exclude_params else 170
title += f" -- {self._model} in {self._host} --"
output = "\n" + "*" * length + "\n"
num = (length - len(title) - 2) // 2
num2 = length - len(title) - 2 - 2 * num
output += "*" + " " * num + title + " " * (num + num2) + "*\n"
output += "*" * length + "\n\n"
for field, length in zip(fields, lengths):
output += ("{0:" + str(length) + "} ").format(field)
output += "\n"
for length in lengths:
output += "=" * length + " "
return output
def check_result(self, test, reference) -> str:
if test > reference:
self._best += 1
result = "+"
elif test < reference:
self._worse += 1
result = "-"
else:
self._equal += 1
result = "="
return result
def report_line(self, data, exclude_params):
data = list(data)
dataset = data.pop(0)
reference = data.pop()
_ = data.pop()  # drop the dataset name duplicated by the inner join
exec_date = data.pop(0)
standardize = data.pop()
normalize = data.pop()
parameters = data.pop()
result = self.check_result(data[6], reference)
if exclude_params:
parameters = ""
output = ""
index = 0
for item in data:
if index % 2:
fact = f" (+/- {item * 2:6.2f}) "
else:
if index > 3:
fact = f"{item:7.4f}"
else:
fact = f"{item:7.2f}"
output += fact
index += 1
return (
f"{dataset:30s} {exec_date:10s} {normalize} {standardize} {output}"
f"{reference:1.5f} {result} {parameters}"
)
def report_header(self, title, exclude_params):
lengths = [30, 19, 1, 1, 20, 20, 20, 20, 9, 21]
fields = [
"Dataset",
"Date",
"N",
"S",
"Fit Time (sec)",
"Score Time (sec)",
"Score on Train",
"Score on Test",
"Reference",
"Parameters",
]
if exclude_params:
fields.pop()
lengths.pop()
return self.header(title, lengths, fields, exclude_params)
def report_footer(self):
print(
TextColor.GREEN
+ f"{self._model} has better results {self._best:2d} times"
)
print(
TextColor.RED
+ f"{self._model} has worse results {self._worse:2d} times"
)
print(
TextColor.MAGENTA
+ f"{self._model} has equal results {self._equal:2d} times"
)
class Outcomes(BD):
def __init__(self, host: str, model):
self._table = "outcomes"
super().__init__(host=host, model=model)
def store(self, dataset, normalize, standardize, parameters, results):
outcomes = ["fit_time", "score_time", "train_score", "test_score"]
data = ""
for index in outcomes:
data += ", " + str(results[index].mean()) + ", "
data += str(results[index].std())
command = (
f"insert or replace into {self._table} ('dataset', 'parameters', "
"'date', 'normalize', 'standardize'"
)
for field in outcomes:
command += f",'{field}', '{field}_std'"
command += f") values('{dataset}', '{parameters}', DateTime('now', "
command += f"'localtime'), '{int(normalize)}', '{int(standardize)}'"
command += data + ")"
command = command.replace("nan", "null")
self.execute(command)
self.mirror(
"crossval",
dataset,
normalize,
standardize,
float(results["test_score"].mean()),
parameters,
)
def report(self, dataset, exclude_params):
cursor = self._con.cursor()
suffix = "" if dataset == "all" else f"WHERE dataset='{dataset}'"
cursor.execute(
f"SELECT * FROM {self._table} o {suffix} inner join reference r on"
" o.dataset=r.dataset order by dataset, date desc;"
)
records = cursor.fetchall()
num_records = len(records)
title = f"5 Folds Cross Validation: {dataset} - {num_records} records"
print(
TextColor.HEADER
+ self.report_header(title, exclude_params)
+ TextColor.ENDC
)
color = TextColor.LINE2
for record in records:
color = (
TextColor.LINE1
if color == TextColor.LINE2
else TextColor.LINE2
)
print(
color
+ self.report_line(record, exclude_params)
+ TextColor.ENDC
)
if not records:
print(
TextColor.WARNING
+ " No records yet"
+ TextColor.ENDC
)
else:
self.report_footer()
cursor.close()
class Hyperparameters(BD):
def __init__(self, host: str, model):
self._table = "hyperparameters"
super().__init__(host=host, model=model)
def store(
self,
dataset,
time,
grid,
parameters,
normalize,
standardize,
grid_type,
):
rosetta = [
("mean_fit_time", "fit_time"),
("std_fit_time", "fit_time_std"),
("mean_score_time", "score_time"),
("std_score_time", "score_time_std"),
("mean_test_score", "test_score"),
("std_test_score", "test_score_std"),
("mean_train_score", "train_score"),
("std_train_score", "train_score_std"),
("params", "parameters"),
]
# load outcomes vector
outcomes = {}
for item, bd_item in rosetta:
outcomes[bd_item] = grid.cv_results_[item][grid.best_index_]
outcomes["parameters"] = parameters
outcomes["normalize"] = int(normalize)
outcomes["standardize"] = int(standardize)
rosetta.append(("_", "normalize"))
rosetta.append(("_", "standardize"))
command = f"insert or replace into {self._table} ('dataset', 'date'"
command_values = f"values ('{dataset}', DateTime('now', 'localtime')"
for _, item in rosetta:
command += f", '{item}'"
command_values += (
f", {outcomes[item]}"
if item != "parameters"
else f", '{outcomes[item]}'"
)
command += ") "
command_values += ")"
self.execute(command + command_values)
accuracy = float(outcomes["test_score"])
self.mirror(
grid_type, dataset, normalize, standardize, accuracy, parameters
)
def report(self, dataset, exclude_params):
cursor = self._con.cursor()
cursor.execute(
f"SELECT * FROM {self._table} h inner join reference r on "
"r.dataset=h.dataset order by dataset, date desc;"
)
records = cursor.fetchall()
num_records = len(records)
title = f"Grid Searches done so far - {num_records} records"
print(
TextColor.HEADER
+ self.report_header(title, exclude_params)
+ TextColor.ENDC
)
color = TextColor.LINE2
for record in records:
color = (
TextColor.LINE1
if color == TextColor.LINE2
else TextColor.LINE2
)
print(
color
+ self.report_line(record, exclude_params)
+ TextColor.ENDC
)
cursor.close()
self.report_footer()
def get_params(self, dataset):
cursor = self._con.cursor()
cursor.execute(
f"SELECT parameters, normalize, standardize FROM {self._table} "
f"where dataset='{dataset}' order by test_score desc;"
)
record = cursor.fetchone()
if record is None:
raise ValueError(f"parameters not found for dataset {dataset}")
return record["parameters"], record["normalize"], record["standardize"]

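A minimal usage sketch for the two persistence classes above, assuming the ./data/results folder and a valid .myconfig for the MySQL mirror already exist; "localhost", "stree" and "iris" are illustrative values:

from experimentation.Database import Hyperparameters

# Look up the best stored hyperparameters for a dataset; raises
# ValueError if the dataset has not been grid-searched yet
hyperparams = Hyperparameters(host="localhost", model="stree")
parameters, normalize, standardize = hyperparams.get_params("iris")

# Print the accumulated grid-search report without the parameters column
hyperparams.report(dataset="iris", exclude_params=True)
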
experimentation/Experiments.py Normal file (138 lines)

@@ -0,0 +1,138 @@
import json
import os
import time
import warnings
from sklearn.model_selection import GridSearchCV, cross_validate
from . import Models
from .Database import Hyperparameters, Outcomes
from .Sets import Datasets
class Experiment:
def __init__(
self, random_state: int, model: str, host: str, set_of_files: str
) -> None:
self._random_state = random_state
self._model_name = model
self._set_of_files = set_of_files
self._type = getattr(
Models,
f"Model{model[0].upper() + model[1:]}",
)
self._clf = self._type(random_state=self._random_state)
self._host = host
# used in grid search with ensembles: either reuse the best
# hyperparams found for the base classifier or grid-search those
# hyperparams as well
self._base_params = "any"
def set_base_params(self, base_params: str) -> None:
self._base_params = base_params
def cross_validation(self, dataset: str) -> None:
hyperparams = Hyperparameters(host=self._host, model=self._model_name)
try:
parameters, normalize, standardize = hyperparams.get_params(
dataset
)
except ValueError:
print(f"*** {dataset} not trained")
return
datasets = Datasets(
normalize=normalize,
standardize=standardize,
set_of_files=self._set_of_files,
)
parameters = json.loads(parameters)
X, y = datasets.load(dataset)
# re-create the classifier in case consecutive experiments reuse this object
self._clf = self._type(random_state=self._random_state)
model = self._clf.get_model().set_params(**parameters)
self._num_warnings = 0
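# count warnings by replacing warnings.warn with our own handler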
warnings.warn = self._warn
with warnings.catch_warnings():
warnings.filterwarnings("ignore")
# Also silences warnings in subprocesses spawned by n_jobs=-1
os.environ["PYTHONWARNINGS"] = "ignore"
results = cross_validate(
model, X, y, return_train_score=True, n_jobs=-1
)
outcomes = Outcomes(host=self._host, model=self._model_name)
parameters = json.dumps(parameters, sort_keys=True)
outcomes.store(dataset, normalize, standardize, parameters, results)
if self._num_warnings > 0:
print(f"{self._num_warnings} warnings have happend")
def grid_search(
self, dataset: str, normalize: bool, standardize: bool
) -> None:
"""First of all if the modle is an ensemble search for the best
hyperparams found in gridsearch for base model and overrides
normalize and standardize
"""
hyperparams = Hyperparameters(host=self._host, model=self._model_name)
model = self._clf.get_model()
hyperparameters = self._clf.get_parameters()
grid_type = "gridsearch"
if (
isinstance(self._clf, Models.Ensemble)
and self._base_params == "best"
):
hyperparams_base = Hyperparameters(
host=self._host, model=self._clf._base_model.get_model_name()
)
try:
# Get best hyperparameters obtained in gridsearch for base clf
(
base_hyperparams,
normalize,
standardize,
) = hyperparams_base.get_params(dataset)
# Merge hyperparameters with the ensemble ones
base_hyperparams = json.loads(base_hyperparams)
hyperparameters = self._clf.merge_parameters(base_hyperparams)
grid_type = "gridbest"
except ValueError:
pass
dt = Datasets(
normalize=normalize,
standardize=standardize,
set_of_files=self._set_of_files,
)
X, y = dt.load(dataset)
self._num_warnings = 0
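# count warnings by replacing warnings.warn with our own handler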
warnings.warn = self._warn
with warnings.catch_warnings():
warnings.filterwarnings("ignore")
# Also silences warnings in subprocesses spawned by n_jobs=-1
os.environ["PYTHONWARNINGS"] = "ignore"
grid_search = GridSearchCV(
model,
return_train_score=True,
param_grid=hyperparameters,
n_jobs=-1,
)
start_time = time.time()
grid_search.fit(X, y)
time_spent = time.time() - start_time
parameters = json.dumps(
self._clf.modified_parameters(
grid_search.best_estimator_.get_params()
),
sort_keys=True,
)
hyperparams.store(
dataset,
time_spent,
grid_search,
parameters,
normalize,
standardize,
grid_type,
)
if self._num_warnings > 0:
print(f"{self._num_warnings} warnings have happend")
def _warn(self, *args, **kwargs) -> None:
self._num_warnings += 1

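A sketch of the intended workflow, assuming this file lives at experimentation/Experiments.py as the package __init__ suggests; the random_state and dataset names are illustrative:

from experimentation.Experiments import Experiment

exp = Experiment(
    random_state=57, model="stree", host="localhost", set_of_files="aaai"
)
# Search the hyperparameter grid and store the best combination ...
exp.grid_search(dataset="iris", normalize=False, standardize=True)
# ... then cross-validate using the stored best hyperparameters
exp.cross_validation(dataset="iris")
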
experimentation/Models.py Normal file (188 lines)

@@ -0,0 +1,188 @@
from stree import Stree
from typing import Union, Optional, List
from abc import ABC
from sklearn.ensemble import (
AdaBoostClassifier, # type: ignore
BaggingClassifier, # type: ignore
)
from sklearn.ensemble import BaseEnsemble # type: ignore
from sklearn.base import BaseEstimator # type: ignore
from sklearn.svm import LinearSVC # type: ignore
from sklearn.tree import DecisionTreeClassifier # type: ignore
from odte import Odte
class ModelBase(ABC):
def __init__(self, random_state: Optional[int]):
self._random_state = random_state
def get_model_name(self) -> str:
return self._model_name
def get_model(self) -> Union[BaseEnsemble, BaseEstimator]:
return self._clf
def get_parameters(self) -> dict:
return self._param_grid
def modified_parameters(self, optimum_parameters) -> dict:
result = dict()
# useful for ensembles
excluded = ["base_estimator"]
default_parameters = type(self._clf)().get_params()
for key, data in optimum_parameters.items():
if (
key not in default_parameters
or default_parameters[key] != data
) and key not in excluded:
result[key] = data
return result
class ModelStree(ModelBase):
def __init__(self, random_state: Optional[int] = None) -> None:
self._clf = Stree()
super().__init__(random_state)
self._model_name = "stree"
C = [0.05, 0.2, 0.55, 7, 55, 1e4]
max_iter = [1e4, 1e5, 1e6]
gamma = [1e-1, 1, 1e1]
max_features = [None, "auto"]
split_criteria = ["impurity", "max_samples"]
self._param_grid = [
{
"random_state": [self._random_state],
"C": C,
"max_iter": max_iter,
"split_criteria": split_criteria,
"max_features": max_features,
},
{
"random_state": [self._random_state],
"kernel": ["rbf"],
"C": C,
"gamma": gamma,
"max_iter": max_iter,
"split_criteria": split_criteria,
"max_features": max_features,
},
{
"random_state": [self._random_state],
"kernel": ["poly"],
"degree": [3, 5],
"C": C,
"gamma": gamma,
"max_iter": max_iter,
"split_criteria": split_criteria,
"max_features": max_features,
},
]
class ModelSVC(ModelBase):
def __init__(self, random_state: Optional[int] = None) -> None:
super().__init__(random_state)
self._clf = LinearSVC()
self._model_name = "svc"
max_iter = [1e4, 1e5, 1e6]
self._param_grid = [
{
"random_state": [self._random_state],
"C": [1, 55, 1e4],
"max_iter": max_iter,
},
]
class ModelDecisionTree(ModelBase):
def __init__(self, random_state: Optional[int] = None) -> None:
super().__init__(random_state)
self._clf = DecisionTreeClassifier()
self._model_name = "dtree"
self._param_grid = [
{
"random_state": [self._random_state],
"max_features": [None, "log2", "auto"],
},
]
class Ensemble(ModelBase):
def __init__(
self,
random_state: Optional[int] = 0,
base_model: Optional[Union[BaseEnsemble, BaseEstimator]] = None,
) -> None:
super().__init__(random_state)
self._base_model = base_model
def merge_parameters(self, params: dict) -> dict:
result = self._parameters.copy()
for key, value in params.items():
result[f"base_estimator__{key}"] = (
value if isinstance(value, list) else [value]
)
return result
def get_parameters(self) -> List[dict]:
result = []
for base_group in self._base_model.get_parameters():
result.append(self.merge_parameters(base_group))
return result
class ModelAdaBoost(Ensemble):
def __init__(
self, random_state: int, base_model: BaseEstimator = ModelStree
):
# Build base_model
super().__init__(
random_state, base_model=base_model(random_state=random_state)
)
self._clf = AdaBoostClassifier(
base_estimator=self._base_model.get_model(),
random_state=random_state,
)
self._model_name = f"Adaboost_{self._base_model.__class__.__name__}"
def get_parameters(self) -> List[dict]:
self._parameters = {"n_estimators": [50], "algorithm": ["SAMME"]}
return super().get_parameters()
class ModelBagging(Ensemble):
def __init__(
self, random_state: int, base_model: BaseEstimator = ModelStree
) -> None:
super().__init__(random_state, base_model=base_model(random_state))
self._clf = BaggingClassifier(
base_estimator=self._base_model.get_model(),
random_state=random_state,
)
self._model_name = f"Bagging_{self._base_model.__class__.__name__}"
def get_parameters(self) -> List[dict]:
self._parameters = {
"max_samples": [0.2, 0.4, 0.8, 1.0],
"n_estimators": [50, 100],
"max_features": [0.2, 0.6],
"n_jobs": [-1],
}
return super().get_parameters()
class ModelOdte(Ensemble):
def __init__(self, random_state: int, base_model=ModelStree) -> None:
super().__init__(random_state, base_model=base_model(random_state))
self._clf = Odte(
random_state=random_state,
)
self._model_name = f"Odte_{self._base_model.__class__.__name__}"
def get_parameters(self) -> List[dict]:
self._parameters = {
"max_samples": [0.2, 0.4, 0.8, 1.0],
"n_estimators": [50, 100],
"max_features": [0.2, 0.6, 1.0],
}
return super().get_parameters()

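A short sketch of how the ensemble grids are assembled: merge_parameters prefixes every base-model key with base_estimator__ so that GridSearchCV routes it to the wrapped classifier (random_state is illustrative):

from experimentation.Models import ModelAdaBoost, ModelStree

ensemble = ModelAdaBoost(random_state=57, base_model=ModelStree)
grid = ensemble.get_parameters()
# Every group of the stree grid is merged with the AdaBoost parameters
print(grid[0]["n_estimators"])       # [50]
print(grid[0]["base_estimator__C"])  # [0.05, 0.2, 0.55, 7, 55, 10000.0]
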
experimentation/Sets.py Normal file (293 lines)

@@ -0,0 +1,293 @@
from __future__ import annotations
import os
from typing import Tuple, List
import numpy as np # type: ignore
import pandas as pd # type: ignore
from .Utils import TextColor
tsets = List[Tuple[str, str, str, Tuple[int, int]]]
tdataset = Tuple[np.array, np.array]
class Diterator:
def __init__(self, data: tsets):
self._stack: tsets = data.copy()
def __next__(self):
if len(self._stack) == 0:
raise StopIteration()
return self._stack.pop(0)
class Dataset_Base:
def __init__(self, normalize: bool, standardize: bool) -> None:
self._data_folder = "data"
self._normalize = normalize
self._standardize = standardize
def load(self, name: str) -> tdataset:
"""Datasets have to implement this
:param name: dataset name
:type name: str
:return: X, y np.arrays
:rtype: tdataset
"""
pass
def normalize(self, data: np.array) -> np.array:
min_data = data.min()
return (data - min_data) / (data.max() - min_data)
def standardize(self, data: np.array) -> np.array:
return (data - data.mean()) / data.std()
def get_params(self) -> str:
return f"normalize={self._normalize}, standardize={self._standardize}"
def post_process(self, X: np.array, y: np.array) -> tdataset:
if self._standardize and self._normalize:
X = self.standardize(self.normalize(X))
elif self._standardize:
X = self.standardize(X)
elif self._normalize:
X = self.normalize(X)
return X, y
def __iter__(self) -> Diterator:
return Diterator(self.data_sets)
def __str__(self) -> str:
out = ""
for dataset in self.data_sets:
out += f" {dataset[0]},"
return out
def report(self) -> None:
print(
TextColor.HEADER
+ "(#) Dataset Samples Feat. #Cl. y typ "
+ "X type f_type"
)
print(
"=== ============================= ======== ===== ==== ===== "
"======= ======" + TextColor.ENDC
)
color = TextColor.LINE2
for number, dataset in enumerate(self.data_sets):
X, y = self.load(dataset[0]) # type: ignore
samples, features = X.shape
classes = len(np.unique(y))
color = (
TextColor.LINE1
if color == TextColor.LINE2
else TextColor.LINE2
)
print(
color + f"#{number + 1:02d} {dataset[0]:30s} {samples:7,d} "
f"{features:5d} {classes:4d} {y.dtype} {str(X.dtype):7s} "
f"{dataset[1]}" + TextColor.ENDC
)
# Check dataset is Ok
# data type
if str(X.dtype) != dataset[2]:
raise ValueError(
f"dataset {dataset[0]} has wrong data type. "
f"It shoud have {dataset[2]} but has {X.dtype}"
)
# dimensions
if X.shape != dataset[3]:
raise ValueError(
f"dataset {dataset[0]} has wrong X shape. "
f"It shoud have {dataset[3]} but has {X.shape}"
)
if y.shape != (X.shape[0],):
raise ValueError(
f"dataset {dataset[0]} has wrong y shape. "
f"It shoud have {(X.shape[0],)} but has {y.shape}"
)
print(
TextColor.SUCCESS
+ "* All Data types and shapes are Ok."
+ TextColor.ENDC
)
class Datasets_Tanveer(Dataset_Base):
def __init__(
self, normalize: bool = False, standardize: bool = False
) -> None:
super().__init__(normalize, standardize)
self._folder = os.path.join(self._data_folder, "tanveer")
self.data_sets: tsets = [
# (name, file type, data type, (n_samples, n_features))
("balance-scale", "Rdat", "float64", (625, 4)),
("balloons", "Rdat", "float64", (16, 4)),
("breast-cancer-wisc-diag", "Rdat", "float64", (569, 30)),
("breast-cancer-wisc-prog", "Rdat", "float64", (198, 33)),
("breast-cancer-wisc", "Rdat", "float64", (699, 9)),
("breast-cancer", "Rdat", "float64", (286, 9)),
("cardiotocography-10clases", "Rdat", "float64", (2126, 21)),
("cardiotocography-3clases", "Rdat", "float64", (2126, 21)),
("conn-bench-sonar-mines-rocks", "Rdat", "float64", (208, 60)),
("cylinder-bands", "Rdat", "float64", (512, 35)),
("dermatology", "Rdat", "float64", (366, 34)),
("echocardiogram", "Rdat", "float64", (131, 10)),
("fertility", "Rdat", "float64", (100, 9)),
("haberman-survival", "Rdat", "float64", (306, 3)),
("heart-hungarian", "Rdat", "float64", (294, 12)),
("hepatitis", "Rdat", "float64", (155, 19)),
("ilpd-indian-liver", "Rdat", "float64", (583, 9)),
("ionosphere", "Rdat", "float64", (351, 33)),
("iris", "Rdat", "float64", (150, 4)),
("led-display", "Rdat", "float64", (1000, 7)),
("libras", "Rdat", "float64", (360, 90)),
("low-res-spect", "Rdat", "float64", (531, 100)),
("lymphography", "Rdat", "float64", (148, 18)),
("mammographic", "Rdat", "float64", (961, 5)),
("molec-biol-promoter", "Rdat", "float64", (106, 57)),
("musk-1", "Rdat", "float64", (476, 166)),
("oocytes_merluccius_nucleus_4d", "Rdat", "float64", (1022, 41)),
("oocytes_merluccius_states_2f", "Rdat", "float64", (1022, 25)),
("oocytes_trisopterus_nucleus_2f", "Rdat", "float64", (912, 25)),
("oocytes_trisopterus_states_5b", "Rdat", "float64", (912, 32)),
("parkinsons", "Rdat", "float64", (195, 22)),
("pima", "Rdat", "float64", (768, 8)),
("pittsburg-bridges-MATERIAL", "Rdat", "float64", (106, 7)),
("pittsburg-bridges-REL-L", "Rdat", "float64", (103, 7)),
("pittsburg-bridges-SPAN", "Rdat", "float64", (92, 7)),
("pittsburg-bridges-T-OR-D", "Rdat", "float64", (102, 7)),
("planning", "Rdat", "float64", (182, 12)),
("post-operative", "Rdat", "float64", (90, 8)),
("seeds", "Rdat", "float64", (210, 7)),
("statlog-australian-credit", "Rdat", "float64", (690, 14)),
("statlog-german-credit", "Rdat", "float64", (1000, 24)),
("statlog-heart", "Rdat", "float64", (270, 13)),
("statlog-image", "Rdat", "float64", (2310, 18)),
("statlog-vehicle", "Rdat", "float64", (846, 18)),
("synthetic-control", "Rdat", "float64", (600, 60)),
("tic-tac-toe", "Rdat", "float64", (958, 9)),
("vertebral-column-2clases", "Rdat", "float64", (310, 6)),
("wine", "Rdat", "float64", (178, 13)),
("zoo", "Rdat", "float64", (101, 16)),
]
def load(self, name: str) -> tdataset:
data = pd.read_csv(
os.path.join(self._folder, name, f"{name}_R.dat"),
sep="\t",
index_col=0,
)
X = data.drop("clase", axis=1).to_numpy()
y = data["clase"].to_numpy()
return X, y
class Datasets_AAAI(Dataset_Base):
def __init__(
self, normalize: bool = False, standardize: bool = False
) -> None:
super().__init__(normalize, standardize)
self._folder: str = os.path.join(self._data_folder, "aaai")
self.data_sets: tsets = [
# (name, file type, data type, (n_samples, n_features))
("breast", "csv", "int16", (683, 9)),
("cardiotoc", "csv", "int16", (2126, 41)),
("cod-rna", "sparse", "float16", (331152, 8)),
("connect4", "sparse", "int16", (67557, 126)),
("covtype", "npz", "int16", (581012, 54)),
("diabetes", "csv", "float16", (768, 8)),
("dna", "csv", "float16", (3186, 180)),
("fourclass", "sparse", "int16", (862, 2)),
("glass", "csv", "float16", (214, 9)),
("heart", "csv", "float16", (270, 13)),
("ijcnn1", "sparse", "float16", (141691, 22)),
("iris", "csv", "float16", (150, 4)),
("letter", "npz", "int16", (20000, 16)),
("mnist", "npy", "int16", (70000, 784)),
("pendigits", "npy", "int16", (10992, 16)),
("protein", "sparse", "float16", (24387, 357)),
("satimage", "npy", "int16", (6435, 36)),
("segment", "sparse", "float16", (2310, 19)),
("shuttle", "npy", "int16", (58000, 9)),
("usps", "npz", "float16", (9298, 256)),
("vehicle", "sparse", "float16", (846, 18)),
("wine", "csv", "float16", (178, 13)),
]
def load_dataset(
self, name: str, file_type: str, data_type: str
) -> tdataset:
return getattr(self, f"load_{file_type}_dataset")(name, data_type)
def load(self, name: str) -> tdataset:
for dataset in self.data_sets:
if name == dataset[0]:
return self.post_process(
*self.load_dataset(*dataset[:3]) # type: ignore
)
raise ValueError(
f"{name} is not a valid dataset, has to be one of {str(self)}"
)
def load_csv_dataset(self, name: str, dtype: str) -> tdataset:
data = np.genfromtxt(
os.path.join(self._folder, f"{name}.csv"),
delimiter=",",
dtype=dtype,
)
features = data.shape[1]
return data[:, : features - 1], data[:, -1].astype(np.int16)
def load_npy_dataset(self, name: str, _: str) -> tdataset:
data = np.load(os.path.join(self._folder, f"{name}.npy"))
features = data.shape[1]
return data[:, : features - 1], data[:, -1].astype(np.int16)
def load_npz_dataset(self, name, _):
data = np.load(os.path.join(self._folder, f"{name}.npz"))
return data["arr_0"], data["arr_1"]
def load_sparse_dataset(self, name: str, _: str) -> tdataset:
X, y = np.load(
os.path.join(self._folder, f"{name}.npy"), allow_pickle=True
)
if str(X.dtype) == "float16":
# can't do todense with np.float16
XX = X.astype(np.float64).todense().astype(np.float16)
else:
XX = X.todense()
return XX, y
class Datasets:
def __init__(
self,
normalize: bool = False,
standardize: bool = False,
set_of_files: str = "aaai",
) -> None:
self._model = (
Datasets_AAAI(normalize, standardize)
if set_of_files == "aaai"
else Datasets_Tanveer(normalize, standardize)
)
def load(self, name: str) -> tdataset:
return self._model.load(name)
def post_process(self, X: np.array, y: np.array) -> tdataset:
return self._model.post_process(X, y)
def report(self) -> None:
return self._model.report()
def get_params(self) -> str:
return self._model.get_params()
def __iter__(self) -> Diterator:
return Diterator(self._model.data_sets)
def __str__(self) -> str:
return self._model.__str__()

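A sketch of the Datasets facade in use, assuming the aaai data files are present under data/aaai:

from experimentation.Sets import Datasets

datasets = Datasets(normalize=False, standardize=True, set_of_files="aaai")
X, y = datasets.load("iris")  # the AAAI loader applies post_process itself

# Iterating yields the descriptors: (name, file type, dtype, X shape)
for name, file_type, dtype, shape in datasets:
    print(f"{name}: {shape}")
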
experimentation/Utils.py Normal file (32 lines)

@@ -0,0 +1,32 @@
import os
import mysql.connector
class TextColor:
BLUE = "\033[94m"
CYAN = "\033[96m"
GREEN = "\033[92m"
MAGENTA = "\033[95m"
YELLOW = "\033[93m"
RED = "\033[91m"
HEADER = MAGENTA
LINE1 = BLUE
LINE2 = CYAN
SUCCESS = GREEN
WARNING = YELLOW
FAIL = RED
ENDC = "\033[0m"
BOLD = "\033[1m"
UNDERLINE = "\033[4m"
class MySQL:
@staticmethod
def get_connection():
config = dict()
dir_path = os.path.dirname(os.path.realpath(__file__))
with open(os.path.join(dir_path, ".myconfig")) as f:
for line in f.read().splitlines():
key, value = line.split("=")
config[key] = value
return mysql.connector.connect(**config)

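MySQL.get_connection reads key=value pairs from a .myconfig file placed next to the module and passes them verbatim to mysql.connector.connect, so the keys must be valid connect() arguments. A hypothetical example:

host=db.example.com
user=experiments
password=secret
database=results
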
experimentation/__init__.py Normal file (15 lines)

@@ -0,0 +1,15 @@
from .Sets import Datasets
from .Database import Outcomes, Hyperparameters
from .Experiments import Experiment
from .Models import ModelStree, ModelBagging, ModelAdaBoost, ModelOdte
__all__ = [
"Datasets",
"Outcomes",
"Hyperparameters",
"Experiment",
"ModelStree",
"ModelAdaBoost",
"ModelBagging",
"ModelOdte",
]