mirror of
https://github.com/Doctorado-ML/Stree_datasets.git
synced 2025-08-15 15:36:01 +00:00
310 lines
12 KiB
Python
310 lines
12 KiB
Python
from __future__ import annotations
|
|
import os
|
|
from typing import Tuple, List
|
|
import numpy as np # type: ignore
|
|
import pandas as pd # type: ignore
|
|
from .Utils import TextColor
|
|
|
|
# Catalogue of datasets: one (name, file type, X dtype, X shape) tuple each.
tsets = List[Tuple[str, str, str, Tuple[int, int]]]
# A loaded dataset: the samples X and the labels y.
# np.ndarray is the array *type*; np.array is only the factory function.
tdataset = Tuple[np.ndarray, np.ndarray]
|
|
|
|
|
|
class Diterator:
    """Single-pass iterator over a dataset catalogue.

    The catalogue handed to the constructor is copied, so consuming the
    iterator never alters the caller's list.
    """

    def __init__(self, data: tsets):
        """Snapshot the catalogue to iterate over.

        :param data: dataset descriptors to yield, in order
        :type data: tsets
        """
        self._stack: tsets = list(data)
        # Cursor into the snapshot; avoids the O(n) cost of list.pop(0)
        self._position = 0

    def __iter__(self) -> "Diterator":
        """Return self so the object fulfils the full iterator protocol."""
        return self

    def __next__(self):
        """Return the next descriptor.

        :raises StopIteration: when every descriptor has been returned
        """
        if self._position >= len(self._stack):
            raise StopIteration()
        item = self._stack[self._position]
        self._position += 1
        return item
|
|
|
|
|
|
class Dataset_Base:
    """Behaviour shared by the dataset collections.

    Subclasses must override :meth:`load` and define a ``data_sets``
    attribute: a list of tuples whose items are used here as
    ``(name, file type, expected X dtype, expected X shape)``.
    """

    def __init__(self, normalize: bool, standardize: bool) -> None:
        """Store the scaling options applied by :meth:`post_process`.

        :param normalize: rescale every feature to the [0, 1] range
        :type normalize: bool
        :param standardize: centre every feature and scale it by its std
        :type standardize: bool
        """
        self._data_folder = "data"
        self._normalize = normalize
        self._standardize = standardize

    def load(self, name: str) -> tdataset:
        """Datasets have to implement this

        :param name: dataset name
        :type name: str
        :return: X, y np.arrays
        :rtype: tdataset
        :raises NotImplementedError: always, in this base class
        """
        # Fail loudly instead of returning None, which only surfaced later
        # as an obscure unpacking error in the caller.
        raise NotImplementedError(
            f"{type(self).__name__} has to implement load()"
        )

    @staticmethod
    def normalize(data: np.ndarray) -> np.ndarray:
        """Min-max scale ``data`` as a whole.

        :param data: values to scale
        :type data: np.ndarray
        :return: ``(data - min) / (max - min)``; all zeros if data is constant
        :rtype: np.ndarray
        """
        min_data = data.min()
        span = data.max() - min_data
        if span == 0:
            # Constant data: the numerator is all zeros, so dividing by 1
            # yields zeros instead of the NaNs produced by 0 / 0.
            span = 1
        return (data - min_data) / span

    @staticmethod
    def _scale_columns(data: np.ndarray, scaler) -> np.ndarray:
        """Apply ``scaler`` to every column of a copy of ``data``.

        Integer and boolean inputs are converted to float64 first;
        otherwise the fractional results would be truncated when written
        back. Other dtypes keep their dtype.
        """
        if data.dtype.kind in "biu":
            res = data.astype(np.float64)
        else:
            res = data.copy()
        for col in range(res.shape[1]):
            res[:, col] = scaler(res[:, col])
        return res

    @staticmethod
    def normalize_rows(data: np.ndarray) -> np.ndarray:
        """Min-max scale each column (feature) of ``data`` independently.

        Despite the name, the scaling is done column by column.

        :param data: 2D samples array; it is not modified
        :type data: np.ndarray
        :return: scaled copy of ``data``
        :rtype: np.ndarray
        """
        return Dataset_Base._scale_columns(data, Dataset_Base.normalize)

    @staticmethod
    def standardize(data: np.ndarray) -> np.ndarray:
        """Centre ``data`` on its mean and scale it by its std.

        :param data: values to scale
        :type data: np.ndarray
        :return: ``(data - mean) / std``; all zeros if data is constant
        :rtype: np.ndarray
        """
        deviation = data.std()
        if deviation == 0:
            # Constant data: avoid 0 / 0 and return zeros instead of NaNs
            deviation = 1
        return (data - data.mean()) / deviation

    @staticmethod
    def standardize_rows(data: np.ndarray) -> np.ndarray:
        """Standardize each column (feature) of ``data`` independently.

        Despite the name, the scaling is done column by column.

        :param data: 2D samples array; it is not modified
        :type data: np.ndarray
        :return: scaled copy of ``data``
        :rtype: np.ndarray
        """
        return Dataset_Base._scale_columns(data, Dataset_Base.standardize)

    def get_params(self) -> str:
        """Return the scaling options as a printable string."""
        return f"normalize={self._normalize}, standardize={self._standardize}"

    def post_process(self, X: np.ndarray, y: np.ndarray) -> tdataset:
        """Apply the configured scaling to X; y is returned untouched.

        When both options are set, normalization runs first and the
        result is then standardized.

        :param X: samples
        :type X: np.ndarray
        :param y: labels
        :type y: np.ndarray
        :return: processed X and the original y
        :rtype: tdataset
        """
        if self._normalize:
            X = self.normalize_rows(X)
        if self._standardize:
            X = self.standardize_rows(X)
        return X, y

    def __iter__(self) -> Diterator:
        """Iterate over the dataset descriptors in ``data_sets``."""
        return Diterator(self.data_sets)

    def __str__(self) -> str:
        """Return the dataset names, each as ``" name,"``, concatenated."""
        return "".join(f" {dataset[0]}," for dataset in self.data_sets)

    @staticmethod
    def _check_dataset(dataset, X: np.ndarray, y: np.ndarray) -> None:
        """Raise ValueError if X or y disagree with the descriptor."""
        name = dataset[0]
        # data type
        if str(X.dtype) != dataset[2]:
            raise ValueError(
                f"dataset {name} has wrong data type. "
                f"It should have {dataset[2]} but has {X.dtype}"
            )
        # dimensions
        if X.shape != dataset[3]:
            raise ValueError(
                f"dataset {name} has wrong X shape. "
                f"It should have {dataset[3]} but has {X.shape}"
            )
        if y.shape != (X.shape[0],):
            raise ValueError(
                f"dataset {name} has wrong y shape. "
                f"It should have {(X.shape[0],)} but has {y.shape}"
            )

    def report(self) -> None:
        """Load every dataset, print a summary table and validate it.

        Each row is printed before its dataset is checked, so the table
        shows the offending dataset just before the error is raised.

        :raises ValueError: if a dataset's dtype or shape differs from
            what its descriptor declares
        """
        print(
            TextColor.HEADER
            + "(#) Dataset Samples Feat. #Cl. y typ "
            + "X type f_type"
        )
        print(
            "=== ============================= ======== ===== ==== ===== "
            "======= ======" + TextColor.ENDC
        )
        for number, dataset in enumerate(self.data_sets):
            X, y = self.load(dataset[0])  # type: ignore
            samples, features = X.shape
            classes = len(np.unique(y))
            # Alternate the row colour, starting with LINE1
            color = TextColor.LINE1 if number % 2 == 0 else TextColor.LINE2
            print(
                color + f"#{number + 1:02d} {dataset[0]:30s} {samples:7,d} "
                f"{features:5d} {classes:4d} {y.dtype} {str(X.dtype):7s} "
                f"{dataset[1]}" + TextColor.ENDC
            )
            # Check dataset is Ok
            self._check_dataset(dataset, X, y)

        print(
            TextColor.SUCCESS
            + "* All Data types and shapes are Ok."
            + TextColor.ENDC
        )
|
|
|
|
|
|
class Datasets_Tanveer(Dataset_Base):
    """Catalogue and loader for the datasets stored under data/tanveer."""

    def __init__(
        self, normalize: bool = False, standardize: bool = False
    ) -> None:
        """Set the scaling options and build the dataset catalogue.

        :param normalize: passed on to the base class
        :type normalize: bool
        :param standardize: passed on to the base class
        :type standardize: bool
        """
        super().__init__(normalize, standardize)
        self._folder = os.path.join(self._data_folder, "tanveer")
        # Every entry shares the "Rdat" file type and "float64" samples
        # type, so only (name, X shape) is listed here.
        names_and_shapes = (
            ("balance-scale", (625, 4)),
            ("balloons", (16, 4)),
            ("breast-cancer-wisc-diag", (569, 30)),
            ("breast-cancer-wisc-prog", (198, 33)),
            ("breast-cancer-wisc", (699, 9)),
            ("breast-cancer", (286, 9)),
            ("cardiotocography-10clases", (2126, 21)),
            ("cardiotocography-3clases", (2126, 21)),
            ("conn-bench-sonar-mines-rocks", (208, 60)),
            ("cylinder-bands", (512, 35)),
            ("dermatology", (366, 34)),
            ("echocardiogram", (131, 10)),
            ("fertility", (100, 9)),
            ("haberman-survival", (306, 3)),
            ("heart-hungarian", (294, 12)),
            ("hepatitis", (155, 19)),
            ("ilpd-indian-liver", (583, 9)),
            ("ionosphere", (351, 33)),
            ("iris", (150, 4)),
            ("led-display", (1000, 7)),
            ("libras", (360, 90)),
            ("low-res-spect", (531, 100)),
            ("lymphography", (148, 18)),
            ("mammographic", (961, 5)),
            ("molec-biol-promoter", (106, 57)),
            ("musk-1", (476, 166)),
            ("oocytes_merluccius_nucleus_4d", (1022, 41)),
            ("oocytes_merluccius_states_2f", (1022, 25)),
            ("oocytes_trisopterus_nucleus_2f", (912, 25)),
            ("oocytes_trisopterus_states_5b", (912, 32)),
            ("parkinsons", (195, 22)),
            ("pima", (768, 8)),
            ("pittsburg-bridges-MATERIAL", (106, 7)),
            ("pittsburg-bridges-REL-L", (103, 7)),
            ("pittsburg-bridges-SPAN", (92, 7)),
            ("pittsburg-bridges-T-OR-D", (102, 7)),
            ("planning", (182, 12)),
            ("post-operative", (90, 8)),
            ("seeds", (210, 7)),
            ("statlog-australian-credit", (690, 14)),
            ("statlog-german-credit", (1000, 24)),
            ("statlog-heart", (270, 13)),
            ("statlog-image", (2310, 18)),
            ("statlog-vehicle", (846, 18)),
            ("synthetic-control", (600, 60)),
            ("tic-tac-toe", (958, 9)),
            ("vertebral-column-2clases", (310, 6)),
            ("wine", (178, 13)),
            ("zoo", (101, 16)),
        )
        # (name), (filetype), (samples type), (X shape)
        self.data_sets: tsets = [
            (dataset_name, "Rdat", "float64", shape)
            for dataset_name, shape in names_and_shapes
        ]

    def load(self, name: str) -> tdataset:
        """Read ``<name>/<name>_R.dat`` and return the processed dataset.

        The file is read as tab separated text using its first column as
        index; the "clase" column supplies the labels and every other
        column the samples.

        :param name: dataset name
        :type name: str
        :return: X, y after post processing
        :rtype: tdataset
        """
        file_name = os.path.join(self._folder, name, f"{name}_R.dat")
        frame = pd.read_csv(file_name, sep="\t", index_col=0)
        samples = frame.drop(columns="clase").to_numpy()
        labels = frame["clase"].to_numpy()
        return self.post_process(samples, labels)  # type: ignore
|
|
|
|
|
|
class Datasets_AAAI(Dataset_Base):
    """Catalogue and loaders for the datasets stored under data/aaai.

    Each dataset is stored in one of four file types (csv, npy, npz or
    sparse) and is read by the matching ``load_<file type>_dataset``
    method.
    """

    def __init__(
        self, normalize: bool = False, standardize: bool = False
    ) -> None:
        """Set the scaling options and build the dataset catalogue.

        :param normalize: passed on to the base class
        :type normalize: bool
        :param standardize: passed on to the base class
        :type standardize: bool
        """
        super().__init__(normalize, standardize)
        self._folder: str = os.path.join(self._data_folder, "aaai")
        self.data_sets: tsets = [
            # (name), (filetype), (samples type), (X shape)
            ("breast", "csv", "int16", (683, 9)),
            ("cardiotoc", "csv", "int16", (2126, 41)),
            ("cod-rna", "sparse", "float16", (331152, 8)),
            ("connect4", "sparse", "int16", (67557, 126)),
            ("covtype", "npz", "int16", (581012, 54)),
            ("diabetes", "csv", "float16", (768, 8)),
            ("dna", "csv", "float16", (3186, 180)),
            ("fourclass", "sparse", "int16", (862, 2)),
            ("glass", "csv", "float16", (214, 9)),
            ("heart", "csv", "float16", (270, 13)),
            ("ijcnn1", "sparse", "float16", (141691, 22)),
            ("iris", "csv", "float16", (150, 4)),
            ("letter", "npz", "int16", (20000, 16)),
            ("mnist", "npy", "int16", (70000, 784)),
            ("pendigits", "npy", "int16", (10992, 16)),
            ("protein", "sparse", "float16", (24387, 357)),
            ("satimage", "npy", "int16", (6435, 36)),
            ("segment", "sparse", "float16", (2310, 19)),
            ("shuttle", "npy", "int16", (58000, 9)),
            ("usps", "npz", "float16", (9298, 256)),
            ("vehicle", "sparse", "float16", (846, 18)),
            ("wine", "csv", "float16", (178, 13)),
        ]

    def load_dataset(
        self, name: str, file_type: str, data_type: str
    ) -> tdataset:
        """Dispatch to the ``load_<file_type>_dataset`` method.

        :param name: dataset name
        :type name: str
        :param file_type: storage format, selects the loader method
        :type file_type: str
        :param data_type: dtype name handed to the loader
        :type data_type: str
        :return: X, y as read from disk
        :rtype: tdataset
        :raises AttributeError: if there is no loader for ``file_type``
        """
        return getattr(self, f"load_{file_type}_dataset")(name, data_type)

    def load(self, name: str) -> tdataset:
        """Load a catalogued dataset and apply the configured scaling.

        :param name: dataset name, has to appear in ``data_sets``
        :type name: str
        :return: X, y after post processing
        :rtype: tdataset
        :raises ValueError: if ``name`` is not in the catalogue
        """
        for dataset in self.data_sets:
            if name == dataset[0]:
                return self.post_process(
                    *self.load_dataset(*dataset[:3])  # type: ignore
                )
        raise ValueError(
            f"{name} is not a valid dataset, has to be one of {str(self)}"
        )

    def load_csv_dataset(self, name: str, dtype: str) -> tdataset:
        """Read ``<name>.csv``; the last column holds the labels.

        :param name: dataset name
        :type name: str
        :param dtype: dtype used to parse the whole file
        :type dtype: str
        :return: X (all but the last column), y (last column as int16)
        :rtype: tdataset
        """
        data = np.genfromtxt(
            os.path.join(self._folder, f"{name}.csv"),
            delimiter=",",
            dtype=dtype,
        )
        return data[:, :-1], data[:, -1].astype(np.int16)

    def load_npy_dataset(self, name: str, _: str) -> tdataset:
        """Read ``<name>.npy``; the last column holds the labels.

        :param name: dataset name
        :type name: str
        :param _: unused, kept so all loaders share one signature
        :type _: str
        :return: X (all but the last column), y (last column as int16)
        :rtype: tdataset
        """
        data = np.load(os.path.join(self._folder, f"{name}.npy"))
        return data[:, :-1], data[:, -1].astype(np.int16)

    def load_npz_dataset(self, name: str, _: str) -> tdataset:
        """Read ``<name>.npz``: ``arr_0`` holds X and ``arr_1`` holds y.

        :param name: dataset name
        :type name: str
        :param _: unused, kept so all loaders share one signature
        :type _: str
        :return: X, y
        :rtype: tdataset
        """
        # np.load keeps an .npz archive open until it is closed; the
        # arrays are read on access, so they remain valid after the with.
        with np.load(os.path.join(self._folder, f"{name}.npz")) as data:
            return data["arr_0"], data["arr_1"]

    def load_sparse_dataset(self, name: str, _: str) -> tdataset:
        """Read a pickled (sparse X, y) pair from ``<name>.npy``.

        X is converted to a dense matrix before being returned.

        :param name: dataset name
        :type name: str
        :param _: unused, kept so all loaders share one signature
        :type _: str
        :return: dense X, y
        :rtype: tdataset
        """
        # NOTE(security): allow_pickle=True executes arbitrary code stored
        # in the file; only use it with dataset files from a trusted source.
        X, y = np.load(
            os.path.join(self._folder, f"{name}.npy"), allow_pickle=True
        )
        if str(X.dtype) == "float16":
            # can't do todense with np.float16
            XX = X.astype(np.float64).todense().astype(np.float16)
        else:
            XX = X.todense()
        return XX, y
|
|
|
|
|
|
class Datasets:
    """Facade that forwards every call to the selected dataset collection."""

    def __init__(
        self,
        normalize: bool = False,
        standardize: bool = False,
        set_of_files: str = "aaai",
    ) -> None:
        """Create the underlying collection.

        :param normalize: passed on to the collection
        :type normalize: bool
        :param standardize: passed on to the collection
        :type standardize: bool
        :param set_of_files: "aaai" selects Datasets_AAAI, any other value
            selects Datasets_Tanveer
        :type set_of_files: str
        """
        if set_of_files == "aaai":
            self._model = Datasets_AAAI(normalize, standardize)
        else:
            self._model = Datasets_Tanveer(normalize, standardize)

    def load(self, name: str) -> tdataset:
        """Load the dataset ``name`` through the underlying collection."""
        model = self._model
        return model.load(name)

    def post_process(self, X: np.array, y: np.array) -> tdataset:
        """Apply the underlying collection's post processing to X, y."""
        model = self._model
        return model.post_process(X, y)

    def report(self) -> None:
        """Run the underlying collection's report."""
        model = self._model
        return model.report()

    def get_params(self) -> str:
        """Return the underlying collection's parameters string."""
        model = self._model
        return model.get_params()

    def __iter__(self) -> Diterator:
        """Iterate over the underlying collection's dataset descriptors."""
        catalogue = self._model.data_sets
        return Diterator(catalogue)

    def __str__(self) -> str:
        """Return the underlying collection's string representation."""
        return str(self._model)
|