Mirror of https://github.com/Doctorado-ML/STree.git

Commit 36b08b1bcf (parent 36ff3da26d), committed by GitHub.
@@ -6,7 +6,6 @@
[![DOI](https://zenodo.org/badge/262658230.svg)](https://zenodo.org/badge/latestdoi/262658230)
# STree

Oblique Tree classifier based on SVM nodes. The nodes are built and split with sklearn SVC models. STree is a sklearn estimator, so it can be integrated into pipelines, grid searches, etc.
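Since it follows the scikit-learn estimator API, a minimal sketch of that integration might look like the following (the hyperparameter values here are illustrative assumptions, not recommendations):

```python
from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV

from stree import Stree

X, y = load_iris(return_X_y=True)

# Search over two of the SVC hyperparameters documented below
grid = GridSearchCV(
    Stree(random_state=0),
    param_grid={"C": [0.1, 1.0, 7.0], "kernel": ["linear", "rbf"]},
    cv=5,
)
grid.fit(X, y)
print(grid.best_params_, grid.best_score_)
```

Because Stree exposes get_params/set_params like any sklearn estimator, the same object works inside a Pipeline as well.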
@@ -40,7 +39,7 @@ Can be found in [stree.readthedocs.io](https://stree.readthedocs.io/en/stable/)
## Hyperparameters

| | **Hyperparameter** | **Type/Values** | **Default** | **Meaning** |
| --- | --- | --- | --- | --- |
| \* | C | \<float\> | 1.0 | Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. |
| \* | kernel | {"liblinear", "linear", "poly", "rbf", "sigmoid"} | linear | Specifies the kernel type to be used in the algorithm. It must be one of "liblinear", "linear", "poly", "rbf" or "sigmoid". "liblinear" uses the [liblinear](https://www.csie.ntu.edu.tw/~cjlin/liblinear/) library; the rest use the [libsvm](https://www.csie.ntu.edu.tw/~cjlin/libsvm/) library through scikit-learn. |
| \* | max_iter | \<int\> | 1e5 | Hard limit on iterations within the solver, or -1 for no limit. |
@@ -53,11 +52,10 @@ Can be found in [stree.readthedocs.io](https://stree.readthedocs.io/en/stable/)
| | criterion | {"gini", "entropy"} | entropy | The function to measure the quality of a split (only used if max_features != num_features). <br>Supported criteria are "gini" for the Gini impurity and "entropy" for the information gain. |
| | min_samples_split | \<int\> | 0 | The minimum number of samples required to split an internal node. 0 (default) means no minimum. |
| | max_features | \<int\>, \<float\> <br><br>or {"auto", "sqrt", "log2"} | None | The number of features to consider when looking for the split:<br>If int, then consider max_features features at each split.<br>If float, then max_features is a fraction and int(max_features \* n_features) features are considered at each split.<br>If "auto", then max_features=sqrt(n_features).<br>If "sqrt", then max_features=sqrt(n_features).<br>If "log2", then max_features=log2(n_features).<br>If None, then max_features=n_features. |
-| | splitter | {"best", "random", "mutual", "cfs", "fcbf"} | "random" | The strategy used to choose the feature set at each node (only used if max_features < num_features). Supported strategies are: **"best"**: the sklearn SelectKBest algorithm is used in every node to choose the max_features best features. **"random"**: the algorithm generates 5 candidates and chooses the best (max. info. gain) of them. **"mutual"**: chooses the best features w.r.t. their mutual info with the label. **"cfs"**: applies Correlation-based Feature Selection. **"fcbf"**: applies Fast Correlation-Based Filter |
+| | splitter | {"best", "random", "mutual", "cfs", "fcbf", "iwss"} | "random" | The strategy used to choose the feature set at each node (only used if max_features < num_features). Supported strategies are: **"best"**: the sklearn SelectKBest algorithm is used in every node to choose the max_features best features. **"random"**: the algorithm generates 5 candidates and chooses the best (max. info. gain) of them. **"mutual"**: chooses the best features w.r.t. their mutual info with the label. **"cfs"**: applies Correlation-based Feature Selection. **"fcbf"**: applies Fast Correlation-Based Filter. **"iwss"**: applies an IWSS-based algorithm |
| | normalize | \<bool\> | False | Whether to standardize the features at each node, using the samples that reach it |
| \* | multiclass_strategy | {"ovo", "ovr"} | "ovo" | Strategy to use with multiclass datasets: **"ovo"**: one versus one; **"ovr"**: one versus rest |
\* Hyperparameter used by the support vector classifier of every node
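As a concrete illustration, a hedged sketch of passing these hyperparameters to the classifier (the values are arbitrary examples, not tuned settings):

```python
from stree import Stree

# Arbitrary example values: an RBF-kernel SVC at every node, sqrt-sized
# feature subspaces chosen with the IWSS-based splitter added in this commit
clf = Stree(
    C=7.0,
    kernel="rbf",
    max_iter=100_000,
    max_features="sqrt",
    splitter="iwss",
    normalize=True,
    multiclass_strategy="ovr",
)
```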
\*\* **Splitting in an STree node**
@@ -1,7 +1,7 @@
# Hyperparameters

| | **Hyperparameter** | **Type/Values** | **Default** | **Meaning** |
| --- | --- | --- | --- | --- |
| \* | C | \<float\> | 1.0 | Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. |
| \* | kernel | {"liblinear", "linear", "poly", "rbf", "sigmoid"} | linear | Specifies the kernel type to be used in the algorithm. It must be one of "liblinear", "linear", "poly", "rbf" or "sigmoid". "liblinear" uses the [liblinear](https://www.csie.ntu.edu.tw/~cjlin/liblinear/) library; the rest use the [libsvm](https://www.csie.ntu.edu.tw/~cjlin/libsvm/) library through scikit-learn. |
| \* | max_iter | \<int\> | 1e5 | Hard limit on iterations within the solver, or -1 for no limit. |
@@ -14,7 +14,7 @@
| | criterion | {"gini", "entropy"} | entropy | The function to measure the quality of a split (only used if max_features != num_features). <br>Supported criteria are "gini" for the Gini impurity and "entropy" for the information gain. |
| | min_samples_split | \<int\> | 0 | The minimum number of samples required to split an internal node. 0 (default) means no minimum. |
| | max_features | \<int\>, \<float\> <br><br>or {"auto", "sqrt", "log2"} | None | The number of features to consider when looking for the split:<br>If int, then consider max_features features at each split.<br>If float, then max_features is a fraction and int(max_features \* n_features) features are considered at each split.<br>If "auto", then max_features=sqrt(n_features).<br>If "sqrt", then max_features=sqrt(n_features).<br>If "log2", then max_features=log2(n_features).<br>If None, then max_features=n_features. |
-| | splitter | {"best", "random", "mutual", "cfs", "fcbf"} | "random" | The strategy used to choose the feature set at each node (only used if max_features < num_features). Supported strategies are: **"best"**: the sklearn SelectKBest algorithm is used in every node to choose the max_features best features. **"random"**: the algorithm generates 5 candidates and chooses the best (max. info. gain) of them. **"mutual"**: chooses the best features w.r.t. their mutual info with the label. **"cfs"**: applies Correlation-based Feature Selection. **"fcbf"**: applies Fast Correlation-Based Filter |
+| | splitter | {"best", "random", "mutual", "cfs", "fcbf", "iwss"} | "random" | The strategy used to choose the feature set at each node (only used if max_features < num_features). Supported strategies are: **"best"**: the sklearn SelectKBest algorithm is used in every node to choose the max_features best features. **"random"**: the algorithm generates 5 candidates and chooses the best (max. info. gain) of them. **"mutual"**: chooses the best features w.r.t. their mutual info with the label. **"cfs"**: applies Correlation-based Feature Selection. **"fcbf"**: applies Fast Correlation-Based Filter. **"iwss"**: applies an IWSS-based algorithm |
| | normalize | \<bool\> | False | Whether to standardize the features at each node, using the samples that reach it |
| \* | multiclass_strategy | {"ovo", "ovr"} | "ovo" | Strategy to use with multiclass datasets: **"ovo"**: one versus one; **"ovr"**: one versus rest |
@@ -271,10 +271,17 @@ class Splitter:
                f"criteria has to be max_samples or impurity; got ({criteria})"
            )

-        if feature_select not in ["random", "best", "mutual", "cfs", "fcbf"]:
+        if feature_select not in [
+            "random",
+            "best",
+            "mutual",
+            "cfs",
+            "fcbf",
+            "iwss",
+        ]:
            raise ValueError(
-                "splitter must be in {random, best, mutual, cfs, fcbf} got "
-                f"({feature_select})"
+                "splitter must be in {random, best, mutual, cfs, fcbf, iwss} "
+                f"got ({feature_select})"
            )
        self.criterion_function = getattr(self, f"_{self._criterion}")
        self.decision_criteria = getattr(self, f"_{self._criteria}")
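To see the new check in action, a hypothetical snippet — the import path and the constructor arguments other than feature_select mirror the test helper conventions and are assumptions, not part of this diff:

```python
from sklearn.svm import SVC

from stree.Splitter import Splitter  # assumed import path

try:
    # keyword name feature_select follows the validation code above;
    # clf, criterion and criteria are assumptions about the constructor
    Splitter(
        clf=SVC(),
        criterion="entropy",
        criteria="max_samples",
        feature_select="trandom",  # not in the accepted list
    )
except ValueError as err:
    print(err)
    # splitter must be in {random, best, mutual, cfs, fcbf, iwss} got (trandom)
```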
@@ -409,6 +416,31 @@ class Splitter:
        mufs = MUFS(max_features=max_features, discrete=False)
        return mufs.fcbf(dataset, labels, 5e-4).get_results()

+    @staticmethod
+    def _fs_iwss(
+        dataset: np.array, labels: np.array, max_features: int
+    ) -> tuple:
+        """Correlation-based feature selection based on iwss with max_features
+        limit
+
+        Parameters
+        ----------
+        dataset : np.array
+            array of samples
+        labels : np.array
+            labels of the dataset
+        max_features : int
+            number of features of the subspace
+            (< number of features in dataset)
+
+        Returns
+        -------
+        tuple
+            indices of the features selected
+        """
+        mufs = MUFS(max_features=max_features, discrete=False)
+        return mufs.iwss(dataset, labels, 0.25).get_results()
+
    def partition_impurity(self, y: np.array) -> np.array:
        return self.criterion_function(y)
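For context, the new selector can be exercised on its own. A rough sketch, assuming the mufs package (which provides MUFS) is installed and using synthetic data; the 0.25 threshold mirrors the call above:

```python
import numpy as np
from mufs import MUFS

# Synthetic continuous data: 200 samples, 10 features, binary labels
rng = np.random.default_rng(0)
dataset = rng.normal(size=(200, 10))
labels = (dataset[:, 0] + dataset[:, 3] > 0).astype(int)

# Same call pattern as _fs_iwss above: at most 4 features, IWSS threshold 0.25
mufs = MUFS(max_features=4, discrete=False)
selected = mufs.iwss(dataset, labels, 0.25).get_results()
print(selected)  # indices of the selected features, at most 4
```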
@@ -285,3 +285,15 @@ class Splitter_test(unittest.TestCase):
            Xs, computed = tcl.get_subspace(X, y, rs)
            self.assertListEqual(expected, list(computed))
            self.assertListEqual(X[:, expected].tolist(), Xs.tolist())
+
+    def test_get_iwss_subspaces(self):
+        results = [
+            (4, [1, 5, 9, 12]),
+            (6, [1, 5, 9, 12, 4, 15]),
+        ]
+        for rs, expected in results:
+            X, y = load_dataset(n_features=20, n_informative=7)
+            tcl = self.build(feature_select="iwss", random_state=rs)
+            Xs, computed = tcl.get_subspace(X, y, rs)
+            self.assertListEqual(expected, list(computed))
+            self.assertListEqual(X[:, expected].tolist(), Xs.tolist())
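To run just this new case, something like the following should work — the dotted test name assumes the test lives in stree/tests/Splitter_test.py, which this diff does not show:

```python
import unittest

# Assumed module path; adjust if the test file lives elsewhere
suite = unittest.defaultTestLoader.loadTestsFromName(
    "stree.tests.Splitter_test.Splitter_test.test_get_iwss_subspaces"
)
unittest.TextTestRunner(verbosity=2).run(suite)
```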