From 36ff3da26d8f200c4e1f75b2430e1ef96005a4be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?= Date: Mon, 13 Sep 2021 18:32:59 +0200 Subject: [PATCH] Update Docs --- docs/source/stree.md | 5 +- stree/Splitter.py | 72 +++++++++++++++++++++++++++- stree/Strees.py | 109 ++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 183 insertions(+), 3 deletions(-) diff --git a/docs/source/stree.md b/docs/source/stree.md index 5f014f5..1710485 100644 --- a/docs/source/stree.md +++ b/docs/source/stree.md @@ -1,9 +1,12 @@ # STree -[![Codeship Status for Doctorado-ML/STree](https://app.codeship.com/projects/8b2bd350-8a1b-0138-5f2c-3ad36f3eb318/status?branch=master)](https://app.codeship.com/projects/399170) +![CI](https://github.com/Doctorado-ML/STree/workflows/CI/badge.svg) [![codecov](https://codecov.io/gh/doctorado-ml/stree/branch/master/graph/badge.svg)](https://codecov.io/gh/doctorado-ml/stree) [![Codacy Badge](https://app.codacy.com/project/badge/Grade/35fa3dfd53a24a339344b33d9f9f2f3d)](https://www.codacy.com/gh/Doctorado-ML/STree?utm_source=github.com&utm_medium=referral&utm_content=Doctorado-ML/STree&utm_campaign=Badge_Grade) [![Language grade: Python](https://img.shields.io/lgtm/grade/python/g/Doctorado-ML/STree.svg?logo=lgtm&logoWidth=18)](https://lgtm.com/projects/g/Doctorado-ML/STree/context:python) +[![PyPI version](https://badge.fury.io/py/STree.svg)](https://badge.fury.io/py/STree) +![https://img.shields.io/badge/python-3.8%2B-blue](https://img.shields.io/badge/python-3.8%2B-brightgreen) +[![DOI](https://zenodo.org/badge/262658230.svg)](https://zenodo.org/badge/latestdoi/262658230) Oblique Tree classifier based on SVM nodes. The nodes are built and splitted with sklearn SVC models. Stree is a sklearn estimator and can be integrated in pipelines, grid searches, etc. 
diff --git a/stree/Splitter.py b/stree/Splitter.py index b972992..2f6e590 100644 --- a/stree/Splitter.py +++ b/stree/Splitter.py @@ -16,8 +16,28 @@ from mufs import MUFS class Snode: - """Nodes of the tree that keeps the svm classifier and if testing the + """ + Nodes of the tree that keeps the svm classifier and if testing the dataset assigned to it + + Parameters + ---------- + clf : SVC + Classifier used + X : np.ndarray + input dataset in train time (only in testing) + y : np.ndarray + input labels in train time + features : np.array + features used to compute hyperplane + impurity : float + impurity of the node + title : str + label describing the route to the node + weight : np.ndarray, optional + weights applied to input dataset in train time, by default None + scaler : StandardScaler, optional + scaler used if any, by default None """ def __init__( @@ -165,6 +185,55 @@ class Siterator: class Splitter: + """ + Splits a dataset in two based on different criteria + + Parameters + ---------- + clf : SVC, optional + classifier, by default None + criterion : str, optional + The function to measure the quality of a split (only used if + max_features != num_features). Supported criteria are “gini” for the + Gini impurity and “entropy” for the information gain., by default + None + feature_select : str, optional + The strategy used to choose the feature set at each node (only used if + max_features < num_features). Supported strategies are: “best”: sklearn + SelectKBest algorithm is used in every node to choose the max_features + best features. “random”: The algorithm generates 5 candidates and + chooses the best (max. info. gain) of them. "mutual": Chooses the best + features w.r.t. their mutual info with the label. "cfs": Apply + Correlation-based Feature Selection. 
"fcbf": Apply Fast Correlation- + Based, by default None + criteria : str, optional + Decides (just in case of a multi class classification) which column + (class) to use to split the dataset in a node. max_samples is + incompatible with 'ovo' multiclass_strategy, by default None + min_samples_split : int, optional + The minimum number of samples required to split an internal node. 0 + (default) for any, by default None + random_state : optional + Controls the pseudo random number generation for shuffling the data for + probability estimates. Ignored when probability is False. Pass an int + for reproducible output across multiple function calls, by + default None + normalize : bool, optional + If standardization of features should be applied on each node with the + samples that reach it, by default False + + Raises + ------ + ValueError + clf has to be a sklearn estimator + ValueError + criterion must be gini or entropy + ValueError + criteria has to be max_samples or impurity + ValueError + splitter must be in {random, best, mutual, cfs, fcbf} + """ + def __init__( self, clf: SVC = None, @@ -175,6 +244,7 @@ class Splitter: random_state=None, normalize=False, ): + self._clf = clf self._random_state = random_state if random_state is not None: diff --git a/stree/Strees.py b/stree/Strees.py index 8c033f7..a17bbc6 100644 --- a/stree/Strees.py +++ b/stree/Strees.py @@ -20,11 +20,117 @@ from .Splitter import Splitter, Snode, Siterator class Stree(BaseEstimator, ClassifierMixin): - """Estimator that is based on binary trees of svm nodes + """ + Estimator that is based on binary trees of svm nodes can deal with sample_weights in predict, used in boosting sklearn methods inheriting from BaseEstimator implements get_params and set_params methods inheriting from ClassifierMixin implement the attribute _estimator_type with "classifier" as value + + Parameters + ---------- + C : float, optional + Regularization parameter. 
The strength of the regularization is + inversely proportional to C. Must be strictly positive., by default 1.0 + kernel : str, optional + Specifies the kernel type to be used in the algorithm. It must be one + of ‘liblinear’, ‘linear’, ‘poly’ or ‘rbf’. liblinear uses + [liblinear](https://www.csie.ntu.edu.tw/~cjlin/liblinear/) library and + the rest uses [libsvm](https://www.csie.ntu.edu.tw/~cjlin/libsvm/) + library through scikit-learn library, by default "linear" + max_iter : int, optional + Hard limit on iterations within solver, or -1 for no limit., by default + 1e5 + random_state : int, optional + Controls the pseudo random number generation for shuffling the data for + probability estimates. Ignored when probability is False. Pass an int + for reproducible output across multiple function calls, by + default None + max_depth : int, optional + Specifies the maximum depth of the tree, by default None + tol : float, optional + Tolerance for stopping, by default 1e-4 + degree : int, optional + Degree of the polynomial kernel function (‘poly’). Ignored by all other + kernels., by default 3 + gamma : str, optional + Kernel coefficient for ‘rbf’, ‘poly’ and ‘sigmoid’. If gamma='scale' + (default) is passed then it uses 1 / (n_features * X.var()) as value + of gamma; if ‘auto’, uses 1 / n_features., by default "scale" + split_criteria : str, optional + Decides (just in case of a multi class classification) which column + (class) to use to split the dataset in a node. max_samples is + incompatible with 'ovo' multiclass_strategy, by default "impurity" + criterion : str, optional + The function to measure the quality of a split (only used if + max_features != num_features). Supported criteria are “gini” for the + Gini impurity and “entropy” for the information gain., by default + "entropy" + min_samples_split : int, optional + The minimum number of samples required to split an internal node. 
0 + (default) for any, by default 0 + max_features : optional + The number of features to consider when looking for the split: If int, + then consider max_features features at each split. If float, then + max_features is a fraction and int(max_features * n_features) features + are considered at each split. If “auto”, then max_features= + sqrt(n_features). If “sqrt”, then max_features=sqrt(n_features). If + “log2”, then max_features=log2(n_features). If None, then max_features= + n_features., by default None + splitter : str, optional + The strategy used to choose the feature set at each node (only used if + max_features < num_features). Supported strategies are: “best”: sklearn + SelectKBest algorithm is used in every node to choose the max_features + best features. “random”: The algorithm generates 5 candidates and + chooses the best (max. info. gain) of them. "mutual": Chooses the best + features w.r.t. their mutual info with the label. "cfs": Apply + Correlation-based Feature Selection. "fcbf": Apply Fast Correlation- + Based, by default "random" + multiclass_strategy : str, optional + Strategy to use with multiclass datasets, "ovo": one versus one. "ovr": + one versus rest, by default "ovo" + normalize : bool, optional + If standardization of features should be applied on each node with the + samples that reach it, by default False + + Attributes + ---------- + classes_ : ndarray of shape (n_classes,) + The classes labels. + + n_classes_ : int + The number of classes + + n_iter_ : int + Max number of iterations in classifier + + depth_ : int + Max depth of the tree + + n_features_ : int + The number of features when ``fit`` is performed. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + max_features_ : int + Number of features to use in hyperplane computation + + tree_ : Node + root of the tree + + X_ : ndarray + points to the input dataset + + y_ : ndarray + points to the input labels + + References + ---------- + R. Montañana, J. A. 
Gámez, J. M. Puerta, "STree: a single multi-class + oblique decision tree based on support vector machines.", 2021 LNAI... + + """ def __init__( @@ -45,6 +151,7 @@ class Stree(BaseEstimator, ClassifierMixin): multiclass_strategy: str = "ovo", normalize: bool = False, ): + self.max_iter = max_iter self.C = C self.kernel = kernel