diff --git a/Makefile b/Makefile
index ca256b9..542d18e 100644
--- a/Makefile
+++ b/Makefile
@@ -21,6 +21,9 @@ push: ## Push code with tags
 test: ## Run tests
 	python -m unittest -v stree.tests
 
+doc: ## Update documentation
+	make -C docs --makefile=Makefile html
+
 help: ## Show help message
 	@IFS=$$'\n' ; \
 	help_lines=(`fgrep -h "##" $(MAKEFILE_LIST) | fgrep -v fgrep | sed -e 's/\\$$//' | sed -e 's/##/:/'`); \
@@ -36,4 +39,4 @@ help: ## Show help message
 	printf "%-20s %s" $$help_command ; \
 	printf '\033[0m'; \
 	printf "%s\n" $$help_info; \
-	done
+	done
\ No newline at end of file
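For context, the `help` target (and the new `doc` rule above) relies on the self-documenting Makefile convention: every rule annotated with a trailing `## comment` is grepped out of the file and printed as a two-column index. A rough Python equivalent of that extraction, handy for sanity-checking what `make help` will list (reading `Makefile` from the current directory is an assumption of this sketch):

```python
import re

# Mimic the help target: collect "target: ## description" pairs from the
# Makefile and print them as the same two aligned columns `make help` shows.
with open("Makefile") as fh:
    for line in fh:
        match = re.match(r"^([\w-]+):.*?## (.*)$", line)
        if match:
            target, description = match.groups()
            print(f"{target:<20} {description}")
```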
diff --git a/README.md b/README.md
index 860b901..f8c4139 100644
--- a/README.md
+++ b/README.md
@@ -48,7 +48,7 @@ Can be found in
 | | criterion | {“gini”, “entropy”} | entropy | The function to measure the quality of a split (only used if max_features != num_features).<br>Supported criteria are “gini” for the Gini impurity and “entropy” for the information gain. |
 | | min_samples_split | \<int\> | 0 | The minimum number of samples required to split an internal node. 0 (default) for any |
 | | max_features | \<int\>, \<float\> <br>or {“auto”, “sqrt”, “log2”} | None | The number of features to consider when looking for the split:<br>If int, then consider max_features features at each split.<br>If float, then max_features is a fraction and int(max_features \* n_features) features are considered at each split.<br>If “auto”, then max_features=sqrt(n_features).<br>If “sqrt”, then max_features=sqrt(n_features).<br>If “log2”, then max_features=log2(n_features).<br>If None, then max_features=n_features. |
-| | splitter | {"best", "random"} | random | The strategy used to choose the feature set at each node (only used if max_features != num_features).<br>Supported strategies are “best” to choose the best feature set and “random” to choose a random combination.<br>The algorithm generates 5 candidates at most to choose from in both strategies. |
+| | splitter | {"best", "random"} | random | The strategy used to choose the feature set at each node (only used if max_features < num_features).<br>Supported strategies are “best” to choose the best feature set and “random” to choose a random combination.<br>The algorithm generates 5 candidates at most to choose from if random is selected. If best is selected, the sklearn SelectKBest algorithm is used at every node to choose the _max_features_ best features. |
 | | normalize | \<bool\> | False | If standardization of features should be applied on each node with the samples that reach it |
 
 \* Hyperparameter used by the support vector classifier of every node
diff --git a/setup.py b/setup.py
index b56823d..959d6dd 100644
--- a/setup.py
+++ b/setup.py
@@ -16,7 +16,11 @@ setuptools.setup(
     long_description=readme(),
     long_description_content_type="text/markdown",
     packages=setuptools.find_packages(),
-    url=stree.__url__,
+    url="https://github.com/Doctorado-ML/STree#stree",
+    project_urls={
+        "Code": "https://github.com/Doctorado-ML/STree",
+        "Documentation": "https://stree.readthedocs.io/en/latest/index.html",
+    },
     author=stree.__author__,
     author_email=stree.__author_email__,
     keywords="scikit-learn oblique-classifier oblique-decision-tree decision-\
diff --git a/stree/Strees.py b/stree/Strees.py
index 2f40eb1..3062364 100644
--- a/stree/Strees.py
+++ b/stree/Strees.py
@@ -653,12 +653,12 @@ class Stree(BaseEstimator, ClassifierMixin):
         self.n_features_ = X.shape[1]
         self.n_features_in_ = X.shape[1]
         self.max_features_ = self._initialize_max_features()
-        self.tree_ = self.train(X, y, sample_weight, 1, "root")
+        self.tree_ = self._train(X, y, sample_weight, 1, "root")
         self.X_ = X
         self.y_ = y
         return self
 
-    def train(
+    def _train(
         self,
         X: np.ndarray,
         y: np.ndarray,
@@ -723,10 +723,10 @@ class Stree(BaseEstimator, ClassifierMixin):
             node.make_predictor()
             return node
         node.set_up(
-            self.train(X_U, y_u, sw_u, depth + 1, title + f" - Up({depth+1})")
+            self._train(X_U, y_u, sw_u, depth + 1, title + f" - Up({depth+1})")
         )
         node.set_down(
-            self.train(
+            self._train(
                 X_D, y_d, sw_d, depth + 1, title + f" - Down({depth+1})"
             )
         )
@@ -892,6 +892,12 @@ class Stree(BaseEstimator, ClassifierMixin):
         elif self.max_features is None:
             max_features = self.n_features_
         elif isinstance(self.max_features, numbers.Integral):
+            if self.max_features > self.n_features_:
+                raise ValueError(
+                    "Invalid value for max_features. "
+                    "It cannot be greater than the number of features "
+                    f"({self.n_features_})"
+                )
             max_features = self.max_features
         else:  # float
             if self.max_features > 0.0:
diff --git a/stree/__init__.py b/stree/__init__.py
index d58a553..eddafae 100644
--- a/stree/__init__.py
+++ b/stree/__init__.py
@@ -6,6 +6,5 @@ __author__ = "Ricardo Montañana Gómez"
 __copyright__ = "Copyright 2020-2021, Ricardo Montañana Gómez"
 __license__ = "MIT License"
 __author_email__ = "ricardo.montanana@alu.uclm.es"
-__url__ = "https://github.com/doctorado-ml/stree"
 
 __all__ = ["Stree", "Snode", "Siterator", "Splitter"]
diff --git a/stree/tests/Stree_test.py b/stree/tests/Stree_test.py
index de9861c..d05b322 100644
--- a/stree/tests/Stree_test.py
+++ b/stree/tests/Stree_test.py
@@ -269,6 +269,12 @@ class Stree_test(unittest.TestCase):
         with self.assertRaises(ValueError):
             _ = clf._initialize_max_features()
 
+    def test_wrong_max_features(self):
+        X, y = load_dataset(n_features=15)
+        clf = Stree(max_features=16)
+        with self.assertRaises(ValueError):
+            clf.fit(X, y)
+
     def test_get_subspaces(self):
         dataset = np.random.random((10, 16))
         y = np.random.randint(0, 2, 10)
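The README rows above pack the whole `max_features` resolution into one table cell, and the new guard in `_initialize_max_features` rejects oversized integers. A plain-Python restatement of those documented rules, as a sketch of the behaviour rather than the library's actual code:

```python
import math

def resolve_max_features(max_features, n_features: int) -> int:
    """Resolve the max_features hyperparameter to a concrete feature
    count, following the rules in the README table above."""
    if max_features is None:
        return n_features
    if max_features in ("auto", "sqrt"):
        return int(math.sqrt(n_features))
    if max_features == "log2":
        return int(math.log2(n_features))
    if isinstance(max_features, int):
        if max_features > n_features:
            # Mirrors the ValueError added in _initialize_max_features.
            raise ValueError(
                "Invalid value for max_features. It cannot be greater "
                f"than the number of features ({n_features})"
            )
        return max_features
    # A float is a fraction: int(max_features * n_features) features.
    return int(max_features * n_features)

print(resolve_max_features("sqrt", 16))  # 4
print(resolve_max_features(0.5, 16))     # 8
```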
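At the API level, the change means an integer `max_features` larger than the training data's feature count now fails at `fit` time, exactly as `test_wrong_max_features` exercises. A minimal usage sketch, assuming scikit-learn's `make_classification` for the synthetic data (an illustration choice, not part of this diff):

```python
from sklearn.datasets import make_classification
from stree import Stree

# Synthetic binary problem with 15 features.
X, y = make_classification(n_samples=100, n_features=15, random_state=0)

# Documented hyperparameters: "best" runs SelectKBest at every node,
# "sqrt" caps each node's feature subspace at sqrt(n_features).
clf = Stree(splitter="best", max_features="sqrt", normalize=True)
clf.fit(X, y)

# An integer max_features above the 15 available features is now
# rejected at fit time with a ValueError.
try:
    Stree(max_features=16).fit(X, y)
except ValueError as err:
    print(err)
```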