diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 0ab0c91..26d1d67 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -13,12 +13,12 @@ jobs: strategy: matrix: os: [macos-latest, ubuntu-latest, windows-latest] - python: [3.8, 3.9, "3.10"] + python: [3.11, 3.12] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python }} - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python }} - name: Install dependencies @@ -35,7 +35,7 @@ jobs: coverage run -m unittest -v odte.tests coverage xml - name: Upload coverage to Codecov - uses: codecov/codecov-action@v3 + uses: codecov/codecov-action@v4 with: token: ${{ secrets.CODECOV_TOKEN }} files: ./coverage.xml diff --git a/.readthedocs.yaml b/.readthedocs.yaml new file mode 100644 index 0000000..8901566 --- /dev/null +++ b/.readthedocs.yaml @@ -0,0 +1,14 @@ +version: 2 + +sphinx: + configuration: docs/source/conf.py + +build: + os: ubuntu-22.04 + tools: + python: "3.12" + +python: + install: + - requirements: requirements.txt + - requirements: docs/requirements.txt diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..7152b80 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1 @@ +include README.md LICENSE \ No newline at end of file diff --git a/Makefile b/Makefile index ce4af08..c2772a5 100644 --- a/Makefile +++ b/Makefile @@ -1,44 +1,35 @@ SHELL := /bin/bash .DEFAULT_GOAL := help -.PHONY: coverage deps help lint push test doc build +.PHONY: audit coverage help lint test doc doc-clean build coverage: ## Run tests with coverage - coverage erase - coverage run -m unittest -v odte.tests - coverage report -m + @coverage erase + @coverage run -m unittest -v odte.tests + @coverage report -m -deps: ## Install dependencies - pip install -r requirements.txt - -devdeps: ## Install development dependencies - pip install black pip-audit flake8 mypy coverage - -lint: ## Lint and 
static-check - black odte - flake8 odte - mypy odte --exclude tests +lint: ## Lint source files + @black odte + @flake8 odte + @mypy odte audit: ## Audit pip - pip-audit - -push: ## Push code with tags - git push && git push --tags + @pip-audit test: ## Run tests - python -m unittest -v odte.tests + @python -m unittest -v odte.tests doc: ## Update documentation - make -C docs --makefile=Makefile html + @make -C docs --makefile=Makefile html build: ## Build package - rm -fr dist/* - rm -fr build/* - python setup.py sdist bdist_wheel + @rm -fr dist/* + @rm -fr build/* + @hatch build -doc-clean: ## Update documentation - make -C docs --makefile=Makefile clean +doc-clean: ## Clean documentation folders + @make -C docs --makefile=Makefile clean -help: ## Show help message +help: ## Show this help message @IFS=$$'\n' ; \ help_lines=(`fgrep -h "##" $(MAKEFILE_LIST) | fgrep -v fgrep | sed -e 's/\\$$//' | sed -e 's/##/:/'`); \ printf "%s\n\n" "Usage: make [task]"; \ diff --git a/README.md b/README.md index 2939904..5011a65 100644 --- a/README.md +++ b/README.md @@ -3,9 +3,9 @@ [![codecov](https://codecov.io/gh/Doctorado-ML/odte/branch/master/graph/badge.svg)](https://codecov.io/gh/Doctorado-ML/odte) [![Codacy Badge](https://app.codacy.com/project/badge/Grade/f4b5ef87584b4095b6e49aefbe594c82)](https://www.codacy.com/gh/Doctorado-ML/Odte/dashboard?utm_source=github.com&utm_medium=referral&utm_content=Doctorado-ML/Odte&utm_campaign=Badge_Grade) [![PyPI version](https://badge.fury.io/py/Odte.svg)](https://badge.fury.io/py/Odte) -![https://img.shields.io/badge/python-3.8%2B-blue](https://img.shields.io/badge/python-3.8%2B-brightgreen) +![https://img.shields.io/badge/python-3.11%2B-blue](https://img.shields.io/badge/python-3.11%2B-brightgreen) [![DOI](https://zenodo.org/badge/271595804.svg)](https://zenodo.org/badge/latestdoi/271595804) # Odte -Oblique Decision Tree Ensemble +Oblique Decision Tree Ensemble classifier based on [STree](https://github.com/doctorado-ml/stree) nodes. 
diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..d0c3cbf --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = source +BUILDDIR = build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 0000000..a8856fd --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,4 @@ +sphinx +sphinx-rtd-theme +myst-parser +stree \ No newline at end of file diff --git a/docs/source/api/Odte.rst b/docs/source/api/Odte.rst new file mode 100644 index 0000000..392d289 --- /dev/null +++ b/docs/source/api/Odte.rst @@ -0,0 +1,10 @@ +Odte +===== + +.. automodule:: odte +.. autoclass:: Odte + :members: + :undoc-members: + :private-members: + :show-inheritance: + :noindex: \ No newline at end of file diff --git a/docs/source/api/index.rst b/docs/source/api/index.rst new file mode 100644 index 0000000..26c39da --- /dev/null +++ b/docs/source/api/index.rst @@ -0,0 +1,8 @@ +API index +========= + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + Odte diff --git a/docs/source/conf.py b/docs/source/conf.py new file mode 100644 index 0000000..c60c5a3 --- /dev/null +++ b/docs/source/conf.py @@ -0,0 +1,54 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. 
For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent.parent.parent)) +import odte + +# -- Project information ----------------------------------------------------- + +project = "Odte" +copyright = "2024 Ricardo Montañana Gómez" +author = "Ricardo Montañana Gómez" + +# The full version, including alpha/beta/rc tags +version = release = odte.__version__ + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = ["myst_parser", "sphinx.ext.autodoc", "sphinx.ext.viewcode"] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ["_templates"] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = [] + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = "sphinx_rtd_theme" + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". 
+html_static_path = [] diff --git a/docs/source/example.png b/docs/source/example.png new file mode 100644 index 0000000..d4492e7 Binary files /dev/null and b/docs/source/example.png differ diff --git a/docs/source/hyperparameters.md b/docs/source/hyperparameters.md new file mode 100644 index 0000000..1fa52cd --- /dev/null +++ b/docs/source/hyperparameters.md @@ -0,0 +1,13 @@ +# Hyperparameters + +| | **Hyperparameter** | **Type/Values** | **Default** | | +| --- | ------------------- | -------------------------------------------------------------- | ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| | estimator | \ | Stree() | Base estimator used to build each element of the ensemble. | +| | n_jobs | \ | -1 | Specifies the number of threads used to build the ensemble (-1 equals to all cores available) | +| | random_state | \ | None | Controls the pseudo random number generation for shuffling the data for probability estimates. Ignored when probability is False.
Pass an int for reproducible output across multiple function calls | +| | max_features | \<int\>, \<float\>

or {“auto”, “sqrt”, “log2”} | None | The number of features to consider in each tree:
\<int\> _max_features_ features for each tree.
\<float\> _max_features_ is a fraction and int(_max_features_ \* _n_features_) features are considered for each tree.
“auto” _max_features_=sqrt(_n_features_)
“sqrt” _max_features_=sqrt(_n_features_)
“log2” _max_features_=log2(_n_features_)
_None_ _max_features_=_n_features_ | +| | max_samples | \<int\>, \<float\> | None | The number of samples to consider for bootstrap:
\<int\> _max_samples_ samples for each tree.
\ _max_samples_ is a fraction and int(_max_samples_ \* _n_samples_) samples for each tree. | +| | n_estimators | \ | 100 | The number of trees the ensemble is going to build | +| | be_hyperparams | \ | "{}" | Hyperparameteres passed to the base estimator, i.e. "{\\"C\\": 17, \\"kernel\\": \\"rbf\\"}"| + + diff --git a/docs/source/index.rst b/docs/source/index.rst new file mode 100644 index 0000000..8f699e4 --- /dev/null +++ b/docs/source/index.rst @@ -0,0 +1,14 @@ +Welcome to Odte's documentation! +================================= + +.. toctree:: + :caption: Contents: + :titlesonly: + + + odte + install + hyperparameters + api/index + +* :ref:`genindex` \ No newline at end of file diff --git a/docs/source/install.rst b/docs/source/install.rst new file mode 100644 index 0000000..c7db158 --- /dev/null +++ b/docs/source/install.rst @@ -0,0 +1,15 @@ +Install +======= + +The main stable release + +``pip install odte`` + +or the last development branch + +``pip install git+https://github.com/doctorado-ml/odte`` + +Tests +***** + +``python -m unittest -v odte.tests`` \ No newline at end of file diff --git a/docs/source/odte.md b/docs/source/odte.md new file mode 100644 index 0000000..11dc019 --- /dev/null +++ b/docs/source/odte.md @@ -0,0 +1,17 @@ +# Odte + +![CI](https://github.com/Doctorado-ML/Odte/workflows/CI/badge.svg) +[![CodeQL](https://github.com/Doctorado-ML/Odte/actions/workflows/codeql-analysis.yml/badge.svg)](https://github.com/Doctorado-ML/Odte/actions/workflows/codeql-analysis.yml) +[![codecov](https://codecov.io/gh/Doctorado-ML/odte/branch/master/graph/badge.svg)](https://codecov.io/gh/Doctorado-ML/odte) +[![Codacy Badge](https://app.codacy.com/project/badge/Grade/f4b5ef87584b4095b6e49aefbe594c82)](https://www.codacy.com/gh/Doctorado-ML/Odte/dashboard?utm_source=github.com&utm_medium=referral&utm_content=Doctorado-ML/Odte&utm_campaign=Badge_Grade) +[![PyPI version](https://badge.fury.io/py/Odte.svg)](https://badge.fury.io/py/Odte) 
+![https://img.shields.io/badge/python-3.11%2B-blue](https://img.shields.io/badge/python-3.11%2B-brightgreen) +[![DOI](https://zenodo.org/badge/271595804.svg)](https://zenodo.org/badge/latestdoi/271595804) + +Oblique Decision Tree Ensemble classifier based on [STree](https://github.com/doctorado-ml/stree) nodes. + +![Odte](./example.png) + +## License + +Odte is [MIT](https://github.com/doctorado-ml/odte/blob/master/LICENSE) licensed diff --git a/notebooks/benchmark.ipynb b/notebooks/benchmark.ipynb deleted file mode 100644 index a16d30d..0000000 --- a/notebooks/benchmark.ipynb +++ /dev/null @@ -1,388 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Compare Odte with different estimators" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Setup\n", - "Uncomment the next cell if Odte is not already installed" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#\n", - "# Google Colab setup\n", - "#\n", - "#!pip install git+https://github.com/doctorado-ml/odte\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import datetime, time\n", - "import numpy as np\n", - "import pandas as pd\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn import tree\n", - "from sklearn.metrics import classification_report, confusion_matrix, f1_score\n", - "from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier\n", - "from stree import Stree\n", - "from odte import Odte" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "if not os.path.isfile('data/creditcard.csv'):\n", - " !wget --no-check-certificate --content-disposition http://nube.jccm.es/index.php/s/Zs7SYtZQJ3RQ2H2/download\n", - " !tar xzf creditcard.tgz" - ] - }, - { - "cell_type": 
"markdown", - "metadata": {}, - "source": [ - "# Tests" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "print(datetime.date.today(), time.strftime(\"%H:%M:%S\"))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Load dataset and normalize values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Load Dataset\n", - "df = pd.read_csv('data/creditcard.csv')\n", - "df.shape\n", - "random_state = 2020" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "print(\"Fraud: {0:.3f}% {1}\".format(df.Class[df.Class == 1].count()*100/df.shape[0], df.Class[df.Class == 1].count()))\n", - "print(\"Valid: {0:.3f}% {1:,}\".format(df.Class[df.Class == 0].count()*100/df.shape[0], df.Class[df.Class == 0].count()))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Normalize Amount\n", - "from sklearn.preprocessing import RobustScaler\n", - "values = RobustScaler().fit_transform(df.Amount.values.reshape(-1, 1))\n", - "df['Amount_Scaled'] = values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# Remove unneeded features\n", - "y = df.Class.values\n", - "X = df.drop(['Class', 'Time', 'Amount'], axis=1).values\n", - "print(f\"X shape: {X.shape}\\ny shape: {y.shape}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Build the models" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Divide dataset\n", - "train_size = .7\n", - "Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=train_size, shuffle=True, random_state=random_state, stratify=y)" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Linear Tree\n", - "linear_tree = tree.DecisionTreeClassifier(random_state=random_state)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Random Forest\n", - "random_forest = RandomForestClassifier(random_state=random_state)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Stree\n", - "stree = Stree(random_state=random_state, C=.01)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# AdaBoost\n", - "adaboost = AdaBoostClassifier(random_state=random_state)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Gradient Boosting\n", - "gradient = GradientBoostingClassifier(random_state=random_state)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Oblique Decision Tree Ensemble\n", - "odte = Odte(random_state=random_state, max_features=\"auto\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Do the test" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def try_model(name, model):\n", - " print(f\"************************** {name} **********************\")\n", - " now = time.time()\n", - " model.fit(Xtrain, ytrain)\n", - " spent = time.time() - now\n", - " print(f\"Train Model {name} took: {spent:.4} seconds\")\n", - " predict = model.predict(Xtrain)\n", - " predictt = model.predict(Xtest)\n", - " print(f\"=========== {name} - Train {Xtrain.shape[0]:,} samples =============\",)\n", - " print(classification_report(ytrain, predict, digits=6))\n", - " print(f\"=========== {name} - Test {Xtest.shape[0]:,} samples =============\")\n", - " 
print(classification_report(ytest, predictt, digits=6))\n", - " print(\"Confusion Matrix in Train\")\n", - " print(confusion_matrix(ytrain, predict))\n", - " print(\"Confusion Matrix in Test\")\n", - " print(confusion_matrix(ytest, predictt))\n", - " return f1_score(ytest, predictt), spent" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# Train & Test models\n", - "models = {\n", - " 'Linear Tree':linear_tree, 'Random Forest': random_forest, 'Stree (SVM Tree)': stree, \n", - " 'AdaBoost model': adaboost, 'Odte model': odte #'Gradient Boost.': gradient\n", - "}\n", - "\n", - "best_f1 = 0\n", - "outcomes = []\n", - "for name, model in models.items():\n", - " f1, time_spent = try_model(name, model)\n", - " outcomes.append((name, f1, time_spent))\n", - " if f1 > best_f1:\n", - " best_model = name\n", - " best_time = time_spent\n", - " best_f1 = f1" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "print(\"*\"*110)\n", - "print(f\"*The best f1 model is {best_model}, with a f1 score: {best_f1:.4} in {best_time:.6} seconds with {train_size:,} samples in train dataset\")\n", - "print(\"*\"*110)\n", - "for name, f1, time_spent in outcomes:\n", - " print(f\"Model: {name}\\t Time: {time_spent:6.2f} seconds\\t f1: {f1:.4}\")" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "**************************************************************************************************************\n", - "*The best f1 model is Random Forest, with a f1 score: 0.8815 in 152.54 seconds with 0.7 samples in train dataset\n", - "**************************************************************************************************************\n", - "Model: Linear Tree\t Time: 13.52 seconds\t f1: 0.7645\n", - "Model: Random Forest\t Time: 152.54 seconds\t f1: 0.8815\n", - "Model: Stree (SVM Tree)\t Time: 32.55 seconds\t 
f1: 0.8603\n", - "Model: AdaBoost model\t Time: 47.34 seconds\t f1: 0.7509\n", - "Model: Gradient Boost.\t Time: 244.12 seconds\t f1: 0.5259" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "```\n", - "******************************************************************************************************************\n", - "*The best f1 model is Random Forest, with a f1 score: 0.8815 in 218.966 seconds with 0.7 samples in train dataset\n", - "******************************************************************************************************************\n", - "Model: Linear Tree Time: 23.05 seconds\t f1: 0.7645\n", - "Model: Random Forest\t Time: 218.97 seconds\t f1: 0.8815\n", - "Model: Stree (SVM Tree)\t Time: 49.45 seconds\t f1: 0.8467\n", - "Model: AdaBoost model\t Time: 73.83 seconds\t f1: 0.7509\n", - "Model: Gradient Boost.\t Time: 388.69 seconds\t f1: 0.5259\n", - "Model: Neural Network\t Time: 25.47 seconds\t f1: 0.8328\n", - "Model: Odte \t Time:2134.25 seconds\t f1: 0.8385\n", - "```" - ] - } - ], - "metadata": { - "hide_input": false, - "kernelspec": { - "display_name": "Python 3.7.6 64-bit ('general': venv)", - "language": "python", - "name": "python37664bitgeneralvenvfbd0a23e74cf4e778460f5ffc6761f39" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6-final" - }, - "toc": { - "base_numbering": 1, - "nav_menu": {}, - "number_sections": true, - "sideBar": true, - "skip_h1_title": false, - "title_cell": "Table of Contents", - "title_sidebar": "Contents", - "toc_cell": false, - "toc_position": {}, - "toc_section_display": true, - "toc_window_display": false - }, - "varInspector": { - "cols": { - "lenName": 16, - "lenType": 16, - "lenVar": 40 - }, - "kernels_config": { - "python": { - "delete_cmd_postfix": "", - "delete_cmd_prefix": 
"del ", - "library": "var_list.py", - "varRefreshCmd": "print(var_dic_list())" - }, - "r": { - "delete_cmd_postfix": ") ", - "delete_cmd_prefix": "rm(", - "library": "var_list.r", - "varRefreshCmd": "cat(var_dic_list()) " - } - }, - "position": { - "height": "392px", - "left": "1518px", - "right": "20px", - "top": "40px", - "width": "392px" - }, - "types_to_exclude": [ - "module", - "function", - "builtin_function_or_method", - "instance", - "_Feature" - ], - "window_display": true - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/notebooks/wine_iris.ipynb b/notebooks/wine_iris.ipynb deleted file mode 100644 index c5d8883..0000000 --- a/notebooks/wine_iris.ipynb +++ /dev/null @@ -1,174 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import datetime, time\n", - "import numpy as np\n", - "import pandas as pd\n", - "from sklearn.model_selection import train_test_split, cross_validate\n", - "from sklearn import tree\n", - "from sklearn.metrics import classification_report, confusion_matrix, f1_score\n", - "from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier\n", - "from stree import Stree\n", - "from odte import Odte\n", - "\n", - "random_state = 1" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.datasets import load_wine\n", - "X, y = load_wine(return_X_y=True)\n", - "Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=.2, random_state=random_state)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "n_estimators = 20\n", - "clf = {}\n", - "clf[\"stree\"] = Stree(random_state=random_state, max_depth=5)\n", - "clf[\"stree\"].set_params(**dict(splitter=\"best\", kernel=\"linear\", max_features=\"auto\"))\n", - "clf[\"odte\"] = Odte(n_jobs=-1, estimator=clf[\"stree\"], 
random_state=random_state, n_estimators=n_estimators, max_features=.8)\n", - "clf[\"adaboost\"] = AdaBoostClassifier(estimator=clf[\"stree\"], n_estimators=n_estimators, random_state=random_state, algorithm=\"SAMME\")\n", - "clf[\"bagging\"] = BaggingClassifier(estimator=clf[\"stree\"], n_estimators=n_estimators)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "print(\"*\"*30,\"Results for wine\", \"*\"*30)\n", - "for clf_type, item in clf.items():\n", - " print(f\"Training {clf_type}...\")\n", - " now = time.time()\n", - " item.fit(Xtrain, ytrain)\n", - " print(f\"Score: {item.score(Xtest, ytest) * 100:.3f} in {time.time()-now:.2f} seconds\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.datasets import load_iris\n", - "X, y = load_iris(return_X_y=True)\n", - "Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=.2, random_state=random_state)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "n_estimators = 10\n", - "clf = {}\n", - "clf[\"stree\"] = Stree(random_state=random_state, max_depth=3)\n", - "clf[\"odte\"] = Odte(n_jobs=-1, random_state=random_state, n_estimators=n_estimators, max_features=1.0)\n", - "clf[\"adaboost\"] = AdaBoostClassifier(estimator=clf[\"stree\"], n_estimators=n_estimators, random_state=random_state, algorithm=\"SAMME\")\n", - "clf[\"bagging\"] = BaggingClassifier(estimator=clf[\"stree\"], n_estimators=n_estimators)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "print(\"*\"*30,\"Results for iris\", \"*\"*30)\n", - "for clf_type, item in clf.items():\n", - " print(f\"Training {clf_type}...\")\n", - " now = time.time()\n", - " item.fit(Xtrain, ytrain)\n", - " print(f\"Score: {item.score(Xtest, ytest) * 100:.3f} in 
{time.time()-now:.2f} seconds\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "cross = cross_validate(estimator=clf[\"odte\"], X=X, y=y, n_jobs=-1, return_train_score=True)\n", - "print(cross)\n", - "print(f\"{np.mean(cross['test_score'])*100:.3f} +- {np.std(cross['test_score']):.3f}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "cross = cross_validate(estimator=clf[\"adaboost\"], X=X, y=y, n_jobs=-1, return_train_score=True)\n", - "print(cross)\n", - "print(f\"{np.mean(cross['test_score'])*100:.3f} +- {np.std(cross['test_score']):.3f}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from sklearn.utils.estimator_checks import check_estimator\n", - "# Make checks one by one\n", - "c = 0\n", - "checks = check_estimator(Odte(), generate_only=True)\n", - "for check in checks:\n", - " c += 1\n", - " print(c, check[1])\n", - " check[1](check[0])" - ] - } - ], - "metadata": { - "interpreter": { - "hash": "da86226729227d0e8962a5ec29ea906307507ca2c30ceaaf651c09a617630939" - }, - "kernelspec": { - "display_name": "Python 3.9.2 64-bit ('general': venv)", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.7" - }, - "orig_nbformat": 2 - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/odte/Odte.py b/odte/Odte.py index 4022f5f..a48bc26 100644 --- a/odte/Odte.py +++ b/odte/Odte.py @@ -33,10 +33,10 @@ class Odte(BaseEnsemble, ClassifierMixin): # n_jobs = -1 to use all available cores n_jobs: int = -1, estimator: BaseEstimator = Stree(), - random_state: int = 0, + random_state: Optional[int] = None, 
max_features: Optional[Union[str, int, float]] = None, max_samples: Optional[Union[int, float]] = None, - n_estimators: int = 10, + n_estimators: int = 100, be_hyperparams: str = "{}", ): super().__init__( @@ -62,7 +62,10 @@ class Odte(BaseEnsemble, ClassifierMixin): ) def fit( - self, X: np.ndarray, y: np.ndarray, sample_weight: np.ndarray = None + self, + X: np.ndarray, + y: np.ndarray, + sample_weight: Optional[np.ndarray] = None, ) -> Odte: # Check parameters are Ok. if self.n_estimators < 3: @@ -100,9 +103,6 @@ class Odte(BaseEnsemble, ClassifierMixin): tdepth += depth tnodes += nodes tleaves += leaves - # self.depth_ = tdepth / self.n_estimators - # self.leaves_ = tleaves / self.n_estimators - # self.nodes_ = tnodes / self.n_estimators self.depth_ = tdepth self.leaves_ = tleaves self.nodes_ = tnodes @@ -113,6 +113,11 @@ class Odte(BaseEnsemble, ClassifierMixin): n_samples = X.shape[0] boot_samples = self._get_bootstrap_n_samples(n_samples) estimator = clone(self.estimator_) + defined_state = ( + random.randint(0, 2**31) + if self.random_state is None + else self.random_state + ) return Parallel(n_jobs=self.n_jobs, prefer="threads")( # type: ignore delayed(Odte._parallel_build_tree)( estimator, @@ -125,7 +130,7 @@ class Odte(BaseEnsemble, ClassifierMixin): self.be_hyperparams, ) for random_seed in range( - self.random_state, self.random_state + self.n_estimators + defined_state, defined_state + self.n_estimators ) ) diff --git a/odte/__init__.py b/odte/__init__.py index 27f5d1b..d4762e5 100644 --- a/odte/__init__.py +++ b/odte/__init__.py @@ -1,3 +1,4 @@ +from ._version import __version__ from .Odte import Odte __author__ = "Ricardo Montañana Gómez" @@ -5,4 +6,4 @@ __copyright__ = "Copyright 2020-2021, Ricardo Montañana Gómez" __license__ = "MIT License" __author_email__ = "ricardo.montanana@alu.uclm.es" -__all__ = ["Odte"] +__all__ = ["__version__", "Odte"] diff --git a/odte/_version.py b/odte/_version.py index d7b30e1..5becc17 100644 --- a/odte/_version.py 
+++ b/odte/_version.py @@ -1 +1 @@ -__version__ = "0.3.6" +__version__ = "1.0.0" diff --git a/odte/tests/Odte_tests.py b/odte/tests/Odte_tests.py index 79763a4..ce4a6f8 100644 --- a/odte/tests/Odte_tests.py +++ b/odte/tests/Odte_tests.py @@ -180,7 +180,7 @@ class Odte_test(unittest.TestCase): warnings.filterwarnings("ignore", category=RuntimeWarning) from sklearn.utils.estimator_checks import check_estimator - check_estimator(Odte()) + check_estimator(Odte(n_estimators=10)) def test_nodes_leaves_not_fitted(self): tclf = Odte( diff --git a/odte/tests/__init__.py b/odte/tests/__init__.py index 8f20244..b76dda9 100644 --- a/odte/tests/__init__.py +++ b/odte/tests/__init__.py @@ -1,4 +1,3 @@ -# type: ignore from .Odte_tests import Odte_test __all__ = ["Odte_test"] diff --git a/pyproject.toml b/pyproject.toml index 9bd6669..1689ca4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,65 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "Odte" +description = "Oblique decision tree Ensemble." 
+readme = "README.md" +license = { file = "LICENSE" } +authors = [ + { name = "Ricardo Montañana", email = "ricardo.montanana@alu.uclm.es" }, +] +dynamic = ['version'] +dependencies = ["stree>=1.4"] +requires-python = ">=3.11" +keywords = [ + "scikit-learn", + "oblique-classifier", + "oblique-decision-tree", + "decision-tree", + "ensemble", + "svm", +] +classifiers = [ + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Science/Research", + "Intended Audience :: Developers", + "Topic :: Software Development", + "Topic :: Scientific/Engineering", + "License :: OSI Approved :: MIT License", + "Natural Language :: English", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", +] + +[project.optional-dependencies] +dev = ["black", "flake8", "coverage", "mypy", "pandas", "hatch", "pip-audit"] +doc = ["sphinx", "myst-parser", "sphinx_rtd_theme", "sphinx-autodoc-typehints"] + +[project.urls] +Home = "https://github.com/doctorado-ml/odte" +Docs = "https://odte.readthedocs.io/en/latest/index.html" + +[tool.hatch.version] +path = "odte/_version.py" + +[tool.mypy] +exclude = ['tests'] + +[tool.coverage.run] +branch = true +source = ["odte"] +command_line = "-m unittest discover -s odte.tests" + +[tool.coverage.report] +show_missing = true +fail_under = 100 + [tool.black] line-length = 79 +target_version = ['py311'] include = '\.pyi?$' exclude = ''' /( @@ -13,4 +73,4 @@ exclude = ''' | build | dist )/ -''' \ No newline at end of file +''' diff --git a/requirements.txt b/requirements.txt index a8e5409..334d94e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1 @@ -stree>=1.3.1 +stree>=1.4 diff --git a/setup.py b/setup.py deleted file mode 100644 index 2cab1c0..0000000 --- a/setup.py +++ /dev/null @@ -1,48 +0,0 @@ -import setuptools -import os - - -def readme(): - with open("README.md") as f: - return f.read() - - -def get_data(field): - item = "" - file_name = 
"_version.py" if field == "version" else "__init__.py" - with open(os.path.join("odte", file_name)) as f: - for line in f.readlines(): - if line.startswith(f"__{field}__"): - delim = '"' if '"' in line else "'" - item = line.split(delim)[1] - break - else: - raise RuntimeError(f"Unable to find {field} string.") - return item - - -setuptools.setup( - name="Odte", - version=get_data("version"), - license=get_data("license"), - description="Oblique decision tree Ensemble", - long_description=readme(), - long_description_content_type="text/markdown", - packages=setuptools.find_packages(), - url="https://github.com/doctorado-ml/odte", - author=get_data("author"), - author_email=get_data("author_email"), - keywords="scikit-learn oblique-classifier oblique-decision-tree decision-\ - tree ensemble svm svc", - classifiers=[ - "Development Status :: 4 - Beta", - "License :: OSI Approved :: MIT License", - "Programming Language :: Python :: 3.8", - "Natural Language :: English", - "Topic :: Scientific/Engineering :: Artificial Intelligence", - "Intended Audience :: Science/Research", - ], - install_requires=["stree"], - test_suite="odte.tests", - zip_safe=False, -)