Compare commits

...

40 Commits

Author SHA1 Message Date
0f4ea8542e
Merge branch 'master' of github.com:Doctorado-ML/odte 2025-05-20 11:42:39 +02:00
0fa37dfdf3
Fix scikit-learn version in requirements for tests 2025-05-20 11:42:00 +02:00
582ff44cb1
Update return types in nodes_leaves 2025-04-09 11:19:37 +02:00
66fe0bbe48
Add build 1 to version 2024-09-03 18:15:04 +02:00
1d906e2cb3
Fix doc link in project 2024-09-03 18:08:56 +02:00
Ricardo Montañana Gómez
31a3753046
Merge pull request #9 from Doctorado-ML/v1.0
Update doc config
2024-08-16 18:57:53 +02:00
3c0f03d27e
Add hyperparameters.md to doc 2024-08-16 16:33:28 +02:00
d644031f3f
Add hyperparameters.md to doc 2024-08-16 16:33:04 +02:00
1790f7fee7
Update readme 2024-08-15 13:47:30 +02:00
624f79af0d
Update readthedocs config 2024-08-15 13:30:19 +02:00
40031b7d52
Update readme 2024-08-15 13:23:20 +02:00
bcc763e656
Update doc config
Change build tool to hatch
2024-08-15 13:16:51 +02:00
b19264b1eb
Set default estimator to STree 2024-07-28 18:30:17 +02:00
02e75b3c3e
Fix depth/leaves/nodes no longer return average 2023-11-27 13:53:15 +01:00
52d1095161
Add separate methods to return nodes/leaves/depth 2023-11-27 10:33:47 +01:00
f9b83adfee
Change default n_estimators to 10 instead of 100 2023-10-10 09:34:43 +02:00
Ricardo Montañana Gómez
382a420791
ci: ⬆️ Upgrade github actions for codeql and README to add badge 2023-01-15 01:58:19 +01:00
Ricardo Montañana Gómez
7aa4156e51
ci: ⬆️ Update github action setup-python 2023-01-15 01:50:41 +01:00
Ricardo Montañana Gómez
0df2f243a5
ci: ⬆️ Upgrade github actions 2023-01-15 01:45:39 +01:00
Ricardo Montañana Gómez
d3ceb3ce46
refactor: 🔖 Update requirements and version info 2023-01-15 01:32:26 +01:00
Ricardo Montañana Gómez
cabf926eb1
Update to scikit-learn 1.2 2023-01-14 21:38:11 +01:00
Ricardo Montañana Gómez
7300bd66db
Merge pull request #8 from Doctorado-ML/fix_python_random_init
Fix python random init
2022-04-29 10:22:33 +02:00
114f53d5e8
Update version file 2022-04-29 10:07:05 +02:00
267a17a708
Remove unneeded Random module from tests
Update pre-commit config
2022-04-20 11:25:45 +02:00
e01ca43cf9
Fix python random init 2022-03-10 13:17:56 +01:00
Ricardo Montañana Gómez
98cadc7eeb
Merge pull request #6 from Doctorado-ML/parallel_init
Parallel init error
2022-03-02 13:12:13 +01:00
dda3517090
merge two tests parallel-sequential 2022-02-26 11:30:12 +01:00
877c24f3f4
fix rc1 2022-02-25 19:24:44 +01:00
9e5fe8c791
Fix flake req. remove uneeded sys mod 2022-02-23 12:10:12 +01:00
3766886190
Fix np.random initialization 2022-02-23 12:02:59 +01:00
cd7c7f3938
First try to fix initialization issue 2022-02-22 20:40:35 +01:00
aff96bb97d
Fix github actions lint mistake 2022-01-11 13:13:29 +01:00
cdaf7ecff2
Merge branch 'master' of github.com:doctorado-ml/odte 2022-01-11 12:57:15 +01:00
42bcae9736
Add audit and devdeps to Makefile 2022-01-11 12:52:36 +01:00
7481af3068
Add version info to model 2021-12-17 13:47:25 +01:00
Ricardo Montañana Gómez
2ebec2d588
Merge pull request #4 from Doctorado-ML/be_hyperparams
Add base estimator hyperparameters
2021-11-24 13:10:41 +01:00
67424e06be
Add python versions 3.9 & 3.10 to github actions 2021-11-24 12:54:25 +01:00
525ee93fc3
(#3)Add base estimator hyperparameters 2021-11-24 12:34:36 +01:00
74343a15e1
Fix nodes_leaves for base_estimator 2021-11-24 10:50:19 +01:00
3558c946a8
Update doi in README and CITATION 2021-11-22 16:55:17 +01:00
28 changed files with 496 additions and 781 deletions

View File

@@ -2,12 +2,12 @@ name: "CodeQL"
 on:
   push:
-    branches: [ master ]
+    branches: [master]
   pull_request:
     # The branches below must be a subset of the branches above
-    branches: [ master ]
+    branches: [master]
   schedule:
-    - cron: '16 17 * * 3'
+    - cron: "16 17 * * 3"
 jobs:
   analyze:
@@ -17,40 +17,40 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        language: [ 'python' ]
+        language: ["python"]
         # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python' ]
         # Learn more:
         # https://docs.github.com/en/free-pro-team@latest/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#changing-the-languages-that-are-analyzed
     steps:
       - name: Checkout repository
         uses: actions/checkout@v2
       # Initializes the CodeQL tools for scanning.
       - name: Initialize CodeQL
-        uses: github/codeql-action/init@v1
+        uses: github/codeql-action/init@v2
         with:
           languages: ${{ matrix.language }}
           # If you wish to specify custom queries, you can do so here or in a config file.
           # By default, queries listed here will override any specified in a config file.
           # Prefix the list here with "+" to use these queries and those in the config file.
           # queries: ./path/to/local/query, your-org/your-repo/queries@main
       # Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
       # If this step fails, then you should remove it and run the build manually (see below)
       - name: Autobuild
-        uses: github/codeql-action/autobuild@v1
+        uses: github/codeql-action/autobuild@v2
       # Command-line programs to run using the OS shell.
       # 📚 https://git.io/JvXDl
       # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines
       # and modify them (or add more) to build your code if your project
       # uses a compiled language
       #- run: |
       #   make bootstrap
       #   make release
       - name: Perform CodeQL Analysis
-        uses: github/codeql-action/analyze@v1
+        uses: github/codeql-action/analyze@v2

View File

@@ -13,12 +13,12 @@ jobs:
     strategy:
       matrix:
         os: [macos-latest, ubuntu-latest, windows-latest]
-        python: [3.8]
+        python: [3.11, 3.12]
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
       - name: Set up Python ${{ matrix.python }}
-        uses: actions/setup-python@v2
+        uses: actions/setup-python@v5
         with:
           python-version: ${{ matrix.python }}
       - name: Install dependencies
@@ -35,7 +35,7 @@ jobs:
           coverage run -m unittest -v odte.tests
           coverage xml
       - name: Upload coverage to Codecov
-        uses: codecov/codecov-action@v1
+        uses: codecov/codecov-action@v4
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          files: ./coverage.xml

View File

@@ -1,23 +1,23 @@
 repos:
   - repo: https://github.com/ambv/black
-    rev: 20.8b1
+    rev: 22.3.0
     hooks:
       - id: black
         exclude: ".virtual_documents"
         language_version: python3.9
   - repo: https://gitlab.com/pycqa/flake8
-    rev: 3.8.4
+    rev: 3.9.2
     hooks:
       - id: flake8
         exclude: ".virtual_documents"
   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: "v0.790" # Use the sha / tag you want to point at
+    rev: "v0.942" # Use the sha / tag you want to point at
     hooks:
       - id: mypy
         #args: [--strict, --ignore-missing-imports]
         exclude: odte/tests
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v3.4.0
+    rev: v4.2.0
     hooks:
       - id: trailing-whitespace
       - id: check-case-conflict

14
.readthedocs.yaml Normal file
View File

@@ -0,0 +1,14 @@
version: 2

sphinx:
  configuration: docs/source/conf.py

build:
  os: ubuntu-22.04
  tools:
    python: "3.12"

python:
  install:
    - requirements: requirements.txt
    - requirements: docs/requirements.txt

View File

@@ -6,6 +6,6 @@ authors:
   orcid: "https://orcid.org/0000-0003-3242-5452"
 title: "Odte"
 version: 0.3.1
-doi: 10.5281/zenodo.5504083
+doi: 10.5281/zenodo.5718701
 date-released: 2021-11-02
 url: "https://github.com/Doctorado-ML/odte"

1
MANIFEST.in Normal file
View File

@@ -0,0 +1 @@
include README.md LICENSE

View File

@@ -1,38 +1,35 @@
 SHELL := /bin/bash
 .DEFAULT_GOAL := help
-.PHONY: coverage deps help lint push test doc build
+.PHONY: audit coverage help lint test doc doc-clean build
 coverage: ## Run tests with coverage
-	coverage erase
-	coverage run -m unittest -v odte.tests
-	coverage report -m
+	@coverage erase
+	@coverage run -m unittest -v odte.tests
+	@coverage report -m
-deps: ## Install dependencies
-	pip install -r requirements.txt
-lint: ## Lint and static-check
-	black odte
-	flake8 odte
-	mypy odte --exclude tests
-push: ## Push code with tags
-	git push && git push --tags
+lint: ## Lint source files
+	@black odte
+	@flake8 odte
+	@mypy odte
+audit: ## Audit pip
+	@pip-audit
 test: ## Run tests
-	python -m unittest -v odte.tests
+	@python -m unittest -v odte.tests
 doc: ## Update documentation
-	make -C docs --makefile=Makefile html
+	@make -C docs --makefile=Makefile html
 build: ## Build package
-	rm -fr dist/*
-	rm -fr build/*
-	python setup.py sdist bdist_wheel
+	@rm -fr dist/*
+	@rm -fr build/*
+	@hatch build
-doc-clean: ## Update documentation
-	make -C docs --makefile=Makefile clean
+doc-clean: ## Clean documentation folders
+	@make -C docs --makefile=Makefile clean
-help: ## Show help message
+help: ## Show this help message
 	@IFS=$$'\n' ; \
 	help_lines=(`fgrep -h "##" $(MAKEFILE_LIST) | fgrep -v fgrep | sed -e 's/\\$$//' | sed -e 's/##/:/'`); \
 	printf "%s\n\n" "Usage: make [task]"; \

View File

@@ -1,10 +1,11 @@
+# Odte
 ![CI](https://github.com/Doctorado-ML/Odte/workflows/CI/badge.svg)
+[![CodeQL](https://github.com/Doctorado-ML/Odte/actions/workflows/codeql-analysis.yml/badge.svg)](https://github.com/Doctorado-ML/Odte/actions/workflows/codeql-analysis.yml)
 [![codecov](https://codecov.io/gh/Doctorado-ML/odte/branch/master/graph/badge.svg)](https://codecov.io/gh/Doctorado-ML/odte)
 [![Codacy Badge](https://app.codacy.com/project/badge/Grade/f4b5ef87584b4095b6e49aefbe594c82)](https://www.codacy.com/gh/Doctorado-ML/Odte/dashboard?utm_source=github.com&utm_medium=referral&utm_content=Doctorado-ML/Odte&utm_campaign=Badge_Grade)
 [![PyPI version](https://badge.fury.io/py/Odte.svg)](https://badge.fury.io/py/Odte)
-![https://img.shields.io/badge/python-3.8%2B-blue](https://img.shields.io/badge/python-3.8%2B-brightgreen)
+![https://img.shields.io/badge/python-3.11%2B-blue](https://img.shields.io/badge/python-3.11%2B-brightgreen)
-doi
+[![DOI](https://zenodo.org/badge/271595804.svg)](https://zenodo.org/badge/latestdoi/271595804)
-# Odte
-Oblique Decision Tree Ensemble
+Oblique Decision Tree Ensemble classifier based on [STree](https://github.com/doctorado-ml/stree) nodes.

20
docs/Makefile Normal file
View File

@@ -0,0 +1,20 @@
# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
SOURCEDIR = source
BUILDDIR = build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

4
docs/requirements.txt Normal file
View File

@@ -0,0 +1,4 @@
sphinx
sphinx-rtd-theme
myst-parser
stree

10
docs/source/api/Odte.rst Normal file
View File

@@ -0,0 +1,10 @@
Odte
=====

.. automodule:: odte
.. autoclass:: Odte
   :members:
   :undoc-members:
   :private-members:
   :show-inheritance:
   :noindex:

View File

@@ -0,0 +1,8 @@
API index
=========

.. toctree::
   :maxdepth: 2
   :caption: Contents:

   Odte

54
docs/source/conf.py Normal file
View File

@@ -0,0 +1,54 @@
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html
# -- Path setup --------------------------------------------------------------
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
import odte
# -- Project information -----------------------------------------------------
project = "Odte"
copyright = "2024 Ricardo Montañana Gómez"
author = "Ricardo Montañana Gómez"
# The full version, including alpha/beta/rc tags
version = release = odte.__version__
# -- General configuration ---------------------------------------------------
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = ["myst_parser", "sphinx.ext.autodoc", "sphinx.ext.viewcode"]
# Add any paths that contain templates here, relative to this directory.
templates_path = ["_templates"]
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = []
# -- Options for HTML output -------------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = "sphinx_rtd_theme"
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = []

BIN
docs/source/example.png Normal file

Binary file not shown (3.1 MiB).

View File

@@ -0,0 +1,13 @@
# Hyperparameters

| **Hyperparameter** | **Type/Values** | **Default** | **Description** |
| ------------------ | --------------- | ----------- | --------------- |
| estimator | \<sklearn.BaseEstimator\> | Stree() | Base estimator used to build each element of the ensemble. |
| n_jobs | \<int\> | -1 | Number of threads used to build the ensemble (-1 means all available cores). |
| random_state | \<int\> | None | Controls the pseudo-random number generation for shuffling the data for probability estimates. Ignored when probability is False.<br>Pass an int for reproducible output across multiple function calls. |
| max_features | \<int\>, \<float\><br>or {"auto", "sqrt", "log2"} | None | The number of features to consider in each tree:<br>\<int\>: _max_features_ features for each tree.<br>\<float\>: _max_features_ is a fraction and int(_max_features_ \* _n_features_) features are considered for each tree.<br>"auto": _max_features_=sqrt(_n_features_).<br>"sqrt": _max_features_=sqrt(_n_features_).<br>"log2": _max_features_=log2(_n_features_).<br>None: _max_features_=_n_features_. |
| max_samples | \<int\>, \<float\> | None | The number of samples to draw for each bootstrap:<br>\<int\>: _max_samples_ samples for each tree.<br>\<float\>: _max_samples_ is a fraction and int(_max_samples_ \* _n_samples_) samples are taken for each tree. |
| n_estimators | \<int\> | 100 | The number of trees the ensemble builds. |
| be_hyperparams | \<str\> | "{}" | Hyperparameters passed to the base estimator, e.g. "{\\"C\\": 17, \\"kernel\\": \\"rbf\\"}". |
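
For illustration, a minimal sketch of how these hyperparameters combine, assuming odte and its stree dependency are installed (the parameter values are illustrative, not recommendations):

```python
import json

from odte import Odte

# be_hyperparams is a JSON string forwarded to every base estimator,
# mirroring the escaped example in the table above
be = json.dumps({"C": 17, "kernel": "rbf"})
clf = Odte(
    n_estimators=10,      # number of trees in the ensemble
    max_features="sqrt",  # sqrt(n_features) features per tree
    max_samples=0.5,      # bootstrap half of the samples for each tree
    random_state=0,       # reproducible across calls
    be_hyperparams=be,
)
```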

14
docs/source/index.rst Normal file
View File

@@ -0,0 +1,14 @@
Welcome to Odte's documentation!
=================================

.. toctree::
   :caption: Contents:
   :titlesonly:

   odte
   install
   hyperparameters
   api/index

* :ref:`genindex`

15
docs/source/install.rst Normal file
View File

@@ -0,0 +1,15 @@
Install
=======

The main stable release:

``pip install odte``

or the latest development branch:

``pip install git+https://github.com/doctorado-ml/odte``

Tests
*****

``python -m unittest -v odte.tests``

17
docs/source/odte.md Normal file
View File

@@ -0,0 +1,17 @@
# Odte
![CI](https://github.com/Doctorado-ML/Odte/workflows/CI/badge.svg)
[![CodeQL](https://github.com/Doctorado-ML/Odte/actions/workflows/codeql-analysis.yml/badge.svg)](https://github.com/Doctorado-ML/Odte/actions/workflows/codeql-analysis.yml)
[![codecov](https://codecov.io/gh/Doctorado-ML/odte/branch/master/graph/badge.svg)](https://codecov.io/gh/Doctorado-ML/odte)
[![Codacy Badge](https://app.codacy.com/project/badge/Grade/f4b5ef87584b4095b6e49aefbe594c82)](https://www.codacy.com/gh/Doctorado-ML/Odte/dashboard?utm_source=github.com&utm_medium=referral&utm_content=Doctorado-ML/Odte&utm_campaign=Badge_Grade)
[![PyPI version](https://badge.fury.io/py/Odte.svg)](https://badge.fury.io/py/Odte)
![https://img.shields.io/badge/python-3.11%2B-blue](https://img.shields.io/badge/python-3.11%2B-brightgreen)
[![DOI](https://zenodo.org/badge/271595804.svg)](https://zenodo.org/badge/latestdoi/271595804)
Oblique Decision Tree Ensemble classifier based on [STree](https://github.com/doctorado-ml/stree) nodes.
![Odte](./example.png)
## License
Odte is [MIT](https://github.com/doctorado-ml/odte/blob/master/LICENSE) licensed
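
A minimal usage sketch, assuming odte and its stree dependency are installed (the dataset and split here are illustrative):

```python
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split

from odte import Odte

X, y = load_wine(return_X_y=True)
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, random_state=0)
# the default base estimator is an STree oblique decision tree
clf = Odte(n_estimators=10, random_state=0).fit(Xtrain, ytrain)
print(f"Accuracy: {clf.score(Xtest, ytest):.3f}")
```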

View File

@@ -1,388 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Compare Odte with different estimators"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Setup\n",
"Uncomment the next cell if Odte is not already installed"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#\n",
"# Google Colab setup\n",
"#\n",
"#!pip install git+https://github.com/doctorado-ml/odte\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import datetime, time\n",
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn import tree\n",
"from sklearn.metrics import classification_report, confusion_matrix, f1_score\n",
"from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier\n",
"from stree import Stree\n",
"from odte import Odte"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"if not os.path.isfile('data/creditcard.csv'):\n",
" !wget --no-check-certificate --content-disposition http://nube.jccm.es/index.php/s/Zs7SYtZQJ3RQ2H2/download\n",
" !tar xzf creditcard.tgz"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Tests"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"print(datetime.date.today(), time.strftime(\"%H:%M:%S\"))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load dataset and normalize values"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Load Dataset\n",
"df = pd.read_csv('data/creditcard.csv')\n",
"df.shape\n",
"random_state = 2020"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"print(\"Fraud: {0:.3f}% {1}\".format(df.Class[df.Class == 1].count()*100/df.shape[0], df.Class[df.Class == 1].count()))\n",
"print(\"Valid: {0:.3f}% {1:,}\".format(df.Class[df.Class == 0].count()*100/df.shape[0], df.Class[df.Class == 0].count()))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Normalize Amount\n",
"from sklearn.preprocessing import RobustScaler\n",
"values = RobustScaler().fit_transform(df.Amount.values.reshape(-1, 1))\n",
"df['Amount_Scaled'] = values"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Remove unneeded features\n",
"y = df.Class.values\n",
"X = df.drop(['Class', 'Time', 'Amount'], axis=1).values\n",
"print(f\"X shape: {X.shape}\\ny shape: {y.shape}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Build the models"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Divide dataset\n",
"train_size = .7\n",
"Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=train_size, shuffle=True, random_state=random_state, stratify=y)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Linear Tree\n",
"linear_tree = tree.DecisionTreeClassifier(random_state=random_state)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Random Forest\n",
"random_forest = RandomForestClassifier(random_state=random_state)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Stree\n",
"stree = Stree(random_state=random_state, C=.01)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# AdaBoost\n",
"adaboost = AdaBoostClassifier(random_state=random_state)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Gradient Boosting\n",
"gradient = GradientBoostingClassifier(random_state=random_state)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Oblique Decision Tree Ensemble\n",
"odte = Odte(random_state=random_state, max_features=\"auto\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Do the test"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def try_model(name, model):\n",
" print(f\"************************** {name} **********************\")\n",
" now = time.time()\n",
" model.fit(Xtrain, ytrain)\n",
" spent = time.time() - now\n",
" print(f\"Train Model {name} took: {spent:.4} seconds\")\n",
" predict = model.predict(Xtrain)\n",
" predictt = model.predict(Xtest)\n",
" print(f\"=========== {name} - Train {Xtrain.shape[0]:,} samples =============\",)\n",
" print(classification_report(ytrain, predict, digits=6))\n",
" print(f\"=========== {name} - Test {Xtest.shape[0]:,} samples =============\")\n",
" print(classification_report(ytest, predictt, digits=6))\n",
" print(\"Confusion Matrix in Train\")\n",
" print(confusion_matrix(ytrain, predict))\n",
" print(\"Confusion Matrix in Test\")\n",
" print(confusion_matrix(ytest, predictt))\n",
" return f1_score(ytest, predictt), spent"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Train & Test models\n",
"models = {\n",
" 'Linear Tree':linear_tree, 'Random Forest': random_forest, 'Stree (SVM Tree)': stree, \n",
" 'AdaBoost model': adaboost, 'Odte model': odte #'Gradient Boost.': gradient\n",
"}\n",
"\n",
"best_f1 = 0\n",
"outcomes = []\n",
"for name, model in models.items():\n",
" f1, time_spent = try_model(name, model)\n",
" outcomes.append((name, f1, time_spent))\n",
" if f1 > best_f1:\n",
" best_model = name\n",
" best_time = time_spent\n",
" best_f1 = f1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"print(\"*\"*110)\n",
"print(f\"*The best f1 model is {best_model}, with a f1 score: {best_f1:.4} in {best_time:.6} seconds with {train_size:,} samples in train dataset\")\n",
"print(\"*\"*110)\n",
"for name, f1, time_spent in outcomes:\n",
" print(f\"Model: {name}\\t Time: {time_spent:6.2f} seconds\\t f1: {f1:.4}\")"
]
},
{
"cell_type": "raw",
"metadata": {},
"source": [
"**************************************************************************************************************\n",
"*The best f1 model is Random Forest, with a f1 score: 0.8815 in 152.54 seconds with 0.7 samples in train dataset\n",
"**************************************************************************************************************\n",
"Model: Linear Tree\t Time: 13.52 seconds\t f1: 0.7645\n",
"Model: Random Forest\t Time: 152.54 seconds\t f1: 0.8815\n",
"Model: Stree (SVM Tree)\t Time: 32.55 seconds\t f1: 0.8603\n",
"Model: AdaBoost model\t Time: 47.34 seconds\t f1: 0.7509\n",
"Model: Gradient Boost.\t Time: 244.12 seconds\t f1: 0.5259"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"```\n",
"******************************************************************************************************************\n",
"*The best f1 model is Random Forest, with a f1 score: 0.8815 in 218.966 seconds with 0.7 samples in train dataset\n",
"******************************************************************************************************************\n",
"Model: Linear Tree Time: 23.05 seconds\t f1: 0.7645\n",
"Model: Random Forest\t Time: 218.97 seconds\t f1: 0.8815\n",
"Model: Stree (SVM Tree)\t Time: 49.45 seconds\t f1: 0.8467\n",
"Model: AdaBoost model\t Time: 73.83 seconds\t f1: 0.7509\n",
"Model: Gradient Boost.\t Time: 388.69 seconds\t f1: 0.5259\n",
"Model: Neural Network\t Time: 25.47 seconds\t f1: 0.8328\n",
"Model: Odte \t Time:2134.25 seconds\t f1: 0.8385\n",
"```"
]
}
],
"metadata": {
"hide_input": false,
"kernelspec": {
"display_name": "Python 3.7.6 64-bit ('general': venv)",
"language": "python",
"name": "python37664bitgeneralvenvfbd0a23e74cf4e778460f5ffc6761f39"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6-final"
},
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {},
"toc_section_display": true,
"toc_window_display": false
},
"varInspector": {
"cols": {
"lenName": 16,
"lenType": 16,
"lenVar": 40
},
"kernels_config": {
"python": {
"delete_cmd_postfix": "",
"delete_cmd_prefix": "del ",
"library": "var_list.py",
"varRefreshCmd": "print(var_dic_list())"
},
"r": {
"delete_cmd_postfix": ") ",
"delete_cmd_prefix": "rm(",
"library": "var_list.r",
"varRefreshCmd": "cat(var_dic_list()) "
}
},
"position": {
"height": "392px",
"left": "1518px",
"right": "20px",
"top": "40px",
"width": "392px"
},
"types_to_exclude": [
"module",
"function",
"builtin_function_or_method",
"instance",
"_Feature"
],
"window_display": true
}
},
"nbformat": 4,
"nbformat_minor": 4
}

View File

@@ -1,174 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import datetime, time\n",
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.model_selection import train_test_split, cross_validate\n",
"from sklearn import tree\n",
"from sklearn.metrics import classification_report, confusion_matrix, f1_score\n",
"from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier\n",
"from stree import Stree\n",
"from odte import Odte\n",
"\n",
"random_state = 1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.datasets import load_wine\n",
"X, y = load_wine(return_X_y=True)\n",
"Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=.2, random_state=random_state)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"n_estimators = 20\n",
"clf = {}\n",
"clf[\"stree\"] = Stree(random_state=random_state, max_depth=5)\n",
"clf[\"stree\"].set_params(**dict(splitter=\"best\", kernel=\"linear\", max_features=\"auto\"))\n",
"clf[\"odte\"] = Odte(n_jobs=-1, base_estimator=clf[\"stree\"], random_state=random_state, n_estimators=n_estimators, max_features=.8)\n",
"clf[\"adaboost\"] = AdaBoostClassifier(base_estimator=clf[\"stree\"], n_estimators=n_estimators, random_state=random_state, algorithm=\"SAMME\")\n",
"clf[\"bagging\"] = BaggingClassifier(base_estimator=clf[\"stree\"], n_estimators=n_estimators)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"print(\"*\"*30,\"Results for wine\", \"*\"*30)\n",
"for clf_type, item in clf.items():\n",
" print(f\"Training {clf_type}...\")\n",
" now = time.time()\n",
" item.fit(Xtrain, ytrain)\n",
" print(f\"Score: {item.score(Xtest, ytest) * 100:.3f} in {time.time()-now:.2f} seconds\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.datasets import load_iris\n",
"X, y = load_iris(return_X_y=True)\n",
"Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=.2, random_state=random_state)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"n_estimators = 10\n",
"clf = {}\n",
"clf[\"stree\"] = Stree(random_state=random_state, max_depth=3)\n",
"clf[\"odte\"] = Odte(n_jobs=-1, random_state=random_state, n_estimators=n_estimators, max_features=1.0)\n",
"clf[\"adaboost\"] = AdaBoostClassifier(base_estimator=clf[\"stree\"], n_estimators=n_estimators, random_state=random_state, algorithm=\"SAMME\")\n",
"clf[\"bagging\"] = BaggingClassifier(base_estimator=clf[\"stree\"], n_estimators=n_estimators)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"print(\"*\"*30,\"Results for iris\", \"*\"*30)\n",
"for clf_type, item in clf.items():\n",
" print(f\"Training {clf_type}...\")\n",
" now = time.time()\n",
" item.fit(Xtrain, ytrain)\n",
" print(f\"Score: {item.score(Xtest, ytest) * 100:.3f} in {time.time()-now:.2f} seconds\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"cross = cross_validate(estimator=clf[\"odte\"], X=X, y=y, n_jobs=-1, return_train_score=True)\n",
"print(cross)\n",
"print(f\"{np.mean(cross['test_score'])*100:.3f} +- {np.std(cross['test_score']):.3f}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"cross = cross_validate(estimator=clf[\"adaboost\"], X=X, y=y, n_jobs=-1, return_train_score=True)\n",
"print(cross)\n",
"print(f\"{np.mean(cross['test_score'])*100:.3f} +- {np.std(cross['test_score']):.3f}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from sklearn.utils.estimator_checks import check_estimator\n",
"# Make checks one by one\n",
"c = 0\n",
"checks = check_estimator(Odte(), generate_only=True)\n",
"for check in checks:\n",
" c += 1\n",
" print(c, check[1])\n",
" check[1](check[0])"
]
}
],
"metadata": {
"interpreter": {
"hash": "da86226729227d0e8962a5ec29ea906307507ca2c30ceaaf651c09a617630939"
},
"kernelspec": {
"display_name": "Python 3.9.2 64-bit ('general': venv)",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
},
"orig_nbformat": 2
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -2,11 +2,13 @@
 __author__ = "Ricardo Montañana Gómez"
 __copyright__ = "Copyright 2020, Ricardo Montañana Gómez"
 __license__ = "MIT"
-Build a forest of oblique trees based on STree
+Build a forest of oblique trees based on STree, admits any base classifier
+as well
 """
 from __future__ import annotations
 import random
-import sys
+import json
 from math import factorial
 from typing import Union, Optional, Tuple, List, Set
 import numpy as np
@@ -14,6 +16,7 @@ from sklearn.utils.multiclass import (  # type: ignore
     check_classification_targets,
 )
 from sklearn.base import clone, BaseEstimator, ClassifierMixin  # type: ignore
+from sklearn.utils import check_random_state  # type: ignore
 from sklearn.ensemble import BaseEnsemble  # type: ignore
 from sklearn.utils.validation import (  # type: ignore
     check_is_fitted,
@@ -21,44 +24,48 @@ from sklearn.utils.validation import (  # type: ignore
 )
 from joblib import Parallel, delayed  # type: ignore
 from stree import Stree  # type: ignore
+from ._version import __version__


 class Odte(BaseEnsemble, ClassifierMixin):
     def __init__(
         self,
         # n_jobs = -1 to use all available cores
-        n_jobs: int = 1,
-        base_estimator: BaseEstimator = None,
-        random_state: int = 0,
+        n_jobs: int = -1,
+        estimator: BaseEstimator = Stree(),
+        random_state: Optional[int] = None,
         max_features: Optional[Union[str, int, float]] = None,
         max_samples: Optional[Union[int, float]] = None,
         n_estimators: int = 100,
+        be_hyperparams: str = "{}",
     ):
         super().__init__(
-            base_estimator=base_estimator,
+            estimator=estimator,
             n_estimators=n_estimators,
         )
-        self.base_estimator = base_estimator
+        self.estimator = estimator
         self.n_jobs = n_jobs
         self.n_estimators = n_estimators
         self.random_state = random_state
         self.max_features = max_features
         self.max_samples = max_samples  # size of bootstrap
+        self.be_hyperparams = be_hyperparams

-    def _initialize_random(self) -> np.random.mtrand.RandomState:
-        if self.random_state is None:
-            self.random_state = random.randint(0, sys.maxsize)
-            return np.random.mtrand._rand
-        return np.random.RandomState(self.random_state)
+    @staticmethod
+    def version() -> str:
+        return __version__

     def _validate_estimator(self) -> None:
-        """Check the estimator and set the base_estimator_ attribute."""
+        """Check the estimator and set the estimator_ attribute."""
         super()._validate_estimator(
             default=Stree(random_state=self.random_state)
         )

     def fit(
-        self, X: np.ndarray, y: np.ndarray, sample_weight: np.ndarray = None
+        self,
+        X: np.ndarray,
+        y: np.ndarray,
+        sample_weight: Optional[np.ndarray] = None,
     ) -> Odte:
         # Check parameters are Ok.
         if self.n_estimators < 3:
@@ -76,7 +83,7 @@ class Odte(BaseEnsemble, ClassifierMixin):
         # Initialize computed parameters
         # Build the estimator
         self.max_features_ = self._initialize_max_features()
-        # build base_estimator_
+        # build estimator_
         self._validate_estimator()
         self.classes_, y = np.unique(y, return_inverse=True)
         self.n_classes_: int = self.classes_.shape[0]
@@ -90,29 +97,62 @@ class Odte(BaseEnsemble, ClassifierMixin):
     def _compute_metrics(self) -> None:
         tdepth = tnodes = tleaves = 0
         for estimator in self.estimators_:
-            nodes, leaves = estimator.nodes_leaves()
-            depth = estimator.depth_
-            tdepth += depth
-            tnodes += nodes
-            tleaves += leaves
-        self.depth_ = tdepth / self.n_estimators
-        self.leaves_ = tleaves / self.n_estimators
-        self.nodes_ = tnodes / self.n_estimators
+            if hasattr(estimator, "nodes_leaves"):
+                nodes, leaves = estimator.nodes_leaves()
+                depth = estimator.depth_
+                tdepth += depth
+                tnodes += nodes
+                tleaves += leaves
+        self.depth_ = tdepth
+        self.leaves_ = tleaves
+        self.nodes_ = tnodes
+
+    def _train(
+        self, X: np.ndarray, y: np.ndarray, weights: np.ndarray
+    ) -> Tuple[List[BaseEstimator], List[Tuple[int, ...]]]:
+        n_samples = X.shape[0]
+        boot_samples = self._get_bootstrap_n_samples(n_samples)
+        estimator = clone(self.estimator_)
+        defined_state = (
+            random.randint(0, 2**31)
+            if self.random_state is None
+            else self.random_state
+        )
+        return Parallel(n_jobs=self.n_jobs, prefer="threads")(  # type: ignore
+            delayed(Odte._parallel_build_tree)(
+                estimator,
+                X,
+                y,
+                weights,
+                random_seed,
+                boot_samples,
+                self.max_features_,
+                self.be_hyperparams,
+            )
+            for random_seed in range(
+                defined_state, defined_state + self.n_estimators
+            )
+        )

     @staticmethod
     def _parallel_build_tree(
-        base_estimator_: Stree,
+        estimator_: BaseEstimator,
         X: np.ndarray,
         y: np.ndarray,
         weights: np.ndarray,
-        random_box: np.random.mtrand.RandomState,
         random_seed: int,
         boot_samples: int,
         max_features: int,
+        hyperparams: str,
     ) -> Tuple[BaseEstimator, Tuple[int, ...]]:
-        clf = clone(base_estimator_)
-        clf.set_params(random_state=random_seed)
+        clf = clone(estimator_)
+        hyperparams_ = json.loads(hyperparams)
+        hyperparams_.update(dict(random_state=random_seed))
+        clf.set_params(**hyperparams_)
         n_samples = X.shape[0]
+        # initialize random boxes
+        random.seed(random_seed)
+        random_box = check_random_state(random_seed)
         # bootstrap
         indices = random_box.randint(0, n_samples, boot_samples)
         # update weights with the chosen samples
@@ -125,29 +165,6 @@ class Odte(BaseEnsemble, ClassifierMixin):
         clf.fit(bootstrap[:, features], y[indices], current_weights[indices])
         return (clf, features)

-    def _train(
-        self, X: np.ndarray, y: np.ndarray, weights: np.ndarray
-    ) -> Tuple[List[BaseEstimator], List[Tuple[int, ...]]]:
-        random_box = self._initialize_random()
-        n_samples = X.shape[0]
-        boot_samples = self._get_bootstrap_n_samples(n_samples)
-        clf = clone(self.base_estimator_)
-        return Parallel(n_jobs=self.n_jobs, prefer="threads")(  # type: ignore
-            delayed(Odte._parallel_build_tree)(
-                clf,
-                X,
-                y,
-                weights,
-                random_box,
-                random_seed,
-                boot_samples,
-                self.max_features_,
-            )
-            for random_seed in range(
-                self.random_state, self.random_state + self.n_estimators
-            )
-        )
-
     def _get_bootstrap_n_samples(self, n_samples: int) -> int:
         if self.max_samples is None:
             return n_samples
@@ -243,6 +260,18 @@ class Odte(BaseEnsemble, ClassifierMixin):
             result[i, predictions[i]] += 1
         return result / self.n_estimators

-    def nodes_leaves(self) -> Tuple[float, float]:
+    def get_nodes(self) -> int:
         check_is_fitted(self, "estimators_")
-        return self.nodes_, self.leaves_
+        return self.nodes_
+
+    def get_leaves(self) -> int:
+        check_is_fitted(self, "estimators_")
+        return self.leaves_
+
+    def get_depth(self) -> int:
+        check_is_fitted(self, "estimators_")
+        return self.depth_
+
+    def nodes_leaves(self) -> Tuple[int, int]:
+        check_is_fitted(self, "estimators_")
+        return (self.get_nodes(), self.get_leaves())
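
To make the refactor above concrete, a hedged sketch of the public surface it introduces — the estimator and be_hyperparams keywords, the version() helper, and node/leaf/depth metrics that are now totals rather than per-tree averages (dataset and values are illustrative):

```python
import json

from sklearn.datasets import load_iris

from odte import Odte
from stree import Stree

print(Odte.version())  # reads odte/_version.py through the new static method

X, y = load_iris(return_X_y=True)
clf = Odte(
    estimator=Stree(),  # replaces the old base_estimator keyword
    be_hyperparams=json.dumps({"kernel": "linear"}),  # forwarded to each tree
    n_estimators=10,
    random_state=0,
).fit(X, y)
# totals accumulated over all trees, no longer averages
nodes, leaves = clf.nodes_leaves()
print(nodes, leaves, clf.get_depth())
```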

View File

@@ -1,10 +1,9 @@
+from ._version import __version__
 from .Odte import Odte

-__version__ = "0.3.1"
-__author__ = "Rica.rdo Montañana Gómez"
+__author__ = "Ricardo Montañana Gómez"
 __copyright__ = "Copyright 2020-2021, Ricardo Montañana Gómez"
 __license__ = "MIT License"
 __author_email__ = "ricardo.montanana@alu.uclm.es"
-__all__ = ["Odte"]
+__all__ = ["__version__", "Odte"]

1
odte/_version.py Normal file
View File

@@ -0,0 +1 @@
__version__ = "1.0.0-1"

View File

@@ -1,13 +1,14 @@
 # type: ignore
 import unittest
 import os
+import random
 import warnings
+import json
 from sklearn.exceptions import ConvergenceWarning, NotFittedError
+from sklearn.svm import SVC
 from odte import Odte
 from stree import Stree
 from .utils import load_dataset
+from .._version import __version__


 class Odte_test(unittest.TestCase):
@@ -44,7 +45,10 @@
         )
         for max_features in [4, 0.4, 1.0, None, "auto", "sqrt", "log2"]:
             tclf = Odte(
-                random_state=self._random_state, max_features=max_features
+                random_state=self._random_state,
+                max_features=max_features,
+                n_jobs=1,
+                n_estimators=100,
             )
             tclf.fit(X, y)
             computed = tclf._get_random_subspace(X, y, tclf.max_features_)
@@ -52,20 +56,6 @@
             self.assertListEqual(expected, list(computed))
             # print(f"{list(computed)},")

-    def test_initialize_random(self):
-        expected = [37, 235, 908]
-        tclf = Odte(random_state=self._random_state)
-        box = tclf._initialize_random()
-        computed = box.randint(0, 1000, 3)
-        self.assertListEqual(expected, computed.tolist())
-        # test None
-        tclf = Odte(random_state=None)
-        box = tclf._initialize_random()
-        computed = box.randint(101, 1000, 3)
-        for value in computed.tolist():
-            self.assertGreaterEqual(value, 101)
-            self.assertLessEqual(value, 1000)
-
     def test_bogus_max_features(self):
         values = ["duck", -0.1, 0.0]
         for max_features in values:
@@ -87,15 +77,15 @@
         X, y = [[1, 2], [5, 6], [9, 10], [16, 17]], [0, 1, 1, 2]
         expected = [0, 1, 1, 2]
         tclf = Odte(
-            base_estimator=Stree(),
+            estimator=Stree(),
             random_state=self._random_state,
             n_estimators=10,
             n_jobs=-1,
         )
         tclf.set_params(
             **dict(
-                base_estimator__kernel="rbf",
-                base_estimator__random_state=self._random_state,
+                estimator__kernel="rbf",
+                estimator__random_state=self._random_state,
             )
         )
         computed = tclf.fit(X, y).predict(X)
@@ -107,14 +97,15 @@
         X, y = load_dataset(self._random_state)
         expected = y
         tclf = Odte(
-            base_estimator=Stree(),
+            estimator=Stree(),
             random_state=self._random_state,
             max_features=1.0,
             max_samples=0.1,
+            n_estimators=100,
         )
         tclf.set_params(
             **dict(
-                base_estimator__kernel="linear",
+                estimator__kernel="linear",
             )
         )
         computed = tclf.fit(X, y).predict(X)
@@ -122,7 +113,7 @@
     def test_score(self):
         X, y = load_dataset(self._random_state)
-        expected = 0.9513333333333334
+        expected = 0.9533333333333334
         tclf = Odte(
             random_state=self._random_state,
             max_features=None,
@@ -134,21 +125,19 @@
     def test_score_splitter_max_features(self):
         X, y = load_dataset(self._random_state, n_features=16, n_samples=500)
         results = [
-            0.948,
-            0.924,
-            0.926,
-            0.94,
-            0.932,
-            0.936,
-            0.962,
-            0.962,
-            0.962,
-            0.962,
-            0.962,
-            0.962,
-            0.962,
+            0.958,  # best auto
+            0.942,  # random auto
+            0.932,  # trandom auto
+            0.95,  # mutual auto
+            0.944,  # iwss auto
+            0.946,  # cfs auto
+            0.97,  # best None
+            0.97,  # random None
+            0.97,  # trandom None
+            0.97,  # mutual None
+            0.97,  # iwss None
+            0.97,  # cfs None
         ]
+        random.seed(self._random_state)
         for max_features in ["auto", None]:
             for splitter in [
                 "best",
@@ -159,21 +148,22 @@
                 "cfs",
             ]:
                 tclf = Odte(
-                    base_estimator=Stree(),
+                    estimator=Stree(),
                     random_state=self._random_state,
                     n_estimators=3,
+                    n_jobs=1,
                 )
                 tclf.set_params(
                     **dict(
-                        base_estimator__max_features=max_features,
-                        base_estimator__splitter=splitter,
-                        base_estimator__random_state=self._random_state,
+                        estimator__max_features=max_features,
+                        estimator__splitter=splitter,
+                        estimator__random_state=self._random_state,
                     )
                 )
                 expected = results.pop(0)
                 computed = tclf.fit(X, y).score(X, y)
                 # print(computed, splitter, max_features)
-                self.assertAlmostEqual(expected, computed)
+                self.assertAlmostEqual(expected, computed, msg=splitter)

     def test_generate_subspaces(self):
         features = 250
@@ -190,28 +180,104 @@
             warnings.filterwarnings("ignore", category=RuntimeWarning)
             from sklearn.utils.estimator_checks import check_estimator

-            check_estimator(Odte())
+            check_estimator(Odte(n_estimators=10))

     def test_nodes_leaves_not_fitted(self):
         tclf = Odte(
-            base_estimator=Stree(),
+            estimator=Stree(),
             random_state=self._random_state,
             n_estimators=3,
         )
         with self.assertRaises(NotFittedError):
             tclf.nodes_leaves()
+        with self.assertRaises(NotFittedError):
+            tclf.get_nodes()
+        with self.assertRaises(NotFittedError):
+            tclf.get_leaves()
+        with self.assertRaises(NotFittedError):
+            tclf.get_depth()

     def test_nodes_leaves_depth(self):
         tclf = Odte(
-            base_estimator=Stree(),
+            estimator=Stree(),
+            random_state=self._random_state,
+            n_estimators=5,
+            n_jobs=1,
+        )
+        tclf_p = Odte(
+            estimator=Stree(),
+            random_state=self._random_state,
+            n_estimators=5,
+            n_jobs=-1,
+        )
+        X, y = load_dataset(self._random_state, n_features=16, n_samples=500)
+        tclf.fit(X, y)
+        tclf_p.fit(X, y)
+        for clf in [tclf, tclf_p]:
+            self.assertEqual(29, clf.depth_)
+            self.assertEqual(29, clf.get_depth())
+            self.assertEqual(47, clf.leaves_)
+            self.assertEqual(47, clf.get_leaves())
+            self.assertEqual(89, clf.nodes_)
+            self.assertEqual(89, clf.get_nodes())
+            nodes, leaves = clf.nodes_leaves()
+            self.assertEqual(47, leaves)
+            self.assertEqual(47, clf.get_leaves())
+            self.assertEqual(89, nodes)
+            self.assertEqual(89, clf.get_nodes())
+
+    def test_nodes_leaves_SVC(self):
+        tclf = Odte(
+            estimator=SVC(),
             random_state=self._random_state,
             n_estimators=3,
         )
         X, y = load_dataset(self._random_state, n_features=16, n_samples=500)
         tclf.fit(X, y)
-        self.assertAlmostEqual(6.0, tclf.depth_)
-        self.assertAlmostEqual(9.333333333333334, tclf.leaves_)
-        self.assertAlmostEqual(17.666666666666668, tclf.nodes_)
+        self.assertAlmostEqual(0.0, tclf.leaves_)
+        self.assertAlmostEqual(0.0, tclf.get_leaves())
+        self.assertAlmostEqual(0.0, tclf.nodes_)
+        self.assertAlmostEqual(0.0, tclf.get_nodes())
         nodes, leaves = tclf.nodes_leaves()
-        self.assertAlmostEqual(9.333333333333334, leaves)
-        self.assertAlmostEqual(17.666666666666668, nodes)
+        self.assertAlmostEqual(0.0, leaves)
+        self.assertAlmostEqual(0.0, tclf.get_leaves())
+        self.assertAlmostEqual(0.0, nodes)
+        self.assertAlmostEqual(0.0, tclf.get_nodes())
+
+    def test_estimator_hyperparams(self):
+        data = [
+            (Stree(), {"max_features": 7, "max_depth": 2}),
+            (SVC(), {"kernel": "linear", "cache_size": 100}),
+        ]
+        for clf, hyperparams in data:
+            hyperparams_ = json.dumps(hyperparams)
+            tclf = Odte(
+                estimator=clf,
+                random_state=self._random_state,
+                n_estimators=3,
+                be_hyperparams=hyperparams_,
+            )
+            self.assertEqual(hyperparams_, tclf.be_hyperparams)
+            X, y = load_dataset(
+                self._random_state, n_features=16, n_samples=500
+            )
+            tclf.fit(X, y)
+            for estimator in tclf.estimators_:
+                for key, value in hyperparams.items():
+                    self.assertEqual(value, estimator.get_params()[key])
+
+    def test_version(self):
+        tclf = Odte()
+        self.assertEqual(__version__, tclf.version())
+
+    def test_parallel_score(self):
+        tclf_p = Odte(
+            n_jobs=-1, random_state=self._random_state, n_estimators=30
+        )
+        tclf_s = Odte(
+            n_jobs=1, random_state=self._random_state, n_estimators=30
+        )
+        X, y = load_dataset(self._random_state, n_features=56, n_samples=1500)
+        tclf_p.fit(X, y)
+        tclf_s.fit(X, y)
+        self.assertAlmostEqual(tclf_p.score(X, y), tclf_s.score(X, y))
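
A standalone sketch of the property test_parallel_score checks: with the per-tree seeding introduced in _train, parallel and sequential fits should produce identical scores (the synthetic dataset here is illustrative):

```python
from sklearn.datasets import make_classification

from odte import Odte

X, y = make_classification(random_state=1)
seq = Odte(n_jobs=1, random_state=1, n_estimators=10).fit(X, y).score(X, y)
par = Odte(n_jobs=-1, random_state=1, n_estimators=10).fit(X, y).score(X, y)
# every tree gets the same seed in both runs, so the ensembles match
assert abs(seq - par) < 1e-9
```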

View File

@@ -1,4 +1,3 @@
-# type: ignore
 from .Odte_tests import Odte_test

 __all__ = ["Odte_test"]

View File

@@ -1,5 +1,65 @@
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[project]
+name = "Odte"
+description = "Oblique decision tree Ensemble."
+readme = "README.md"
+license = { file = "LICENSE" }
+authors = [
+    { name = "Ricardo Montañana", email = "ricardo.montanana@alu.uclm.es" },
+]
+dynamic = ['version']
+dependencies = ["stree>=1.4"]
+requires-python = ">=3.11"
+keywords = [
+    "scikit-learn",
+    "oblique-classifier",
+    "oblique-decision-tree",
+    "decision-tree",
+    "ensemble",
+    "svm",
+]
+classifiers = [
+    "Development Status :: 5 - Production/Stable",
+    "Intended Audience :: Science/Research",
+    "Intended Audience :: Developers",
+    "Topic :: Software Development",
+    "Topic :: Scientific/Engineering",
+    "License :: OSI Approved :: MIT License",
+    "Natural Language :: English",
+    "Operating System :: OS Independent",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+]
+
+[project.optional-dependencies]
+dev = ["black", "flake8", "coverage", "mypy", "pandas", "hatch", "pip-audit"]
+doc = ["sphinx", "myst-parser", "sphinx_rtd_theme", "sphinx-autodoc-typehints"]
+
+[project.urls]
+Home = "https://github.com/doctorado-ml/odte"
+Docs = "https://odte.readthedocs.io"
+
+[tool.hatch.version]
+path = "odte/_version.py"
+
+[tool.mypy]
+exclude = ['tests']
+
+[tool.coverage.run]
+branch = true
+source = ["odte"]
+command_line = "-m unittest discover -s odte.tests"
+
+[tool.coverage.report]
+show_missing = true
+fail_under = 100
+
 [tool.black]
 line-length = 79
+target_version = ['py311']
 include = '\.pyi?$'
 exclude = '''
 /(
@@ -13,4 +73,4 @@ exclude = '''
 | build
 | dist
 )/
 '''

View File

@@ -1 +1,2 @@
-stree>1.2.2
+scikit-learn==1.5.2
+stree>=1.4

View File

@@ -1,46 +0,0 @@
import setuptools


def readme():
    with open("README.md") as f:
        return f.read()


def get_data(field):
    item = ""
    with open("odte/__init__.py") as f:
        for line in f.readlines():
            if line.startswith(f"__{field}__"):
                delim = '"' if '"' in line else "'"
                item = line.split(delim)[1]
                break
        else:
            raise RuntimeError(f"Unable to find {field} string.")
    return item


setuptools.setup(
    name="Odte",
    version=get_data("version"),
    license=get_data("license"),
    description="Oblique decision tree Ensemble",
    long_description=readme(),
    long_description_content_type="text/markdown",
    packages=setuptools.find_packages(),
    url="https://github.com/doctorado-ml/odte",
    author=get_data("author"),
    author_email=get_data("author_email"),
    keywords="scikit-learn oblique-classifier oblique-decision-tree decision-\
tree ensemble svm svc",
    classifiers=[
        "Development Status :: 4 - Beta",
        "License :: OSI Approved :: MIT License",
        "Programming Language :: Python :: 3.8",
        "Natural Language :: English",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
        "Intended Audience :: Science/Research",
    ],
    install_requires=["stree"],
    test_suite="odte.tests",
    zip_safe=False,
)