Compare commits

..

3 Commits

Author SHA1 Message Date
ddc0fe15b8 Fix mistake in computing multiclass node belief
Set default criterion for split to entropy instead of gini
Set default max_iter to 1e5 instead of 1e3
change up-down criterion to match SVC multiclass
Fix impurity method of splitting nodes
Update jupyter Notebooks
2020-11-01 17:37:17 +01:00
c593b55bec Complete implementation of splitter_type = impurity with tests
Remove max_distance & min_distance splitter types
2020-10-17 16:56:15 +02:00
044918f834 #15 First approach
Create impurity function in Stree (consistent name, same criteria as other splitter parameter)
Create test for the new function
Update init test
Update test splitter parameters
Rename old impurity function to partition_impurity
2020-10-15 17:51:20 +02:00
7 changed files with 32 additions and 99 deletions

View File

@@ -1,47 +0,0 @@
name: CI
on:
push:
branches: [master]
pull_request:
branches: [master]
workflow_dispatch:
jobs:
build:
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [macos-latest, ubuntu-latest]
python: [3.8]
steps:
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python }}
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python }}
- name: Install dependencies
run: |
pip install -q --upgrade pip
pip install -q -r requirements.txt
pip install -q --upgrade codecov coverage black flake8 codacy-coverage
- name: Lint
run: |
black --check --diff stree
flake8 --count stree
- name: Tests
run: |
coverage run -m unittest -v stree.tests
coverage xml
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v1
with:
token: ${{ secrets.CODECOV_TOKEN }}
files: ./coverage.xml
- name: Run codacy-coverage-reporter
if: runner.os == 'Linux'
uses: codacy/codacy-coverage-reporter-action@master
with:
project-token: ${{ secrets.CODACY_PROJECT_TOKEN }}
coverage-reports: coverage.xml

View File

@@ -1,6 +1,6 @@
![CI](https://github.com/Doctorado-ML/STree/workflows/CI/badge.svg)
[![Codeship Status for Doctorado-ML/STree](https://app.codeship.com/projects/8b2bd350-8a1b-0138-5f2c-3ad36f3eb318/status?branch=master)](https://app.codeship.com/projects/399170)
[![codecov](https://codecov.io/gh/doctorado-ml/stree/branch/master/graph/badge.svg)](https://codecov.io/gh/doctorado-ml/stree)
[![Codacy Badge](https://app.codacy.com/project/badge/Grade/35fa3dfd53a24a339344b33d9f9f2f3d)](https://www.codacy.com/gh/Doctorado-ML/STree?utm_source=github.com&utm_medium=referral&utm_content=Doctorado-ML/STree&utm_campaign=Badge_Grade)
[![Codacy Badge](https://app.codacy.com/project/badge/Grade/35fa3dfd53a24a339344b33d9f9f2f3d)](https://www.codacy.com/gh/Doctorado-ML/STree?utm_source=github.com&utm_medium=referral&utm_content=Doctorado-ML/STree&utm_campaign=Badge_Grade)
# Stree
@@ -18,17 +18,17 @@ pip install git+https://github.com/doctorado-ml/stree
### Jupyter notebooks
- [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/Doctorado-ML/STree/master?urlpath=lab/tree/notebooks/benchmark.ipynb) Benchmark
* [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/Doctorado-ML/STree/master?urlpath=lab/tree/notebooks/benchmark.ipynb) Benchmark
- [![Test](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/benchmark.ipynb) Benchmark
* [![Test](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/benchmark.ipynb) Benchmark
- [![Test2](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/features.ipynb) Test features
* [![Test2](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/features.ipynb) Test features
- [![Adaboost](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/adaboost.ipynb) Adaboost
* [![Adaboost](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/adaboost.ipynb) Adaboost
- [![Gridsearch](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/gridsearch.ipynb) Gridsearch
* [![Gridsearch](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/gridsearch.ipynb) Gridsearch
- [![Test Graphics](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/test_graphs.ipynb) Test Graphics
* [![Test Graphics](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/test_graphs.ipynb) Test Graphics
### Command line

View File

@@ -1,4 +1,4 @@
numpy
scikit-learn==0.23.2
scikit-learn
pandas
ipympl

View File

@@ -30,7 +30,7 @@ setuptools.setup(
"Topic :: Scientific/Engineering :: Artificial Intelligence",
"Intended Audience :: Science/Research",
],
install_requires=["scikit-learn==0.23.2", "numpy", "ipympl"],
install_requires=["scikit-learn>=0.23.0", "numpy", "ipympl"],
test_suite="stree.tests",
zip_safe=False,
)

View File

@@ -10,8 +10,8 @@ import os
import numbers
import random
import warnings
from math import log, factorial
from typing import Optional
from math import log
from itertools import combinations
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.svm import SVC, LinearSVC
@@ -253,32 +253,19 @@ class Splitter:
selected = feature_set
return selected if selected is not None else feature_set
@staticmethod
def _generate_spaces(features: int, max_features: int) -> list:
comb = set()
# Generate at most 5 combinations
if max_features == features:
set_length = 1
else:
number = factorial(features) / (
factorial(max_features) * factorial(features - max_features)
)
set_length = min(5, number)
while len(comb) < set_length:
comb.add(
tuple(sorted(random.sample(range(features), max_features)))
)
return list(comb)
def _get_subspaces_set(
self, dataset: np.array, labels: np.array, max_features: int
) -> np.array:
features_sets = self._generate_spaces(dataset.shape[1], max_features)
features = range(dataset.shape[1])
features_sets = list(combinations(features, max_features))
if len(features_sets) > 1:
if self._splitter_type == "random":
index = random.randint(0, len(features_sets) - 1)
return features_sets[index]
else:
# get only 3 sets at most
if len(features_sets) > 3:
features_sets = random.sample(features_sets, 3)
return self._select_best_set(dataset, labels, features_sets)
else:
return features_sets[0]
@@ -297,8 +284,9 @@ class Splitter:
:type data: np.array (m, n_classes)
:param y: vector of labels (classes)
:type y: np.array (m,)
:return: column of dataset to be taken into account to split dataset
:rtype: int
:return: vector with the class assigned to each sample values
(can be 0, 1, ...) -1 if none produces information gain
:rtype: np.array shape (m,)
"""
max_gain = 0
selected = -1
@@ -319,8 +307,8 @@ class Splitter:
:type data: np.array (m, n_classes)
:param y: vector of labels (classes)
:type y: np.array (m,)
:return: column of dataset to be taken into account to split dataset
:rtype: int
:return: vector with distances to hyperplane (can be positive or neg.)
:rtype: np.array shape (m,)
"""
# select the class with max number of samples
_, samples = np.unique(y, return_counts=True)
@@ -501,7 +489,7 @@ class Stree(BaseEstimator, ClassifierMixin):
sample_weight: np.ndarray,
depth: int,
title: str,
) -> Optional[Snode]:
) -> Snode:
"""Recursive function to split the original dataset into predictor
nodes (leaves)

View File

@@ -166,14 +166,6 @@ class Splitter_test(unittest.TestCase):
self.assertEqual((6,), computed_data.shape)
self.assertListEqual(expected.tolist(), computed_data.tolist())
def test_generate_subspaces(self):
features = 250
for max_features in range(2, features):
num = len(Splitter._generate_spaces(features, max_features))
self.assertEqual(5, num)
self.assertEqual(3, len(Splitter._generate_spaces(3, 2)))
self.assertEqual(4, len(Splitter._generate_spaces(4, 3)))
def test_best_splitter_few_sets(self):
X, y = load_iris(return_X_y=True)
X = np.delete(X, 3, 1)
@@ -184,14 +176,14 @@ class Splitter_test(unittest.TestCase):
def test_splitter_parameter(self):
expected_values = [
[1, 4, 9, 12], # best entropy max_samples
[1, 3, 6, 10], # best entropy impurity
[6, 8, 10, 12], # best gini max_samples
[7, 8, 10, 11], # best gini impurity
[0, 3, 8, 12], # random entropy max_samples
[0, 3, 9, 11], # random entropy impurity
[0, 4, 7, 12], # random gini max_samples
[0, 2, 5, 6], # random gini impurity
[0, 1, 7, 9], # best entropy max_samples
[3, 8, 10, 11], # best entropy impurity
[0, 2, 8, 12], # best gini max_samples
[1, 2, 5, 12], # best gini impurity
[1, 2, 5, 10], # random entropy max_samples
[4, 8, 9, 12], # random entropy impurity
[3, 9, 11, 12], # random gini max_samples
[1, 5, 6, 9], # random gini impurity
]
X, y = load_wine(return_X_y=True)
rn = 0

View File

@@ -313,7 +313,7 @@ class Stree_test(unittest.TestCase):
X, y = load_dataset(self._random_state)
clf = Stree(random_state=self._random_state, max_features=2)
clf.fit(X, y)
self.assertAlmostEqual(0.9246666666666666, clf.score(X, y))
self.assertAlmostEqual(0.944, clf.score(X, y))
def test_bogus_splitter_parameter(self):
clf = Stree(splitter="duck")