Compare commits

...

7 Commits

Author SHA1 Message Date
98881cbd45 exchange codeship badge with githubs 2021-01-11 13:02:59 +01:00
cdb9fd6faa Codacy only in Linux 2021-01-11 12:24:50 +01:00
82f7352f9a Fix python version & os 2021-01-11 12:00:32 +01:00
8359e442e5 lock scikit-learn version to 0.23.2
fix github actions workflow
2021-01-10 20:05:36 +01:00
Ricardo Montañana Gómez
673081cdc5 Add main workflow action 2021-01-10 14:24:16 +01:00
Ricardo Montañana Gómez
36816074ff Combinatorial explosion (#19)
* Remove itertools combinations from subspaces

* Generates 5 random subspaces at most
2021-01-10 13:32:22 +01:00
475ad7e752 Fix mistakes in function comments 2020-11-11 19:14:36 +01:00
7 changed files with 99 additions and 32 deletions

47
.github/workflows/main.yml vendored Normal file
View File

@@ -0,0 +1,47 @@
# Continuous-integration workflow: lints the stree package, runs its unit
# tests under coverage and uploads the coverage report to Codecov and Codacy.
name: CI
# Runs on pushes and pull requests targeting master, and can also be
# started manually (workflow_dispatch).
on:
push:
branches: [master]
pull_request:
branches: [master]
workflow_dispatch:
# A single job, executed once per operating system in the matrix below.
jobs:
build:
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [macos-latest, ubuntu-latest]
python: [3.8]
steps:
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python }}
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python }}
# Project requirements first, then the lint and coverage tooling.
- name: Install dependencies
run: |
pip install -q --upgrade pip
pip install -q -r requirements.txt
pip install -q --upgrade codecov coverage black flake8 codacy-coverage
- name: Lint
run: |
black --check --diff stree
flake8 --count stree
- name: Tests
run: |
coverage run -m unittest -v stree.tests
coverage xml
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v1
with:
token: ${{ secrets.CODECOV_TOKEN }}
files: ./coverage.xml
# The Codacy report is only sent from the Linux runner (see the `if` below).
- name: Run codacy-coverage-reporter
if: runner.os == 'Linux'
uses: codacy/codacy-coverage-reporter-action@master
with:
project-token: ${{ secrets.CODACY_PROJECT_TOKEN }}
coverage-reports: coverage.xml

View File

@@ -1,6 +1,6 @@
[![Codeship Status for Doctorado-ML/STree](https://app.codeship.com/projects/8b2bd350-8a1b-0138-5f2c-3ad36f3eb318/status?branch=master)](https://app.codeship.com/projects/399170) ![CI](https://github.com/Doctorado-ML/STree/workflows/CI/badge.svg)
[![codecov](https://codecov.io/gh/doctorado-ml/stree/branch/master/graph/badge.svg)](https://codecov.io/gh/doctorado-ml/stree) [![codecov](https://codecov.io/gh/doctorado-ml/stree/branch/master/graph/badge.svg)](https://codecov.io/gh/doctorado-ml/stree)
[![Codacy Badge](https://app.codacy.com/project/badge/Grade/35fa3dfd53a24a339344b33d9f9f2f3d)](https://www.codacy.com/gh/Doctorado-ML/STree?utm_source=github.com&utm_medium=referral&utm_content=Doctorado-ML/STree&utm_campaign=Badge_Grade) [![Codacy Badge](https://app.codacy.com/project/badge/Grade/35fa3dfd53a24a339344b33d9f9f2f3d)](https://www.codacy.com/gh/Doctorado-ML/STree?utm_source=github.com&utm_medium=referral&utm_content=Doctorado-ML/STree&utm_campaign=Badge_Grade)
# Stree # Stree
@@ -18,17 +18,17 @@ pip install git+https://github.com/doctorado-ml/stree
### Jupyter notebooks ### Jupyter notebooks
* [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/Doctorado-ML/STree/master?urlpath=lab/tree/notebooks/benchmark.ipynb) Benchmark - [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/Doctorado-ML/STree/master?urlpath=lab/tree/notebooks/benchmark.ipynb) Benchmark
* [![Test](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/benchmark.ipynb) Benchmark - [![Test](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/benchmark.ipynb) Benchmark
* [![Test2](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/features.ipynb) Test features - [![Test2](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/features.ipynb) Test features
* [![Adaboost](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/adaboost.ipynb) Adaboost - [![Adaboost](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/adaboost.ipynb) Adaboost
* [![Gridsearch](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/gridsearch.ipynb) Gridsearch - [![Gridsearch](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/gridsearch.ipynb) Gridsearch
* [![Test Graphics](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/test_graphs.ipynb) Test Graphics - [![Test Graphics](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/test_graphs.ipynb) Test Graphics
### Command line ### Command line

View File

@@ -1,4 +1,4 @@
numpy numpy
scikit-learn scikit-learn==0.23.2
pandas pandas
ipympl ipympl

View File

@@ -30,7 +30,7 @@ setuptools.setup(
"Topic :: Scientific/Engineering :: Artificial Intelligence", "Topic :: Scientific/Engineering :: Artificial Intelligence",
"Intended Audience :: Science/Research", "Intended Audience :: Science/Research",
], ],
install_requires=["scikit-learn>=0.23.0", "numpy", "ipympl"], install_requires=["scikit-learn==0.23.2", "numpy", "ipympl"],
test_suite="stree.tests", test_suite="stree.tests",
zip_safe=False, zip_safe=False,
) )

View File

@@ -10,8 +10,8 @@ import os
import numbers import numbers
import random import random
import warnings import warnings
from math import log from math import log, factorial
from itertools import combinations from typing import Optional
import numpy as np import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.svm import SVC, LinearSVC from sklearn.svm import SVC, LinearSVC
@@ -253,19 +253,32 @@ class Splitter:
selected = feature_set selected = feature_set
return selected if selected is not None else feature_set return selected if selected is not None else feature_set
@staticmethod
def _generate_spaces(features: int, max_features: int) -> list:
    """Generate at most 5 distinct random subspaces of features

    Each subspace is a sorted tuple with max_features column indices
    sampled without replacement from range(features)

    :param features: number of features (columns) available
    :type features: int
    :param max_features: number of features included in each subspace
    :type max_features: int
    :raises ValueError: if max_features is negative or greater than
    features
    :return: list of distinct subspaces; its length is the minimum of 5
    and the number of possible combinations
    :rtype: list
    """
    if not 0 <= max_features <= features:
        raise ValueError(
            "max_features must be between 0 and the number of features"
        )
    # Generate at most 5 combinations
    if max_features == features:
        set_length = 1
    else:
        # Exact integer arithmetic: true division (/) would convert the
        # quotient to float and raise OverflowError when the number of
        # combinations exceeds the float range (e.g. 2000 choose 1000)
        number = factorial(features) // (
            factorial(max_features) * factorial(features - max_features)
        )
        set_length = min(5, number)
    comb = set()
    # Sorting each sample makes equal subsets collapse into one set entry
    while len(comb) < set_length:
        comb.add(
            tuple(sorted(random.sample(range(features), max_features)))
        )
    return list(comb)
def _get_subspaces_set( def _get_subspaces_set(
self, dataset: np.array, labels: np.array, max_features: int self, dataset: np.array, labels: np.array, max_features: int
) -> np.array: ) -> np.array:
features = range(dataset.shape[1]) features_sets = self._generate_spaces(dataset.shape[1], max_features)
features_sets = list(combinations(features, max_features))
if len(features_sets) > 1: if len(features_sets) > 1:
if self._splitter_type == "random": if self._splitter_type == "random":
index = random.randint(0, len(features_sets) - 1) index = random.randint(0, len(features_sets) - 1)
return features_sets[index] return features_sets[index]
else: else:
# get only 3 sets at most
if len(features_sets) > 3:
features_sets = random.sample(features_sets, 3)
return self._select_best_set(dataset, labels, features_sets) return self._select_best_set(dataset, labels, features_sets)
else: else:
return features_sets[0] return features_sets[0]
@@ -284,9 +297,8 @@ class Splitter:
:type data: np.array (m, n_classes) :type data: np.array (m, n_classes)
:param y: vector of labels (classes) :param y: vector of labels (classes)
:type y: np.array (m,) :type y: np.array (m,)
:return: vector with the class assigned to each sample values :return: column of dataset to be taken into account to split dataset
(can be 0, 1, ...) -1 if none produces information gain :rtype: int
:rtype: np.array shape (m,)
""" """
max_gain = 0 max_gain = 0
selected = -1 selected = -1
@@ -307,8 +319,8 @@ class Splitter:
:type data: np.array (m, n_classes) :type data: np.array (m, n_classes)
:param y: vector of labels (classes) :param y: vector of labels (classes)
:type y: np.array (m,) :type y: np.array (m,)
:return: vector with distances to hyperplane (can be positive or neg.) :return: column of dataset to be taken into account to split dataset
:rtype: np.array shape (m,) :rtype: int
""" """
# select the class with max number of samples # select the class with max number of samples
_, samples = np.unique(y, return_counts=True) _, samples = np.unique(y, return_counts=True)
@@ -489,7 +501,7 @@ class Stree(BaseEstimator, ClassifierMixin):
sample_weight: np.ndarray, sample_weight: np.ndarray,
depth: int, depth: int,
title: str, title: str,
) -> Snode: ) -> Optional[Snode]:
"""Recursive function to split the original dataset into predictor """Recursive function to split the original dataset into predictor
nodes (leaves) nodes (leaves)

View File

@@ -166,6 +166,14 @@ class Splitter_test(unittest.TestCase):
self.assertEqual((6,), computed_data.shape) self.assertEqual((6,), computed_data.shape)
self.assertListEqual(expected.tolist(), computed_data.tolist()) self.assertListEqual(expected.tolist(), computed_data.tolist())
def test_generate_subspaces(self):
    """Check the number of subspaces returned by _generate_spaces

    With 250 features every subspace size from 2 to 249 must produce
    exactly 5 subspaces; when fewer combinations exist (3 choose 2,
    4 choose 3) all of them must be returned
    """
    total = 250
    for size in range(2, total):
        spaces = Splitter._generate_spaces(total, size)
        self.assertEqual(5, len(spaces))
    # fewer than 5 combinations available: expect every one of them
    for n_features, size, expected in ((3, 2, 3), (4, 3, 4)):
        spaces = Splitter._generate_spaces(n_features, size)
        self.assertEqual(expected, len(spaces))
def test_best_splitter_few_sets(self): def test_best_splitter_few_sets(self):
X, y = load_iris(return_X_y=True) X, y = load_iris(return_X_y=True)
X = np.delete(X, 3, 1) X = np.delete(X, 3, 1)
@@ -176,14 +184,14 @@ class Splitter_test(unittest.TestCase):
def test_splitter_parameter(self): def test_splitter_parameter(self):
expected_values = [ expected_values = [
[0, 1, 7, 9], # best entropy max_samples [1, 4, 9, 12], # best entropy max_samples
[3, 8, 10, 11], # best entropy impurity [1, 3, 6, 10], # best entropy impurity
[0, 2, 8, 12], # best gini max_samples [6, 8, 10, 12], # best gini max_samples
[1, 2, 5, 12], # best gini impurity [7, 8, 10, 11], # best gini impurity
[1, 2, 5, 10], # random entropy max_samples [0, 3, 8, 12], # random entropy max_samples
[4, 8, 9, 12], # random entropy impurity [0, 3, 9, 11], # random entropy impurity
[3, 9, 11, 12], # random gini max_samples [0, 4, 7, 12], # random gini max_samples
[1, 5, 6, 9], # random gini impurity [0, 2, 5, 6], # random gini impurity
] ]
X, y = load_wine(return_X_y=True) X, y = load_wine(return_X_y=True)
rn = 0 rn = 0

View File

@@ -313,7 +313,7 @@ class Stree_test(unittest.TestCase):
X, y = load_dataset(self._random_state) X, y = load_dataset(self._random_state)
clf = Stree(random_state=self._random_state, max_features=2) clf = Stree(random_state=self._random_state, max_features=2)
clf.fit(X, y) clf.fit(X, y)
self.assertAlmostEqual(0.944, clf.score(X, y)) self.assertAlmostEqual(0.9246666666666666, clf.score(X, y))
def test_bogus_splitter_parameter(self): def test_bogus_splitter_parameter(self):
clf = Stree(splitter="duck") clf = Stree(splitter="duck")