mirror of
https://github.com/Doctorado-ML/STree.git
synced 2025-08-17 16:36:01 +00:00
Compare commits
7 Commits
0.9rc6
...
Adding-Git
Author | SHA1 | Date | |
---|---|---|---|
98881cbd45
|
|||
cdb9fd6faa
|
|||
82f7352f9a
|
|||
8359e442e5
|
|||
|
673081cdc5 | ||
|
36816074ff | ||
475ad7e752
|
47
.github/workflows/main.yml
vendored
Normal file
47
.github/workflows/main.yml
vendored
Normal file
@@ -0,0 +1,47 @@
|
|||||||
|
# Continuous integration: lint, run the unit tests with coverage and publish
# the coverage report to Codecov and Codacy.
name: CI

on:
  push:
    branches: [master]
  pull_request:
    branches: [master]
  # allows launching the workflow by hand from the Actions tab
  workflow_dispatch:

jobs:
  build:
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: [macos-latest, ubuntu-latest]
        # Quoted on purpose: an unquoted version is parsed as a YAML float,
        # so a future entry such as 3.10 would silently become 3.1
        python: ["3.8"]

    steps:
      # NOTE(review): the action versions below are kept as pinned in the
      # original workflow; check whether newer major versions are required
      - uses: actions/checkout@v2
      - name: Set up Python ${{ matrix.python }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python }}
      - name: Install dependencies
        run: |
          pip install -q --upgrade pip
          pip install -q -r requirements.txt
          pip install -q --upgrade codecov coverage black flake8 codacy-coverage
      - name: Lint
        run: |
          black --check --diff stree
          flake8 --count stree
      - name: Tests
        run: |
          coverage run -m unittest -v stree.tests
          coverage xml
      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v1
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          files: ./coverage.xml
      - name: Run codacy-coverage-reporter
        # the report is identical on every OS, so it is sent only once
        if: runner.os == 'Linux'
        uses: codacy/codacy-coverage-reporter-action@master
        with:
          project-token: ${{ secrets.CODACY_PROJECT_TOKEN }}
          coverage-reports: coverage.xml
|
16
README.md
16
README.md
@@ -1,6 +1,6 @@
|
|||||||
[](https://app.codeship.com/projects/399170)
|

|
||||||
[](https://codecov.io/gh/doctorado-ml/stree)
|
[](https://codecov.io/gh/doctorado-ml/stree)
|
||||||
[](https://www.codacy.com/gh/Doctorado-ML/STree?utm_source=github.com&utm_medium=referral&utm_content=Doctorado-ML/STree&utm_campaign=Badge_Grade)
|
[](https://www.codacy.com/gh/Doctorado-ML/STree?utm_source=github.com&utm_medium=referral&utm_content=Doctorado-ML/STree&utm_campaign=Badge_Grade)
|
||||||
|
|
||||||
# Stree
|
# Stree
|
||||||
|
|
||||||
@@ -18,17 +18,17 @@ pip install git+https://github.com/doctorado-ml/stree
|
|||||||
|
|
||||||
### Jupyter notebooks
|
### Jupyter notebooks
|
||||||
|
|
||||||
* [](https://mybinder.org/v2/gh/Doctorado-ML/STree/master?urlpath=lab/tree/notebooks/benchmark.ipynb) Benchmark
|
- [](https://mybinder.org/v2/gh/Doctorado-ML/STree/master?urlpath=lab/tree/notebooks/benchmark.ipynb) Benchmark
|
||||||
|
|
||||||
* [](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/benchmark.ipynb) Benchmark
|
- [](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/benchmark.ipynb) Benchmark
|
||||||
|
|
||||||
* [](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/features.ipynb) Test features
|
- [](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/features.ipynb) Test features
|
||||||
|
|
||||||
* [](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/adaboost.ipynb) Adaboost
|
- [](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/adaboost.ipynb) Adaboost
|
||||||
|
|
||||||
* [](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/gridsearch.ipynb) Gridsearch
|
- [](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/gridsearch.ipynb) Gridsearch
|
||||||
|
|
||||||
* [](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/test_graphs.ipynb) Test Graphics
|
- [](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/test_graphs.ipynb) Test Graphics
|
||||||
|
|
||||||
### Command line
|
### Command line
|
||||||
|
|
||||||
|
@@ -1,4 +1,4 @@
|
|||||||
numpy
|
numpy
|
||||||
scikit-learn
|
scikit-learn==0.23.2
|
||||||
pandas
|
pandas
|
||||||
ipympl
|
ipympl
|
2
setup.py
2
setup.py
@@ -30,7 +30,7 @@ setuptools.setup(
|
|||||||
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
||||||
"Intended Audience :: Science/Research",
|
"Intended Audience :: Science/Research",
|
||||||
],
|
],
|
||||||
install_requires=["scikit-learn>=0.23.0", "numpy", "ipympl"],
|
install_requires=["scikit-learn==0.23.2", "numpy", "ipympl"],
|
||||||
test_suite="stree.tests",
|
test_suite="stree.tests",
|
||||||
zip_safe=False,
|
zip_safe=False,
|
||||||
)
|
)
|
||||||
|
@@ -10,8 +10,8 @@ import os
|
|||||||
import numbers
|
import numbers
|
||||||
import random
|
import random
|
||||||
import warnings
|
import warnings
|
||||||
from math import log
|
from math import log, factorial
|
||||||
from itertools import combinations
|
from typing import Optional
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from sklearn.base import BaseEstimator, ClassifierMixin
|
from sklearn.base import BaseEstimator, ClassifierMixin
|
||||||
from sklearn.svm import SVC, LinearSVC
|
from sklearn.svm import SVC, LinearSVC
|
||||||
@@ -253,19 +253,32 @@ class Splitter:
|
|||||||
selected = feature_set
|
selected = feature_set
|
||||||
return selected if selected is not None else feature_set
|
return selected if selected is not None else feature_set
|
||||||
|
|
||||||
|
@staticmethod
def _generate_spaces(features: int, max_features: int) -> list:
    """Draw a small random collection of distinct feature subsets

    :param features: total number of features (columns) available
    :type features: int
    :param max_features: number of features included in every subset
    :type max_features: int
    :raises ValueError: if max_features is negative or greater than features
    :return: at most 5 distinct subsets, each one a sorted tuple with
        max_features indices taken from range(features)
    :rtype: list
    """
    # Generate at most 5 combinations
    if max_features == features:
        # the only possible subset is the complete feature set
        set_length = 1
    else:
        # C(features, max_features) computed with exact integer arithmetic;
        # true division (/) converts the result to float and raises
        # OverflowError once the number of combinations exceeds float range
        number = factorial(features) // (
            factorial(max_features) * factorial(features - max_features)
        )
        set_length = min(5, number)
    comb = set()
    # every sample is sorted so that the set discards repeated subsets
    while len(comb) < set_length:
        comb.add(
            tuple(sorted(random.sample(range(features), max_features)))
        )
    return list(comb)
|
||||||
|
|
||||||
def _get_subspaces_set(
|
def _get_subspaces_set(
|
||||||
self, dataset: np.array, labels: np.array, max_features: int
|
self, dataset: np.array, labels: np.array, max_features: int
|
||||||
) -> np.array:
|
) -> np.array:
|
||||||
features = range(dataset.shape[1])
|
features_sets = self._generate_spaces(dataset.shape[1], max_features)
|
||||||
features_sets = list(combinations(features, max_features))
|
|
||||||
if len(features_sets) > 1:
|
if len(features_sets) > 1:
|
||||||
if self._splitter_type == "random":
|
if self._splitter_type == "random":
|
||||||
index = random.randint(0, len(features_sets) - 1)
|
index = random.randint(0, len(features_sets) - 1)
|
||||||
return features_sets[index]
|
return features_sets[index]
|
||||||
else:
|
else:
|
||||||
# get only 3 sets at most
|
|
||||||
if len(features_sets) > 3:
|
|
||||||
features_sets = random.sample(features_sets, 3)
|
|
||||||
return self._select_best_set(dataset, labels, features_sets)
|
return self._select_best_set(dataset, labels, features_sets)
|
||||||
else:
|
else:
|
||||||
return features_sets[0]
|
return features_sets[0]
|
||||||
@@ -284,9 +297,8 @@ class Splitter:
|
|||||||
:type data: np.array (m, n_classes)
|
:type data: np.array (m, n_classes)
|
||||||
:param y: vector of labels (classes)
|
:param y: vector of labels (classes)
|
||||||
:type y: np.array (m,)
|
:type y: np.array (m,)
|
||||||
:return: vector with the class assigned to each sample values
|
:return: column of dataset to be taken into account to split dataset
|
||||||
(can be 0, 1, ...) -1 if none produces information gain
|
:rtype: int
|
||||||
:rtype: np.array shape (m,)
|
|
||||||
"""
|
"""
|
||||||
max_gain = 0
|
max_gain = 0
|
||||||
selected = -1
|
selected = -1
|
||||||
@@ -307,8 +319,8 @@ class Splitter:
|
|||||||
:type data: np.array (m, n_classes)
|
:type data: np.array (m, n_classes)
|
||||||
:param y: vector of labels (classes)
|
:param y: vector of labels (classes)
|
||||||
:type y: np.array (m,)
|
:type y: np.array (m,)
|
||||||
:return: vector with distances to hyperplane (can be positive or neg.)
|
:return: column of dataset to be taken into account to split dataset
|
||||||
:rtype: np.array shape (m,)
|
:rtype: int
|
||||||
"""
|
"""
|
||||||
# select the class with max number of samples
|
# select the class with max number of samples
|
||||||
_, samples = np.unique(y, return_counts=True)
|
_, samples = np.unique(y, return_counts=True)
|
||||||
@@ -489,7 +501,7 @@ class Stree(BaseEstimator, ClassifierMixin):
|
|||||||
sample_weight: np.ndarray,
|
sample_weight: np.ndarray,
|
||||||
depth: int,
|
depth: int,
|
||||||
title: str,
|
title: str,
|
||||||
) -> Snode:
|
) -> Optional[Snode]:
|
||||||
"""Recursive function to split the original dataset into predictor
|
"""Recursive function to split the original dataset into predictor
|
||||||
nodes (leaves)
|
nodes (leaves)
|
||||||
|
|
||||||
|
@@ -166,6 +166,14 @@ class Splitter_test(unittest.TestCase):
|
|||||||
self.assertEqual((6,), computed_data.shape)
|
self.assertEqual((6,), computed_data.shape)
|
||||||
self.assertListEqual(expected.tolist(), computed_data.tolist())
|
self.assertListEqual(expected.tolist(), computed_data.tolist())
|
||||||
|
|
||||||
|
def test_generate_subspaces(self):
    """Check how many feature subsets Splitter._generate_spaces returns

    With 250 features every subset size from 2 to 249 must produce
    5 subsets; with only 3 or 4 features the number of subsets returned
    is the number of existing combinations (3 and 4 respectively).
    """
    total_features = 250
    for subset_size in range(2, total_features):
        spaces = Splitter._generate_spaces(total_features, subset_size)
        self.assertEqual(5, len(spaces))
    # (features, max_features, expected number of subsets)
    small_cases = ((3, 2, 3), (4, 3, 4))
    for n_features, subset_size, expected in small_cases:
        spaces = Splitter._generate_spaces(n_features, subset_size)
        self.assertEqual(expected, len(spaces))
|
||||||
|
|
||||||
def test_best_splitter_few_sets(self):
|
def test_best_splitter_few_sets(self):
|
||||||
X, y = load_iris(return_X_y=True)
|
X, y = load_iris(return_X_y=True)
|
||||||
X = np.delete(X, 3, 1)
|
X = np.delete(X, 3, 1)
|
||||||
@@ -176,14 +184,14 @@ class Splitter_test(unittest.TestCase):
|
|||||||
|
|
||||||
def test_splitter_parameter(self):
|
def test_splitter_parameter(self):
|
||||||
expected_values = [
|
expected_values = [
|
||||||
[0, 1, 7, 9], # best entropy max_samples
|
[1, 4, 9, 12], # best entropy max_samples
|
||||||
[3, 8, 10, 11], # best entropy impurity
|
[1, 3, 6, 10], # best entropy impurity
|
||||||
[0, 2, 8, 12], # best gini max_samples
|
[6, 8, 10, 12], # best gini max_samples
|
||||||
[1, 2, 5, 12], # best gini impurity
|
[7, 8, 10, 11], # best gini impurity
|
||||||
[1, 2, 5, 10], # random entropy max_samples
|
[0, 3, 8, 12], # random entropy max_samples
|
||||||
[4, 8, 9, 12], # random entropy impurity
|
[0, 3, 9, 11], # random entropy impurity
|
||||||
[3, 9, 11, 12], # random gini max_samples
|
[0, 4, 7, 12], # random gini max_samples
|
||||||
[1, 5, 6, 9], # random gini impurity
|
[0, 2, 5, 6], # random gini impurity
|
||||||
]
|
]
|
||||||
X, y = load_wine(return_X_y=True)
|
X, y = load_wine(return_X_y=True)
|
||||||
rn = 0
|
rn = 0
|
||||||
|
@@ -313,7 +313,7 @@ class Stree_test(unittest.TestCase):
|
|||||||
X, y = load_dataset(self._random_state)
|
X, y = load_dataset(self._random_state)
|
||||||
clf = Stree(random_state=self._random_state, max_features=2)
|
clf = Stree(random_state=self._random_state, max_features=2)
|
||||||
clf.fit(X, y)
|
clf.fit(X, y)
|
||||||
self.assertAlmostEqual(0.944, clf.score(X, y))
|
self.assertAlmostEqual(0.9246666666666666, clf.score(X, y))
|
||||||
|
|
||||||
def test_bogus_splitter_parameter(self):
|
def test_bogus_splitter_parameter(self):
|
||||||
clf = Stree(splitter="duck")
|
clf = Stree(splitter="duck")
|
||||||
|
Reference in New Issue
Block a user