First commit

2025-07-11 16:22:00 +00:00 · 2020-06-12 22:45:58 +02:00 · 2020-06-12 22:45:58 +02:00 · 872914dca7
commit 872914dca7
parent eae2eaf663
12 changed files with 314 additions and 0 deletions
--- a/.coveragerc
+++ b/.coveragerc
@ -0,0 +1,14 @@
+[run]
+branch = True
+source = odte
+
+[report]
+exclude_lines =
+    if self.debug:
+    pragma: no cover
+    raise NotImplementedError
+    if __name__ == .__main__.:
+ignore_errors = True
+omit =
+    odte/tests/*
+    odte/__init__.py
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -0,0 +1,10 @@
+repos:
+-   repo: https://github.com/ambv/black
+    rev: stable
+    hooks:
+    - id: black
+      language_version: python3.7
+-   repo: https://gitlab.com/pycqa/flake8
+    rev: 3.7.9
+    hooks:
+    - id: flake8
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@ -0,0 +1,15 @@
+{
+    // Use IntelliSense para saber los atributos posibles.
+    // Mantenga el puntero para ver las descripciones de los existentes atributos.
+    // Para más información, visite: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "Python: Archivo actual",
+            "type": "python",
+            "request": "launch",
+            "program": "${file}",
+            "console": "integratedTerminal"
+        }
+    ]
+}
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@ -0,0 +1,18 @@
+{
+    "python.testing.unittestArgs": [
+        
+    ],
+    "python.testing.pytestEnabled": false,
+    "python.testing.nosetestsEnabled": false,
+    "python.testing.unittestEnabled": true,
+    "python.pythonPath": "/Users/rmontanana/.virtualenvs/general/bin/python",
+    "python.linting.flake8Enabled": true,
+    "python.linting.enabled": true,
+    "editor.rulers": [
+        80,
+        100
+    ],
+    "python.linting.pylintEnabled": false,
+    "restructuredtext.confPath": "${workspaceFolder}/docs/source"
+   
+}
--- a/codecov.yml
+++ b/codecov.yml
@ -0,0 +1,12 @@
+coverage:
+  status:
+    project:
+      default:
+        target: 90%
+comment:
+  layout: "reach, diff, flags, files"
+  behavior: default
+  require_changes: false  
+  require_base: yes
+  require_head: yes       
+  branches: null
--- a/odte/Odte.py
+++ b/odte/Odte.py
@ -0,0 +1,152 @@
+"""
+__author__ = "Ricardo Montañana Gómez"
+__copyright__ = "Copyright 2020, Ricardo Montañana Gómez"
+__license__ = "MIT"
+__version__ = "0.1"
+Build a forest of oblique trees based on STree
+"""
+
+import numpy as np
+
+from sklearn.utils import check_consistent_length
+from sklearn.metrics._classification import _weighted_sum, _check_targets
+from sklearn.utils.multiclass import check_classification_targets
+from sklearn.base import BaseEstimator, ClassifierMixin
+from sklearn.utils.validation import (
+    check_X_y,
+    check_array,
+    check_is_fitted,
+    _check_sample_weight,
+)
+
+from stree import Stree
+
+
+class Odte(BaseEstimator, ClassifierMixin):
+    def __init__(
+        self,
+        random_state: int = None,
+        C: int = 1,
+        n_estimators: int = 100,
+        max_iter: int = 1000,
+        max_depth: int = None,
+        min_samples_split: int = 0,
+        bootstrap: bool = True,
+        split_criteria: str = "min_distance",
+        tol: float = 1e-4,
+        gamma="scale",
+        degree: int = 3,
+        kernel: str = "linear",
+        max_features="auto",
+        max_samples=None,
+    ):
+        self.n_estimators = n_estimators
+        self.bootstrap = bootstrap
+        self.random_state = random_state
+        self.max_features = max_features
+        self.max_samples = max_samples
+        self.estimator_params = dict(
+            C=C,
+            random_state=random_state,
+            min_samples_split=min_samples_split,
+            max_depth=max_depth,
+            split_criteria=split_criteria,
+            kernel=kernel,
+            max_iter=max_iter,
+            tol=tol,
+            degree=degree,
+            gamma=gamma,
+        )
+
+    def _initialize_random(self) -> np.random.mtrand.RandomState:
+        if self.random_state is None:
+            return np.random.mtrand._rand
+        else:
+            return np.random.RandomState(self.random_state)
+
+    def _initialize_sample_weight(
+        self, sample_weight: np.array, n_samples: int
+    ) -> np.array:
+        if sample_weight is None:
+            return np.ones((n_samples,), dtype=np.float64)
+        else:
+            return sample_weight.copy()
+
+    def fit(
+        self, X: np.array, y: np.array, sample_weight: np.array = None
+    ) -> "Odte":
+        # Check parameters are Ok.
+        if self.n_estimators < 10:
+            raise ValueError(
+                f"n_estimators must be greater than 9... got (n_estimators=\
+                    {self.n_estimators:f})"
+            )
+        # the rest of parameters are checked in estimator
+        check_classification_targets(y)
+        X, y = check_X_y(X, y)
+        sample_weight = _check_sample_weight(sample_weight, X)
+        check_classification_targets(y)
+        # Initialize computed parameters
+        self.classes_, y = np.unique(y, return_inverse=True)
+        self.n_classes_ = self.classes_.shape[0]
+        self.estimators_ = []
+        self._train(X, y, sample_weight)
+
+    def _train(
+        self, X: np.array, y: np.array, sample_weight: np.array
+    ) -> "Odte":
+        random_box = self._initialize_random()
+        n_samples = X.shape[0]
+        weights = self._initialize_sample_weight(sample_weight, n_samples)
+        boot_samples = self._get_bootstrap_n_samples(n_samples)
+        for _ in range(self.n_estimators):
+            # Build clf
+            clf = Stree().set_params(**self.estimator_params)
+            self.estimators_.append(clf)
+            # bootstrap
+            indices = random_box.randint(0, n_samples, boot_samples)
+            # update weights with the chosen samples
+            weights_update = np.bincount(indices, minlength=n_samples)
+            current_weights = weights * weights_update
+            # train the classifier
+            clf.fit(X[indices, :], y[indices, :], current_weights[indices, :])
+
+    def _get_bootstrap_n_samples(self, n_samples) -> int:
+        if self.max_samples is None:
+            return n_samples
+        if type(self.max_samples) == int:
+            if not (1 <= self.max_samples <= n_samples):
+                message = f"max_samples should be in the range 1 to \
+                    {n_samples} but got {self.max_samples}"
+                raise ValueError(message)
+            return self.max_samples
+        if type(self.max_samples) == float:
+            if not (0 < self.max_samples < 1):
+                message = f"max_samples should be in the range (0, 1)\
+                    but got {self.max_samples}"
+                raise ValueError(message)
+            return int(round(self.max_samples * n_samples))
+        raise ValueError(
+            f"Expected values int, float but got \
+            {type(self.max_samples)}"
+        )
+
+    def predict(self, X: np.array):
+        # todo
+        check_is_fitted(self, ["estimators_"])
+        # Input validation
+        X = check_array(X)
+
+    def score(
+        self, X: np.array, y: np.array, sample_weight: np.array
+    ) -> float:
+        # todo
+        check_is_fitted(self, ["estimators_"])
+        check_classification_targets(y)
+        X, y = check_X_y(X, y)
+        y_pred = self.predict(X).reshape(y.shape)
+        # Compute accuracy for each possible representation
+        y_type, y_true, y_pred = _check_targets(y, y_pred)
+        check_consistent_length(y_true, y_pred, sample_weight)
+        score = y_true == y_pred
+        return _weighted_sum(score, sample_weight, normalize=True)
--- a/odte/__init__.py
+++ b/odte/__init__.py
@ -0,0 +1,3 @@
+from .Odte import Odte
+
+__all__ = ["Odte"]
--- a/odte/tests/Odte_tests.py
+++ b/odte/tests/Odte_tests.py
@ -0,0 +1,49 @@
+import unittest
+import numpy as np
+
+from odte import Odte
+from .utils import load_dataset
+
+
+class Odte_test(unittest.TestCase):
+    def __init__(self, *args, **kwargs):
+        self._random_state = 1
+        super().__init__(*args, **kwargs)
+
+    def test_max_samples_bogus(self):
+        values = [0, 3000, 1.1, 0.0, "hi!"]
+        for max_samples in values:
+            with self.assertRaises(ValueError):
+                tclf = Odte(max_samples=max_samples)
+                tclf.fit(*load_dataset(self._random_state))
+
+    def test_get_bootstrap_nsamples(self):
+        expected_values = [(1, 1), (1500, 1500), (0.1, 150)]
+        for value, expected in expected_values:
+            tclf = Odte(max_samples=value)
+            computed = tclf._get_bootstrap_n_samples(1500)
+            self.assertEqual(expected, computed)
+
+    def test_initialize_sample_weight(self):
+        m = 5
+        ones = np.ones(m,)
+        weights = np.random.rand(m,)
+        expected_values = [(None, ones), (weights, weights)]
+        for value, expected in expected_values:
+            tclf = Odte()
+            computed = tclf._initialize_sample_weight(value, m)
+            self.assertListEqual(expected.tolist(), computed.tolist())
+
+    def test_initialize_random(self):
+        expected = [37, 235, 908]
+        tclf = Odte(random_state=self._random_state)
+        box = tclf._initialize_random()
+        computed = box.randint(0, 1000, 3)
+        self.assertListEqual(expected, computed.tolist())
+        # test None
+        tclf = Odte()
+        box = tclf._initialize_random()
+        computed = box.randint(101, 1000, 3)
+        for value in computed.tolist():
+            self.assertGreaterEqual(value, 101)
+            self.assertLessEqual(value, 1000)
--- a/odte/tests/__init__.py
+++ b/odte/tests/__init__.py
@ -0,0 +1,3 @@
+from .Odte_tests import Odte_test
+
+__all__ = ["Odte_test"]
--- a/odte/tests/utils.py
+++ b/odte/tests/utils.py
@ -0,0 +1,17 @@
+from sklearn.datasets import make_classification
+
+
+def load_dataset(random_state=0, n_classes=2):
+    X, y = make_classification(
+        n_samples=1500,
+        n_features=3,
+        n_informative=3,
+        n_redundant=0,
+        n_repeated=0,
+        n_classes=n_classes,
+        n_clusters_per_class=2,
+        class_sep=1.5,
+        flip_y=0,
+        random_state=random_state,
+    )
+    return X, y
--- a/pyproject.toml
+++ b/pyproject.toml
@ -0,0 +1,16 @@
+[tool.black]
+line-length = 79
+include = '\.pyi?$'
+exclude = '''
+/(
+    \.git
+  | \.hg
+  | \.mypy_cache
+  | \.tox
+  | \.venv
+  | _build
+  | buck-out
+  | build
+  | dist
+)/
+'''
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,5 @@
+numpy
+scikit-learn
+pandas
+ipympl
+stree