diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 0000000..78a0f78 --- /dev/null +++ b/.coveragerc @@ -0,0 +1,14 @@ +[run] +branch = True +source = odte + +[report] +exclude_lines = + if self.debug: + pragma: no cover + raise NotImplementedError + if __name__ == .__main__.: +ignore_errors = True +omit = + odte/tests/* + odte/__init__.py \ No newline at end of file diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..776119f --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,10 @@ +repos: +- repo: https://github.com/ambv/black + rev: stable + hooks: + - id: black + language_version: python3.7 +- repo: https://gitlab.com/pycqa/flake8 + rev: 3.7.9 + hooks: + - id: flake8 \ No newline at end of file diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..0f6e6bd --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,15 @@ +{ + // Use IntelliSense para saber los atributos posibles. + // Mantenga el puntero para ver las descripciones de los existentes atributos. 
+ // Para más información, visite: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "Python: Archivo actual", + "type": "python", + "request": "launch", + "program": "${file}", + "console": "integratedTerminal" + } + ] +} \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..db397ef --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,18 @@ +{ + "python.testing.unittestArgs": [ + + ], + "python.testing.pytestEnabled": false, + "python.testing.nosetestsEnabled": false, + "python.testing.unittestEnabled": true, + "python.pythonPath": "/Users/rmontanana/.virtualenvs/general/bin/python", + "python.linting.flake8Enabled": true, + "python.linting.enabled": true, + "editor.rulers": [ + 80, + 100 + ], + "python.linting.pylintEnabled": false, + "restructuredtext.confPath": "${workspaceFolder}/docs/source" + +} \ No newline at end of file diff --git a/codecov.yml b/codecov.yml new file mode 100644 index 0000000..222249f --- /dev/null +++ b/codecov.yml @@ -0,0 +1,12 @@ +coverage: + status: + project: + default: + target: 90% +comment: + layout: "reach, diff, flags, files" + behavior: default + require_changes: false + require_base: yes + require_head: yes + branches: null \ No newline at end of file diff --git a/odte/Odte.py b/odte/Odte.py new file mode 100644 index 0000000..e1cbfa0 --- /dev/null +++ b/odte/Odte.py @@ -0,0 +1,152 @@ +""" +__author__ = "Ricardo Montañana Gómez" +__copyright__ = "Copyright 2020, Ricardo Montañana Gómez" +__license__ = "MIT" +__version__ = "0.1" +Build a forest of oblique trees based on STree +""" + +import numpy as np + +from sklearn.utils import check_consistent_length +from sklearn.metrics._classification import _weighted_sum, _check_targets +from sklearn.utils.multiclass import check_classification_targets +from sklearn.base import BaseEstimator, ClassifierMixin +from sklearn.utils.validation import ( + check_X_y, 
class Odte(BaseEstimator, ClassifierMixin):
    """Ensemble of oblique decision trees (``Stree``) trained on resamples.

    ``fit`` builds ``n_estimators`` ``Stree`` classifiers, each trained on
    rows drawn from the training set, and ``predict`` combines them by
    majority vote.

    Parameters
    ----------
    random_state : int, optional
        Seed of the generator used to draw the bootstrap samples; it is
        also forwarded to every tree.
    C, max_iter, max_depth, min_samples_split, split_criteria, tol, gamma,
    degree, kernel
        Forwarded unchanged to each ``Stree`` through ``set_params``.
    n_estimators : int
        Number of trees; ``fit`` rejects values below 10.
    bootstrap : bool
        When True (default) each tree is trained on rows drawn with
        replacement; when False each tree is trained on every row once.
    max_features
        Stored, but not used by the code in this class yet.
    max_samples : int, float or None
        Size of each bootstrap sample, see ``_get_bootstrap_n_samples``.

    Attributes set by ``fit``
    -------------------------
    classes_ : sorted unique labels found in ``y``.
    n_classes_ : number of entries in ``classes_``.
    estimators_ : list with the fitted ``Stree`` instances.
    """

    def __init__(
        self,
        random_state: int = None,
        C: int = 1,
        n_estimators: int = 100,
        max_iter: int = 1000,
        max_depth: int = None,
        min_samples_split: int = 0,
        bootstrap: bool = True,
        split_criteria: str = "min_distance",
        tol: float = 1e-4,
        gamma="scale",
        degree: int = 3,
        kernel: str = "linear",
        max_features="auto",
        max_samples=None,
    ):
        # Every constructor argument is kept under its own name because
        # BaseEstimator.get_params()/set_params() (and therefore clone()
        # and repr()) read them back with getattr().
        self.random_state = random_state
        self.C = C
        self.n_estimators = n_estimators
        self.max_iter = max_iter
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.bootstrap = bootstrap
        self.split_criteria = split_criteria
        self.tol = tol
        self.gamma = gamma
        self.degree = degree
        self.kernel = kernel
        self.max_features = max_features
        self.max_samples = max_samples

    @property
    def estimator_params(self) -> dict:
        """Keyword arguments handed to every ``Stree`` of the ensemble.

        Built on demand from the instance attributes so that values
        changed through ``set_params`` are honoured by the next ``fit``.
        """
        return dict(
            C=self.C,
            random_state=self.random_state,
            min_samples_split=self.min_samples_split,
            max_depth=self.max_depth,
            split_criteria=self.split_criteria,
            kernel=self.kernel,
            max_iter=self.max_iter,
            tol=self.tol,
            degree=self.degree,
            gamma=self.gamma,
        )

    def _initialize_random(self) -> np.random.RandomState:
        """Return the generator used for bootstrapping.

        With ``random_state=None`` NumPy's global generator is returned,
        otherwise a fresh ``RandomState`` seeded with ``random_state``.
        """
        if self.random_state is None:
            return np.random.mtrand._rand
        return np.random.RandomState(self.random_state)

    def _initialize_sample_weight(
        self, sample_weight: np.ndarray, n_samples: int
    ) -> np.ndarray:
        """Return a private float64 weight vector.

        ``None`` yields ``n_samples`` ones; anything else is copied so the
        caller's array is never modified.
        """
        if sample_weight is None:
            return np.ones((n_samples,), dtype=np.float64)
        return np.array(sample_weight, dtype=np.float64)

    def fit(
        self, X: np.ndarray, y: np.ndarray, sample_weight: np.ndarray = None
    ) -> "Odte":
        """Train the ensemble and return ``self``.

        Raises
        ------
        ValueError
            If ``n_estimators`` is below 10, if ``max_samples`` is invalid
            or if ``X``/``y`` do not pass the scikit-learn validators.
        """
        if self.n_estimators < 10:
            raise ValueError(
                "n_estimators must be greater than 9... got "
                f"(n_estimators={self.n_estimators})"
            )
        # the rest of the parameters are checked by the base estimator
        check_classification_targets(y)
        X, y = check_X_y(X, y)
        sample_weight = _check_sample_weight(sample_weight, X)
        # Labels are encoded as 0..n_classes_-1; the trees only see codes.
        self.classes_, y = np.unique(y, return_inverse=True)
        self.n_classes_ = self.classes_.shape[0]
        self.estimators_ = []
        self._train(X, y, sample_weight)
        return self

    def _train(
        self, X: np.ndarray, y: np.ndarray, sample_weight: np.ndarray
    ) -> "Odte":
        """Fit ``n_estimators`` trees, appending them to ``estimators_``."""
        random_box = self._initialize_random()
        n_samples = X.shape[0]
        weights = self._initialize_sample_weight(sample_weight, n_samples)
        # Validated once, before any tree is built.
        boot_samples = self._get_bootstrap_n_samples(n_samples)
        tree_params = self.estimator_params
        for _ in range(self.n_estimators):
            clf = Stree().set_params(**tree_params)
            self.estimators_.append(clf)
            if self.bootstrap:
                indices = random_box.randint(0, n_samples, boot_samples)
            else:
                indices = np.arange(n_samples)
            # Scale each weight by how often its row was drawn.
            weights_update = np.bincount(indices, minlength=n_samples)
            current_weights = weights * weights_update
            # y and the weights are 1-D, so they take a single index.
            clf.fit(X[indices], y[indices], current_weights[indices])
        return self

    def _get_bootstrap_n_samples(self, n_samples) -> int:
        """Translate ``max_samples`` into a number of rows.

        ``None`` means ``n_samples``; an integer must lie in
        ``[1, n_samples]``; a float must lie in ``(0, 1)`` and is taken as
        a fraction of ``n_samples``.

        Raises
        ------
        ValueError
            For out-of-range values or any other type (bool included).
        """
        max_samples = self.max_samples
        if max_samples is None:
            return n_samples
        # bool is a subclass of int; it was never an accepted value.
        is_bool = isinstance(max_samples, (bool, np.bool_))
        if isinstance(max_samples, (int, np.integer)) and not is_bool:
            if not (1 <= max_samples <= n_samples):
                raise ValueError(
                    "max_samples should be in the range 1 to "
                    f"{n_samples} but got {max_samples}"
                )
            return int(max_samples)
        if isinstance(max_samples, (float, np.floating)):
            if not (0 < max_samples < 1):
                raise ValueError(
                    "max_samples should be in the range (0, 1) "
                    f"but got {max_samples}"
                )
            return int(round(max_samples * n_samples))
        raise ValueError(
            f"Expected values int, float but got {type(max_samples)}"
        )

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict labels for ``X`` by majority vote of the trees.

        Ties go to the class that comes first in ``classes_``.
        """
        check_is_fitted(self, ["estimators_"])
        # Input validation
        X = check_array(X)
        n_rows = X.shape[0]
        rows = np.arange(n_rows)
        votes = np.zeros((n_rows, self.n_classes_), dtype=np.int64)
        for clf in self.estimators_:
            # The trees were fitted on the integer codes produced in fit();
            # assumes Stree.predict returns those same codes - TODO confirm.
            codes = np.asarray(clf.predict(X)).astype(np.int64).ravel()
            votes[rows, codes] += 1
        return self.classes_[votes.argmax(axis=1)]

    def score(
        self, X: np.ndarray, y: np.ndarray, sample_weight: np.ndarray = None
    ) -> float:
        """Return the (optionally weighted) accuracy of ``predict(X)``."""
        check_is_fitted(self, ["estimators_"])
        check_classification_targets(y)
        X, y = check_X_y(X, y)
        y_pred = self.predict(X).reshape(y.shape)
        # Compute accuracy for each possible representation
        _, y_true, y_pred = _check_targets(y, y_pred)
        check_consistent_length(y_true, y_pred, sample_weight)
        hits = y_true == y_pred
        return _weighted_sum(hits, sample_weight, normalize=True)
def load_dataset(random_state=0, n_classes=2):
    """Build the synthetic classification problem used by the tests.

    Delegates to ``make_classification`` asking for 1500 samples with 3
    informative features, no redundant or repeated features, 2 clusters
    per class, a class separation of 1.5 and no label flipping.

    :param random_state: seed forwarded to ``make_classification``
    :param n_classes: number of classes to generate
    :return: the ``(X, y)`` pair produced by ``make_classification``
    """
    settings = {
        "n_samples": 1500,
        "n_features": 3,
        "n_informative": 3,
        "n_redundant": 0,
        "n_repeated": 0,
        "n_classes": n_classes,
        "n_clusters_per_class": 2,
        "class_sep": 1.5,
        "flip_y": 0,
        "random_state": random_state,
    }
    features, labels = make_classification(**settings)
    return features, labels