From 580c93d92aafbb3b0e7370aee90d0a3d10496e5b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?= <rmontanana@gmail.com>
Date: Tue, 16 Jun 2020 13:59:17 +0200
Subject: [PATCH] Add setup.py, remove bootstrap boolean parameter

---
 notebooks/benchmark.ipynb | 104 ++++++++++++++++++--------------------
 odte/Odte.py              |   7 ++-
 setup.py                  |  36 +++++++++++++
 3 files changed, 88 insertions(+), 59 deletions(-)
 create mode 100644 setup.py

diff --git a/notebooks/benchmark.ipynb b/notebooks/benchmark.ipynb
index efcd5b8..6d7bb87 100644
--- a/notebooks/benchmark.ipynb
+++ b/notebooks/benchmark.ipynb
@@ -4,7 +4,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# Compare STree with different estimators"
+    "# Compare Odte with different estimators"
    ]
   },
   {
@@ -12,24 +12,25 @@
    "metadata": {},
    "source": [
     "# Setup\n",
-    "Uncomment the next cell if STree is not already installed"
+    "Uncomment the next cell if Odte is not already installed"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 19,
    "metadata": {},
    "outputs": [],
    "source": [
     "#\n",
     "# Google Colab setup\n",
     "#\n",
+    "#!pip install git+https://github.com/doctorado-ml/odte\n",
     "#!pip install git+https://github.com/doctorado-ml/stree"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 20,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -41,12 +42,12 @@
     "from sklearn.metrics import classification_report, confusion_matrix, f1_score\n",
     "from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier\n",
     "from stree import Stree\n",
-    "from odte import"
+    "from odte import Odte"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 21,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -65,15 +66,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 22,
    "metadata": {},
    "outputs": [
     {
-     "name": "stdout",
      "output_type": "stream",
-     "text": [
-      "2020-06-15 00:46:56\n"
-     ]
+     "name": "stdout",
+     "text": "2020-06-15 11:44:45\n"
     }
    ],
    "source": [
@@ -89,7 +88,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 23,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -101,16 +100,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 24,
    "metadata": {},
    "outputs": [
     {
-     "name": "stdout",
      "output_type": "stream",
-     "text": [
-      "Fraud: 0.173% 492\n",
-      "Valid: 99.827% 284,315\n"
-     ]
+     "name": "stdout",
+     "text": "Fraud: 0.173% 492\nValid: 99.827% 284,315\n"
     }
    ],
    "source": [
@@ -120,7 +116,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 25,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -132,16 +128,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 26,
    "metadata": {},
    "outputs": [
     {
-     "name": "stdout",
      "output_type": "stream",
-     "text": [
-      "X shape: (284807, 29)\n",
-      "y shape: (284807,)\n"
-     ]
+     "name": "stdout",
+     "text": "X shape: (284807, 29)\ny shape: (284807,)\n"
     }
    ],
    "source": [
@@ -160,7 +153,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 27,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -171,7 +164,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 28,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -181,7 +174,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 29,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -191,7 +184,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 30,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -201,7 +194,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 31,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -211,7 +204,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 32,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -221,23 +214,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 33,
    "metadata": {},
-   "outputs": [
-    {
-     "ename": "NameError",
-     "evalue": "name 'Odte' is not defined",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
-      "\u001b[0;32m<ipython-input-15-98265fce1448>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0modte\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mOdte\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrandom_state\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mrandom_state\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
-      "\u001b[0;31mNameError\u001b[0m: name 'Odte' is not defined"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
-    "odte = Odte(random_state=random_state)"
+    "# Oblique Decision Tree Ensemble\n",
+    "odte = Odte(random_state=random_state, n_estimators=10, max_features=None)"
    ]
   },
   {
@@ -249,7 +231,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 34,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -274,9 +256,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 35,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": "************************** Linear Tree **********************\nTrain Model Linear Tree took: 14.78 seconds\n=========== Linear Tree - Train 199,364 samples =============\n              precision    recall  f1-score   support\n\n           0   1.000000  1.000000  1.000000    199020\n           1   1.000000  1.000000  1.000000       344\n\n    accuracy                       1.000000    199364\n   macro avg   1.000000  1.000000  1.000000    199364\nweighted avg   1.000000  1.000000  1.000000    199364\n\n=========== Linear Tree - Test 85,443 samples =============\n              precision    recall  f1-score   support\n\n           0   0.999578  0.999613  0.999596     85295\n           1   0.772414  0.756757  0.764505       148\n\n    accuracy                       0.999192     85443\n   macro avg   0.885996  0.878185  0.882050     85443\nweighted avg   0.999184  0.999192  0.999188     85443\n\nConfusion Matrix in Train\n[[199020      0]\n [     0    344]]\nConfusion Matrix in Test\n[[85262    33]\n [   36   112]]\n************************** Random Forest **********************\nTrain Model Random Forest took: 163.9 seconds\n=========== Random Forest - Train 199,364 samples =============\n              precision    recall  f1-score   support\n\n           0   1.000000  1.000000  1.000000    199020\n           1   1.000000  1.000000  1.000000       344\n\n    accuracy                       1.000000    199364\n   macro avg   1.000000  1.000000  1.000000    199364\nweighted avg   1.000000  1.000000  1.000000    199364\n\n=========== Random Forest - Test 85,443 samples =============\n              precision    recall  f1-score   support\n\n           0   0.999660  0.999965  0.999812     85295\n           1   0.975410  0.804054  0.881481       148\n\n    accuracy                       0.999625     85443\n   macro avg   0.987535  0.902009  0.940647     85443\nweighted avg   0.999618  0.999625  0.999607     85443\n\nConfusion Matrix in Train\n[[199020      0]\n 
[     0    344]]\nConfusion Matrix in Test\n[[85292     3]\n [   29   119]]\n************************** Stree (SVM Tree) **********************\nTrain Model Stree (SVM Tree) took: 34.57 seconds\n=========== Stree (SVM Tree) - Train 199,364 samples =============\n              precision    recall  f1-score   support\n\n           0   0.999623  0.999864  0.999744    199020\n           1   0.908784  0.781977  0.840625       344\n\n    accuracy                       0.999488    199364\n   macro avg   0.954204  0.890921  0.920184    199364\nweighted avg   0.999467  0.999488  0.999469    199364\n\n=========== Stree (SVM Tree) - Test 85,443 samples =============\n              precision    recall  f1-score   support\n\n           0   0.999637  0.999918  0.999777     85295\n           1   0.943548  0.790541  0.860294       148\n\n    accuracy                       0.999555     85443\n   macro avg   0.971593  0.895229  0.930036     85443\nweighted avg   0.999540  0.999555  0.999536     85443\n\nConfusion Matrix in Train\n[[198993     27]\n [    75    269]]\nConfusion Matrix in Test\n[[85288     7]\n [   31   117]]\n************************** AdaBoost model **********************\nTrain Model AdaBoost model took: 44.36 seconds\n=========== AdaBoost model - Train 199,364 samples =============\n              precision    recall  f1-score   support\n\n           0   0.999392  0.999678  0.999535    199020\n           1   0.777003  0.648256  0.706815       344\n\n    accuracy                       0.999072    199364\n   macro avg   0.888198  0.823967  0.853175    199364\nweighted avg   0.999008  0.999072  0.999030    199364\n\n=========== AdaBoost model - Test 85,443 samples =============\n              precision    recall  f1-score   support\n\n           0   0.999484  0.999707  0.999596     85295\n           1   0.806202  0.702703  0.750903       148\n\n    accuracy                       0.999192     85443\n   macro avg   0.902843  0.851205  0.875249     85443\nweighted avg   
0.999149  0.999192  0.999165     85443\n\nConfusion Matrix in Train\n[[198956     64]\n [   121    223]]\nConfusion Matrix in Test\n[[85270    25]\n [   44   104]]\n************************** Odte **********************\nTrain Model Odte took: 2.134e+03 seconds\n=========== Odte - Train 199,364 samples =============\n              precision    recall  f1-score   support\n\n           0   0.999583  1.000000  0.999792    199020\n           1   1.000000  0.758721  0.862810       344\n\n    accuracy                       0.999584    199364\n   macro avg   0.999792  0.879360  0.931301    199364\nweighted avg   0.999584  0.999584  0.999555    199364\n\n=========== Odte - Test 85,443 samples =============\n              precision    recall  f1-score   support\n\n           0   0.999543  0.999965  0.999754     85295\n           1   0.973214  0.736486  0.838462       148\n\n    accuracy                       0.999508     85443\n   macro avg   0.986379  0.868226  0.919108     85443\nweighted avg   0.999497  0.999508  0.999474     85443\n\nConfusion Matrix in Train\n[[199020      0]\n [    83    261]]\nConfusion Matrix in Test\n[[85292     3]\n [   39   109]]\n"
+    }
+   ],
    "source": [
     "# Train & Test models\n",
     "models = {\n",
@@ -297,9 +285,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 36,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": "**************************************************************************************************************\n*The best f1 model is Random Forest, with a f1 score: 0.8815 in 163.896 seconds with 0.7 samples in train dataset\n**************************************************************************************************************\nModel: Linear Tree\t Time:  14.78 seconds\t f1: 0.7645\nModel: Random Forest\t Time: 163.90 seconds\t f1: 0.8815\nModel: Stree (SVM Tree)\t Time:  34.57 seconds\t f1: 0.8603\nModel: AdaBoost model\t Time:  44.36 seconds\t f1: 0.7509\nModel: Odte\t Time: 2134.25 seconds\t f1: 0.8385\n"
+    }
+   ],
    "source": [
     "print(\"*\"*110)\n",
     "print(f\"*The best f1 model is {best_model}, with a f1 score: {best_f1:.4} in {best_time:.6} seconds with {train_size:,} samples in train dataset\")\n",
@@ -343,9 +337,9 @@
  "metadata": {
   "hide_input": false,
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3.7.6 64-bit ('general': venv)",
    "language": "python",
-   "name": "python3"
+   "name": "python37664bitgeneralvenvfbd0a23e74cf4e778460f5ffc6761f39"
   },
   "language_info": {
    "codemirror_mode": {
@@ -357,7 +351,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.6"
+   "version": "3.7.6-final"
   },
   "toc": {
    "base_numbering": 1,
@@ -411,4 +405,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 4
-}
+}
\ No newline at end of file
diff --git a/odte/Odte.py b/odte/Odte.py
index 27846df..d9546a1 100644
--- a/odte/Odte.py
+++ b/odte/Odte.py
@@ -32,7 +32,6 @@ class Odte(BaseEstimator, ClassifierMixin):
         max_iter: int = 1000,
         max_depth: int = None,
         min_samples_split: int = 0,
-        bootstrap: bool = True,
         split_criteria: str = "min_distance",
         criterion: str = "gini",
         tol: float = 1e-4,
@@ -44,10 +43,9 @@ class Odte(BaseEstimator, ClassifierMixin):
         splitter: str = "random",
     ):
         self.n_estimators = n_estimators
-        self.bootstrap = bootstrap
         self.random_state = random_state
         self.max_features = max_features
-        self.max_samples = max_samples
+        self.max_samples = max_samples  # size of bootstrap
         self.estimator_params = dict(
             C=C,
             random_state=random_state,
@@ -70,8 +68,9 @@ class Odte(BaseEstimator, ClassifierMixin):
         else:
             return np.random.RandomState(self.random_state)
 
+    @staticmethod
     def _initialize_sample_weight(
-        self, sample_weight: np.array, n_samples: int
+        sample_weight: np.array, n_samples: int
     ) -> np.array:
         if sample_weight is None:
             return np.ones((n_samples,), dtype=np.float64)
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..0875685
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,36 @@
+import setuptools
+
+__version__ = "0.1.0"  # passed to setuptools.setup() as version
+__author__ = "Ricardo Montañana Gómez"  # passed to setup() as author
+
+
+def readme():  # returns the README.md text used as the long description
+    with open("README.md", encoding="utf-8") as f:  # not locale-dependent
+        return f.read()
+
+
+setuptools.setup(
+    name="Odte",
+    version=__version__,
+    license="MIT License",
+    description="Oblique decision tree Ensemble",
+    long_description=readme(),
+    long_description_content_type="text/markdown",
+    packages=setuptools.find_packages(),
+    url="https://github.com/doctorado-ml/odte",  # this project's repository
+    author=__author__,
+    author_email="ricardo.montanana@alu.uclm.es",
+    keywords="scikit-learn oblique-classifier oblique-decision-tree "
+    "decision-tree ensemble svm svc",  # adjacent literals avoid stray spaces
+    classifiers=[
+        "Development Status :: 4 - Beta",
+        "License :: OSI Approved :: MIT License",
+        "Programming Language :: Python :: 3.7",
+        "Natural Language :: English",
+        "Topic :: Scientific/Engineering :: Artificial Intelligence",
+        "Intended Audience :: Science/Research",
+    ],
+    install_requires=["scikit-learn>=0.23.0", "numpy", "ipympl", "stree"],
+    test_suite="odte.tests",
+    zip_safe=False,
+)