From 580c93d92aafbb3b0e7370aee90d0a3d10496e5b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?= Date: Tue, 16 Jun 2020 13:59:17 +0200 Subject: [PATCH] Add setup.py remove boostrap boolean parameter --- notebooks/benchmark.ipynb | 104 ++++++++++++++++++-------------------- odte/Odte.py | 7 ++- setup.py | 36 +++++++++++++ 3 files changed, 88 insertions(+), 59 deletions(-) create mode 100644 setup.py diff --git a/notebooks/benchmark.ipynb b/notebooks/benchmark.ipynb index efcd5b8..6d7bb87 100644 --- a/notebooks/benchmark.ipynb +++ b/notebooks/benchmark.ipynb @@ -4,7 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Compare STree with different estimators" + "# Compare Odte with different estimators" ] }, { @@ -12,24 +12,25 @@ "metadata": {}, "source": [ "# Setup\n", - "Uncomment the next cell if STree is not already installed" + "Uncomment the next cell if Odte is not already installed" ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "#\n", "# Google Colab setup\n", "#\n", + "#!pip install git+https://github.com/doctorado-ml/odte\n", "#!pip install git+https://github.com/doctorado-ml/stree" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 20, "metadata": {}, "outputs": [], "source": [ @@ -41,12 +42,12 @@ "from sklearn.metrics import classification_report, confusion_matrix, f1_score\n", "from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier\n", "from stree import Stree\n", - "from odte import" + "from odte import Odte" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ @@ -65,15 +66,13 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 22, "metadata": {}, "outputs": [ { - "name": "stdout", "output_type": "stream", - "text": [ - "2020-06-15 00:46:56\n" - ] + "name": "stdout", + "text": "2020-06-15 
11:44:45\n" } ], "source": [ @@ -89,7 +88,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 23, "metadata": {}, "outputs": [], "source": [ @@ -101,16 +100,13 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 24, "metadata": {}, "outputs": [ { - "name": "stdout", "output_type": "stream", - "text": [ - "Fraud: 0.173% 492\n", - "Valid: 99.827% 284,315\n" - ] + "name": "stdout", + "text": "Fraud: 0.173% 492\nValid: 99.827% 284,315\n" } ], "source": [ @@ -120,7 +116,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ @@ -132,16 +128,13 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 26, "metadata": {}, "outputs": [ { - "name": "stdout", "output_type": "stream", - "text": [ - "X shape: (284807, 29)\n", - "y shape: (284807,)\n" - ] + "name": "stdout", + "text": "X shape: (284807, 29)\ny shape: (284807,)\n" } ], "source": [ @@ -160,7 +153,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 27, "metadata": {}, "outputs": [], "source": [ @@ -171,7 +164,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 28, "metadata": {}, "outputs": [], "source": [ @@ -181,7 +174,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 29, "metadata": {}, "outputs": [], "source": [ @@ -191,7 +184,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 30, "metadata": {}, "outputs": [], "source": [ @@ -201,7 +194,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 31, "metadata": {}, "outputs": [], "source": [ @@ -211,7 +204,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 32, "metadata": {}, "outputs": [], "source": [ @@ -221,23 +214,12 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 33, "metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'Odte' is not 
defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0modte\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mOdte\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrandom_state\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mrandom_state\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;31mNameError\u001b[0m: name 'Odte' is not defined" - ] - } - ], + "outputs": [], "source": [ - "odte = Odte(random_state=random_state)" + "# Oblique Decision Tree Ensemble\n", + "odte = Odte(random_state=random_state, n_estimators=10, max_features=None)" ] }, { @@ -249,7 +231,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 34, "metadata": {}, "outputs": [], "source": [ @@ -274,9 +256,15 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 35, "metadata": {}, - "outputs": [], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "************************** Linear Tree **********************\nTrain Model Linear Tree took: 14.78 seconds\n=========== Linear Tree - Train 199,364 samples =============\n precision recall f1-score support\n\n 0 1.000000 1.000000 1.000000 199020\n 1 1.000000 1.000000 1.000000 344\n\n accuracy 1.000000 199364\n macro avg 1.000000 1.000000 1.000000 199364\nweighted avg 1.000000 1.000000 1.000000 199364\n\n=========== Linear Tree - Test 85,443 samples =============\n precision recall f1-score support\n\n 0 0.999578 0.999613 0.999596 85295\n 1 0.772414 0.756757 0.764505 148\n\n accuracy 0.999192 85443\n macro avg 0.885996 0.878185 0.882050 85443\nweighted avg 0.999184 0.999192 0.999188 85443\n\nConfusion Matrix in Train\n[[199020 0]\n [ 0 344]]\nConfusion Matrix in Test\n[[85262 33]\n [ 36 
112]]\n************************** Random Forest **********************\nTrain Model Random Forest took: 163.9 seconds\n=========== Random Forest - Train 199,364 samples =============\n precision recall f1-score support\n\n 0 1.000000 1.000000 1.000000 199020\n 1 1.000000 1.000000 1.000000 344\n\n accuracy 1.000000 199364\n macro avg 1.000000 1.000000 1.000000 199364\nweighted avg 1.000000 1.000000 1.000000 199364\n\n=========== Random Forest - Test 85,443 samples =============\n precision recall f1-score support\n\n 0 0.999660 0.999965 0.999812 85295\n 1 0.975410 0.804054 0.881481 148\n\n accuracy 0.999625 85443\n macro avg 0.987535 0.902009 0.940647 85443\nweighted avg 0.999618 0.999625 0.999607 85443\n\nConfusion Matrix in Train\n[[199020 0]\n [ 0 344]]\nConfusion Matrix in Test\n[[85292 3]\n [ 29 119]]\n************************** Stree (SVM Tree) **********************\nTrain Model Stree (SVM Tree) took: 34.57 seconds\n=========== Stree (SVM Tree) - Train 199,364 samples =============\n precision recall f1-score support\n\n 0 0.999623 0.999864 0.999744 199020\n 1 0.908784 0.781977 0.840625 344\n\n accuracy 0.999488 199364\n macro avg 0.954204 0.890921 0.920184 199364\nweighted avg 0.999467 0.999488 0.999469 199364\n\n=========== Stree (SVM Tree) - Test 85,443 samples =============\n precision recall f1-score support\n\n 0 0.999637 0.999918 0.999777 85295\n 1 0.943548 0.790541 0.860294 148\n\n accuracy 0.999555 85443\n macro avg 0.971593 0.895229 0.930036 85443\nweighted avg 0.999540 0.999555 0.999536 85443\n\nConfusion Matrix in Train\n[[198993 27]\n [ 75 269]]\nConfusion Matrix in Test\n[[85288 7]\n [ 31 117]]\n************************** AdaBoost model **********************\nTrain Model AdaBoost model took: 44.36 seconds\n=========== AdaBoost model - Train 199,364 samples =============\n precision recall f1-score support\n\n 0 0.999392 0.999678 0.999535 199020\n 1 0.777003 0.648256 0.706815 344\n\n accuracy 0.999072 199364\n macro avg 0.888198 0.823967 
0.853175 199364\nweighted avg 0.999008 0.999072 0.999030 199364\n\n=========== AdaBoost model - Test 85,443 samples =============\n precision recall f1-score support\n\n 0 0.999484 0.999707 0.999596 85295\n 1 0.806202 0.702703 0.750903 148\n\n accuracy 0.999192 85443\n macro avg 0.902843 0.851205 0.875249 85443\nweighted avg 0.999149 0.999192 0.999165 85443\n\nConfusion Matrix in Train\n[[198956 64]\n [ 121 223]]\nConfusion Matrix in Test\n[[85270 25]\n [ 44 104]]\n************************** Odte **********************\nTrain Model Odte took: 2.134e+03 seconds\n=========== Odte - Train 199,364 samples =============\n precision recall f1-score support\n\n 0 0.999583 1.000000 0.999792 199020\n 1 1.000000 0.758721 0.862810 344\n\n accuracy 0.999584 199364\n macro avg 0.999792 0.879360 0.931301 199364\nweighted avg 0.999584 0.999584 0.999555 199364\n\n=========== Odte - Test 85,443 samples =============\n precision recall f1-score support\n\n 0 0.999543 0.999965 0.999754 85295\n 1 0.973214 0.736486 0.838462 148\n\n accuracy 0.999508 85443\n macro avg 0.986379 0.868226 0.919108 85443\nweighted avg 0.999497 0.999508 0.999474 85443\n\nConfusion Matrix in Train\n[[199020 0]\n [ 83 261]]\nConfusion Matrix in Test\n[[85292 3]\n [ 39 109]]\n" + } + ], "source": [ "# Train & Test models\n", "models = {\n", @@ -297,9 +285,15 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 36, "metadata": {}, - "outputs": [], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "**************************************************************************************************************\n*The best f1 model is Random Forest, with a f1 score: 0.8815 in 163.896 seconds with 0.7 samples in train dataset\n**************************************************************************************************************\nModel: Linear Tree\t Time: 14.78 seconds\t f1: 0.7645\nModel: Random Forest\t Time: 163.90 seconds\t f1: 0.8815\nModel: Stree (SVM 
Tree)\t Time: 34.57 seconds\t f1: 0.8603\nModel: AdaBoost model\t Time: 44.36 seconds\t f1: 0.7509\nModel: Odte\t Time: 2134.25 seconds\t f1: 0.8385\n" + } + ], "source": [ "print(\"*\"*110)\n", "print(f\"*The best f1 model is {best_model}, with a f1 score: {best_f1:.4} in {best_time:.6} seconds with {train_size:,} samples in train dataset\")\n", @@ -343,9 +337,9 @@ "metadata": { "hide_input": false, "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3.7.6 64-bit ('general': venv)", "language": "python", - "name": "python3" + "name": "python37664bitgeneralvenvfbd0a23e74cf4e778460f5ffc6761f39" }, "language_info": { "codemirror_mode": { @@ -357,7 +351,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.6" + "version": "3.7.6-final" }, "toc": { "base_numbering": 1, @@ -411,4 +405,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} +} \ No newline at end of file diff --git a/odte/Odte.py b/odte/Odte.py index 27846df..d9546a1 100644 --- a/odte/Odte.py +++ b/odte/Odte.py @@ -32,7 +32,6 @@ class Odte(BaseEstimator, ClassifierMixin): max_iter: int = 1000, max_depth: int = None, min_samples_split: int = 0, - bootstrap: bool = True, split_criteria: str = "min_distance", criterion: str = "gini", tol: float = 1e-4, @@ -44,10 +43,9 @@ class Odte(BaseEstimator, ClassifierMixin): splitter: str = "random", ): self.n_estimators = n_estimators - self.bootstrap = bootstrap self.random_state = random_state self.max_features = max_features - self.max_samples = max_samples + self.max_samples = max_samples # size of bootstrap self.estimator_params = dict( C=C, random_state=random_state, @@ -70,8 +68,9 @@ class Odte(BaseEstimator, ClassifierMixin): else: return np.random.RandomState(self.random_state) + @staticmethod def _initialize_sample_weight( - self, sample_weight: np.array, n_samples: int + sample_weight: np.array, n_samples: int ) -> np.array: if sample_weight is None: return np.ones((n_samples,), dtype=np.float64) 
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..0875685
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,42 @@
+import setuptools
+
+__version__ = "0.1.0"
+__author__ = "Ricardo Montañana Gómez"
+
+
+def readme():
+    """Return the text of README.md, used as the package long description."""
+    # Read as UTF-8 explicitly instead of relying on the locale default.
+    with open("README.md", encoding="utf-8") as f:
+        return f.read()
+
+
+setuptools.setup(
+    name="Odte",
+    version=__version__,
+    license="MIT License",
+    description="Oblique decision tree Ensemble",
+    long_description=readme(),
+    long_description_content_type="text/markdown",
+    packages=setuptools.find_packages(),
+    url="https://github.com/doctorado-ml/odte",
+    author=__author__,
+    author_email="ricardo.montanana@alu.uclm.es",
+    # Adjacent literals are concatenated; a backslash continuation inside the
+    # string would embed the next line's indentation in the keywords.
+    keywords=(
+        "scikit-learn oblique-classifier oblique-decision-tree "
+        "decision-tree ensemble svm svc"
+    ),
+    classifiers=[
+        "Development Status :: 4 - Beta",
+        "License :: OSI Approved :: MIT License",
+        "Programming Language :: Python :: 3.7",
+        "Natural Language :: English",
+        "Topic :: Scientific/Engineering :: Artificial Intelligence",
+        "Intended Audience :: Science/Research",
+    ],
+    install_requires=["scikit-learn>=0.23.0", "numpy", "ipympl", "stree"],
+    test_suite="odte.tests",
+    zip_safe=False,
+)