From ee3b3600f67b319db30ff9cbf69045e3aa728726 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana=20G=C3=B3mez?=
 <rmontanana@gmail.com>
Date: Tue, 18 Feb 2025 00:15:12 +0100
Subject: [PATCH] =?UTF-8?q?A=C3=B1ade=20Classification=20Report?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 classifier.ipynb | 69 +++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 54 insertions(+), 15 deletions(-)

diff --git a/classifier.ipynb b/classifier.ipynb
index 68581c6..fa91c7e 100644
--- a/classifier.ipynb
+++ b/classifier.ipynb
@@ -8,9 +8,8 @@
    "source": [
     "import os\n",
     "from sklearn.ensemble import RandomForestClassifier\n",
-    "from sklearn.model_selection import cross_validate, StratifiedKFold\n",
-    "from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay\n",
-    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.model_selection import cross_validate, StratifiedKFold, train_test_split\n",
+    "from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report\n",
     "import pandas as pd\n",
     "import numpy as np\n",
     "random_state = 19\n",
@@ -34,27 +33,43 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
+      "Column Timestamp is not useful, dropping it\n",
       "Column Analog Channel 0 has only one unique value, dropping it\n",
       "Column Analog Channel 1 has only one unique value, dropping it\n",
       "Column Analog Channel 2 has only one unique value, dropping it\n",
       "Column Marker Channel has only one unique value, dropping it\n",
+      "----------------------------------------------------------------------\n",
       "X shape: (164078, 24)\n",
-      "y shape: (164078,)\n"
+      "y shape: (164078,)\n",
+      "Labels distribution\n",
+      "===================\n",
+      "Movement\n",
+      "none    44.742744\n",
+      "i       30.171016\n",
+      "d       25.086239\n",
+      "Name: proportion, dtype: float64\n"
      ]
     }
    ],
    "source": [
     "dataset = pd.read_csv(os.path.join(\"csv\", 'openbci.csv'))\n",
     "# Clean the dataset\n",
+    "print(\"Column Timestamp is not useful, dropping it\")\n",
     "dataset = dataset.drop(columns=['Timestamp'])\n",
     "for column in dataset.columns:\n",
     "    if dataset[column].unique().size == 1:\n",
     "        print(f\"Column {column} has only one unique value, dropping it\")\n",
     "        dataset = dataset.drop(columns=[column])\n",
+    "print(\"-\"*70)\n",
+    "# Separate the features (X) from the labels (y)\n",
     "X = dataset.iloc[:, :-1].values\n",
     "y = dataset.iloc[:,-1].values\n",
+    "# Show the characteristics of the dataset\n",
     "print(f\"X shape: {X.shape}\")\n",
-    "print(f\"y shape: {y.shape}\")"
+    "print(f\"y shape: {y.shape}\")\n",
+    "print(\"Labels distribution\")\n",
+    "print(\"===================\")\n",
+    "print(dataset.Movement.value_counts(normalize=True)*100)"
    ]
   },
   {
@@ -79,7 +94,6 @@
    ],
    "source": [
     "clf = RandomForestClassifier(n_estimators=estimators, n_jobs=-1, random_state=random_state)\n",
-    "clf.fit(X,y)\n",
     "cv = StratifiedKFold(n_splits=splits, shuffle=True, random_state=random_state)\n",
     "scores = cross_validate(clf, X, y, scoring=\"accuracy\", cv=cv, n_jobs=-1, return_train_score=True)\n",
     "print(f\"Accuracy: {np.mean(scores['test_score'])} (+/- {np.std(scores['test_score'])})\")"
@@ -89,7 +103,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# Confusion Matrix"
+    "# Classification Report"
    ]
   },
   {
@@ -101,16 +115,45 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Accuracy on test set: 0.9939358849341785\n"
+      "              precision    recall  f1-score   support\n",
+      "\n",
+      "           d  0.9923609 0.9941691 0.9932642      8232\n",
+      "           i  0.9961570 0.9948490 0.9955026      9901\n",
+      "        none  0.9933247 0.9931894 0.9932570     14683\n",
+      "\n",
+      "    accuracy                      0.9939359     32816\n",
+      "   macro avg  0.9939475 0.9940692 0.9940079     32816\n",
+      "weighted avg  0.9939374 0.9939359 0.9939363     32816\n",
+      "\n"
      ]
-    },
+    }
+   ],
+   "source": [
+    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=random_state, stratify=y)\n",
+    "clf.fit(X_train, y_train)\n",
+    "y_pred = clf.predict(X_test)\n",
+    "print(classification_report(y_test, y_pred, digits=7))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Confusion Matrix"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
     {
      "data": {
       "text/plain": [
-       "<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x7f510ffb1dd0>"
+       "<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x7f62df7a5cd0>"
       ]
      },
-     "execution_count": 4,
+     "execution_count": 5,
      "metadata": {},
      "output_type": "execute_result"
     },
@@ -126,10 +169,6 @@
     }
    ],
    "source": [
-    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=random_state, stratify=y)\n",
-    "clf.fit(X_train, y_train)\n",
-    "y_pred = clf.predict(X_test)\n",
-    "print(f\"Accuracy on test set: {np.mean(y_test == y_pred)}\")\n",
     "cm = confusion_matrix(y_test, y_pred)\n",
     "disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=clf.classes_)\n",
     "disp.plot()"