Añade Classification Report

2025-02-18 00:15:12 +01:00
parent eb136549ca
commit ee3b3600f6
1 changed files with 54 additions and 15 deletions
--- a/classifier.ipynb
+++ b/classifier.ipynb
@@ -8,9 +8,8 @@
   "source": [
    "import os\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
-    "from sklearn.model_selection import cross_validate, StratifiedKFold\n",
-    "from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay\n",
-    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.model_selection import cross_validate, StratifiedKFold, train_test_split\n",
+    "from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "random_state = 19\n",
@@ -34,27 +33,43 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
+      "Column Timestamp is not useful, dropping it\n",
      "Column Analog Channel 0 has only one unique value, dropping it\n",
      "Column Analog Channel 1 has only one unique value, dropping it\n",
      "Column Analog Channel 2 has only one unique value, dropping it\n",
      "Column Marker Channel has only one unique value, dropping it\n",
+      "----------------------------------------------------------------------\n",
      "X shape: (164078, 24)\n",
-      "y shape: (164078,)\n"
+      "y shape: (164078,)\n",
+      "Labels distribution\n",
+      "===================\n",
+      "Movement\n",
+      "none    44.742744\n",
+      "i       30.171016\n",
+      "d       25.086239\n",
+      "Name: proportion, dtype: float64\n"
     ]
    }
   ],
   "source": [
    "dataset = pd.read_csv(os.path.join(\"csv\", 'openbci.csv'))\n",
    "# Clean the dataset\n",
+    "print(\"Column Timestamp is not useful, dropping it\")\n",
    "dataset = dataset.drop(columns=['Timestamp'])\n",
    "for column in dataset.columns:\n",
    "    if dataset[column].unique().size == 1:\n",
    "        print(f\"Column {column} has only one unique value, dropping it\")\n",
    "        dataset = dataset.drop(columns=[column])\n",
+    "print(\"-\"*70)\n",
+    "# Separate the features (every column but the last) from the labels (last column)\n",
    "X = dataset.iloc[:, :-1].values\n",
    "y = dataset.iloc[:,-1].values\n",
+    "# Show the characteristics of the dataset\n",
    "print(f\"X shape: {X.shape}\")\n",
-    "print(f\"y shape: {y.shape}\")"
+    "print(f\"y shape: {y.shape}\")\n",
+    "print(\"Labels distribution\")\n",
+    "print(\"===================\")\n",
+    "print(dataset.Movement.value_counts(normalize=True)*100)"
   ]
  },
  {
@@ -79,7 +94,6 @@
   ],
   "source": [
    "clf = RandomForestClassifier(n_estimators=estimators, n_jobs=-1, random_state=random_state)\n",
-    "clf.fit(X,y)\n",
    "cv = StratifiedKFold(n_splits=splits, shuffle=True, random_state=random_state)\n",
    "scores = cross_validate(clf, X, y, scoring=\"accuracy\", cv=cv, n_jobs=-1, return_train_score=True)\n",
    "print(f\"Accuracy: {np.mean(scores['test_score'])} (+/- {np.std(scores['test_score'])})\")"
@@ -89,7 +103,7 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "# Confusion Matrix"
+    "# Classification Report"
   ]
  },
  {
@@ -101,16 +115,45 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Accuracy on test set: 0.9939358849341785\n"
+      "              precision    recall  f1-score   support\n",
+      "\n",
+      "           d  0.9923609 0.9941691 0.9932642      8232\n",
+      "           i  0.9961570 0.9948490 0.9955026      9901\n",
+      "        none  0.9933247 0.9931894 0.9932570     14683\n",
+      "\n",
+      "    accuracy                      0.9939359     32816\n",
+      "   macro avg  0.9939475 0.9940692 0.9940079     32816\n",
+      "weighted avg  0.9939374 0.9939359 0.9939363     32816\n",
+      "\n"
     ]
-    },
+    }
+   ],
+   "source": [
+    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=random_state, stratify=y)\n",
+    "clf.fit(X_train, y_train)\n",
+    "y_pred = clf.predict(X_test)\n",
+    "print(classification_report(y_test, y_pred, digits=7))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Confusion Matrix"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
    {
     "data": {
      "text/plain": [
-       "<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x7f510ffb1dd0>"
+       "<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x7f62df7a5cd0>"
      ]
     },
-     "execution_count": 4,
+     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    },
@@ -126,10 +169,6 @@
    }
   ],
   "source": [
-    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=random_state, stratify=y)\n",
-    "clf.fit(X_train, y_train)\n",
-    "y_pred = clf.predict(X_test)\n",
-    "print(f\"Accuracy on test set: {np.mean(y_test == y_pred)}\")\n",
    "cm = confusion_matrix(y_test, y_pred)\n",
    "disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=clf.classes_)\n",
    "disp.plot()"