From e8b04f41e47df923a7d9cf7b1db527db1bc6bfb0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?= <rmontanana@gmail.com>
Date: Sun, 29 Nov 2020 23:21:46 +0100
Subject: [PATCH] Add server jobs control scripts

---
 core.txt                       |  22 ++++
 datasets.txt                   |  75 +++++++++-----
 experimentation/.myconfig.dist |   5 +
 hardCore.txt                   |   6 ++
 kite_tutorial.ipynb            | 182 ---------------------------------
 scripts/.gitignore             |   3 +
 scripts/experiment.template    |  16 +++
 scripts/genall.sh              |   9 ++
 scripts/genjobs.sh             |  32 ++++++
 scripts/interactive.sh         |   2 +
 scripts/launchsome.sh          |  40 ++++++++
 scripts/notebook.sh            |   4 +
 scripts/rmscripts.sh           |   4 +
 test_arff.py                   |  48 ---------
 14 files changed, 194 insertions(+), 254 deletions(-)
 create mode 100644 core.txt
 create mode 100644 experimentation/.myconfig.dist
 create mode 100644 hardCore.txt
 delete mode 100644 kite_tutorial.ipynb
 create mode 100644 scripts/.gitignore
 create mode 100644 scripts/experiment.template
 create mode 100755 scripts/genall.sh
 create mode 100755 scripts/genjobs.sh
 create mode 100755 scripts/interactive.sh
 create mode 100755 scripts/launchsome.sh
 create mode 100755 scripts/notebook.sh
 create mode 100755 scripts/rmscripts.sh
 delete mode 100644 test_arff.py

diff --git a/core.txt b/core.txt
new file mode 100644
index 0000000..4894912
--- /dev/null
+++ b/core.txt
@@ -0,0 +1,22 @@
+breast-cancer-wisc-diag
+balance-scale
+breast-cancer-wisc-prog
+cardiotocography-10clases
+cardiotocography-3clases
+cylinder-bands
+dermatology
+ilpd-indian-liver
+ionosphere
+led-display
+mammographic
+oocytes_merluccius_nucleus_4d
+oocytes_merluccius_states_2f
+oocytes_trisopterus_nucleus_2f
+oocytes_trisopterus_states_5b
+pima
+statlog-australian-credit
+statlog-german-credit
+statlog-image
+statlog-vehicle
+tic-tac-toe
+zoo
\ No newline at end of file
diff --git a/datasets.txt b/datasets.txt
index 8be5fc4..54dcfb9 100644
--- a/datasets.txt
+++ b/datasets.txt
@@ -1,24 +1,51 @@
-(#) Dataset   Samples  Feat. #Cl. y typ X type  f_type
-=== ========= ======== ===== ==== ===== ======= ======
-#01 breast         683     9    2 int16 int16   csv
-#02 cardiotoc    2,126    41    3 int16 int16   csv
-#03 cod-rna    331,152     8    2 int16 float16 sparse
-#04 connect4    67,557   126    3 int16 int16   sparse
-#05 covtype    581,012    54    7 int16 int16   npz
-#06 diabetes       768     8    2 int16 float16 csv
-#07 dna          3,186   180    3 int16 float16 csv
-#08 fourclass      862     2    2 int16 int16   sparse
-#09 glass          214     9    6 int16 float16 csv
-#10 heart          270    13    2 int16 float16 csv
-#11 ijcnn1     141,691    22    2 int16 float16 sparse
-#12 iris           150     4    3 int16 float16 csv
-#13 letter      20,000    16   26 int16 int16   npz
-#14 mnist       70,000   784   10 int16 int16   npy
-#15 pendigits   10,992    16   10 int16 int16   npy
-#16 protein     24,387   357    3 int16 float16 sparse
-#17 satimage     6,435    36    6 int16 int16   npy
-#18 segment      2,310    19    7 int16 float16 sparse
-#19 shuttle     58,000     9    7 int16 int16   npy
-#20 usps         9,298   256   10 int16 float16 npz
-#21 vehicle        846    18    4 int16 float16 sparse
-#22 wine           178    13    3 int16 float16 csv
\ No newline at end of file
+(#) Dataset                       Samples  Feat. #Cl. y typ X type  f_type
+=== ============================= ======== ===== ==== ===== ======= ======
+#01 balance-scale                      625     4    3 int64 float64 Rdat
+#02 balloons                            16     4    2 int64 float64 Rdat
+#03 breast-cancer-wisc-diag            569    30    2 int64 float64 Rdat
+#04 breast-cancer-wisc-prog            198    33    2 int64 float64 Rdat
+#05 breast-cancer-wisc                 699     9    2 int64 float64 Rdat
+#06 breast-cancer                      286     9    2 int64 float64 Rdat
+#07 cardiotocography-10clases        2,126    21   10 int64 float64 Rdat
+#08 cardiotocography-3clases         2,126    21    3 int64 float64 Rdat
+#09 conn-bench-sonar-mines-rocks       208    60    2 int64 float64 Rdat
+#10 cylinder-bands                     512    35    2 int64 float64 Rdat
+#11 dermatology                        366    34    6 int64 float64 Rdat
+#12 echocardiogram                     131    10    2 int64 float64 Rdat
+#13 fertility                          100     9    2 int64 float64 Rdat
+#14 haberman-survival                  306     3    2 int64 float64 Rdat
+#15 heart-hungarian                    294    12    2 int64 float64 Rdat
+#16 hepatitis                          155    19    2 int64 float64 Rdat
+#17 ilpd-indian-liver                  583     9    2 int64 float64 Rdat
+#18 ionosphere                         351    33    2 int64 float64 Rdat
+#19 iris                               150     4    3 int64 float64 Rdat
+#20 led-display                      1,000     7   10 int64 float64 Rdat
+#21 libras                             360    90   15 int64 float64 Rdat
+#22 low-res-spect                      531   100    9 int64 float64 Rdat
+#23 lymphography                       148    18    4 int64 float64 Rdat
+#24 mammographic                       961     5    2 int64 float64 Rdat
+#25 molec-biol-promoter                106    57    2 int64 float64 Rdat
+#26 musk-1                             476   166    2 int64 float64 Rdat
+#27 oocytes_merluccius_nucleus_4d    1,022    41    2 int64 float64 Rdat
+#28 oocytes_merluccius_states_2f     1,022    25    3 int64 float64 Rdat
+#29 oocytes_trisopterus_nucleus_2f     912    25    2 int64 float64 Rdat
+#30 oocytes_trisopterus_states_5b      912    32    3 int64 float64 Rdat
+#31 parkinsons                         195    22    2 int64 float64 Rdat
+#32 pima                               768     8    2 int64 float64 Rdat
+#33 pittsburg-bridges-MATERIAL         106     7    3 int64 float64 Rdat
+#34 pittsburg-bridges-REL-L            103     7    3 int64 float64 Rdat
+#35 pittsburg-bridges-SPAN              92     7    3 int64 float64 Rdat
+#36 pittsburg-bridges-T-OR-D           102     7    2 int64 float64 Rdat
+#37 planning                           182    12    2 int64 float64 Rdat
+#38 post-operative                      90     8    3 int64 float64 Rdat
+#39 seeds                              210     7    3 int64 float64 Rdat
+#40 statlog-australian-credit          690    14    2 int64 float64 Rdat
+#41 statlog-german-credit            1,000    24    2 int64 float64 Rdat
+#42 statlog-heart                      270    13    2 int64 float64 Rdat
+#43 statlog-image                    2,310    18    7 int64 float64 Rdat
+#44 statlog-vehicle                    846    18    4 int64 float64 Rdat
+#45 synthetic-control                  600    60    6 int64 float64 Rdat
+#46 tic-tac-toe                        958     9    2 int64 float64 Rdat
+#47 vertebral-column-2clases           310     6    2 int64 float64 Rdat
+#48 wine                               178    13    3 int64 float64 Rdat
+#49 zoo                                101    16    7 int64 float64 Rdat
\ No newline at end of file
diff --git a/experimentation/.myconfig.dist b/experimentation/.myconfig.dist
new file mode 100644
index 0000000..6246c89
--- /dev/null
+++ b/experimentation/.myconfig.dist
@@ -0,0 +1,5 @@
+host=<server>
+port=3306
+user=stree
+password=<password>
+database=stree_experiments
\ No newline at end of file
diff --git a/hardCore.txt b/hardCore.txt
new file mode 100644
index 0000000..68cc89d
--- /dev/null
+++ b/hardCore.txt
@@ -0,0 +1,6 @@
+molec-biol-promoter
+musk-1
+conn-bench-sonar-mines-rocks
+libras
+low-res-spect
+synthetic-control
\ No newline at end of file
diff --git a/kite_tutorial.ipynb b/kite_tutorial.ipynb
deleted file mode 100644
index ddc3764..0000000
--- a/kite_tutorial.ipynb
+++ /dev/null
@@ -1,182 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "![Logo](https://kite.com/kite-public/kite-plus-jlab-scaled.png)\n",
-    "\n",
-    "### Welcome to Kite's JupyterLab extension tutorial\n",
-    "\n",
-    "Kite gives you **ML-powered autocompletions** and **rich documentation** inside JupyterLab. This guide will teach you everything you need to know about Kite in 5 minutes or less.\n",
-    "\n",
-    "> 💡 _**Tip:** You can open this file at any time with the command `Kite: Open Tutorial` in JupyterLab's command palette._\n",
-    "\n",
-    "#### Before we start...\n",
-    "\n",
-    "Make sure that the Kite icon at the bottom of the window reads `Kite: ready`.\n",
-    "\n",
-    "![Kite icon](https://kite.com/kite-public/kite-status.png)\n",
-    "\n",
-    "* If it says `Kite: not running`, please start the Kite Engine first.\n",
-    "* If it says `Kite: not installed`, please [download and install Kite](https://kite.com/download) first."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "#### Part 1: Autocompletions\n",
-    "\n",
-    "**Step 1a**<br/>\n",
-    "Run the code cell below with all the necessary imports 👇"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Run me!\n",
-    "import matplotlib.pyplot as plt\n",
-    "import numpy as np\n",
-    "import pandas as pd"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "**Step 1b**<br/>\n",
-    "Let's try typing out some code to plot a sine graph. As you type, Kite will automatically show you completions for what you're going to type next.\n",
-    "\n",
-    "![Autocompletions](https://www.kite.com/kite-public/kite-jlab-autocompletions.gif)\n",
-    "\n",
-    "> 💡 _**Tip:** You can turn completions docs on and off in JupyterLab's command palette with the command `Kite: Toggle Docs Panel`._\n",
-    "\n",
-    "> 💡 _**Tip:** The starred completions ★ are from Kite Pro. You can [start your free Kite Pro trial](https://www.kite.com/pro/trial/) anytime. Afterwards, if you choose not to upgrade, you can still use Kite 100% for free._\n",
-    "\n",
-    "Try typing out the code yourself to see Kite's autocompletions in action.<br/>\n",
-    "\n",
-    "```python\n",
-    "x = np.linspace(-np.pi, np.pi, 50)\n",
-    "y = np.sin(x)\n",
-    "plt.plot(x, y)\n",
-    "```\n",
-    "\n",
-    "Type this code in the cell below 👇"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Put code in me\n",
-    "\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "#### Part 2: Manual completions\n",
-    "\n",
-    "You can still use JupyterLab's builtin kernel completions. These are particularly useful when you need to access a `DataFrame`'s column names.\n",
-    "\n",
-    "**Step 2a**<br/>\n",
-    "First, run the code cell below to get some sample data to store in a `DataFrame` 👇"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Run me!\n",
-    "url = 'https://kite.com/kite-public/iris.csv'\n",
-    "df = pd.read_csv(url)\n",
-    "df.head()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "**Step 2b**<br/>\n",
-    "Let's plot a scatter graph of sepal length vs. sepal width. When you are accessing a `DataFrame`'s columns, you'll still need to hit `tab` to request completions from the kernel.\n",
-    "\n",
-    "![Manual completions](https://www.kite.com/kite-public/kite-jlab-manual-completions.gif)\n",
-    "\n",
-    "Try requesting kernel completions yourself.\n",
-    "\n",
-    "```python\n",
-    "plt.scatter(df['sepal_length'], df['sepal_width'])\n",
-    "```\n",
-    "\n",
-    "Type this code in the cell below, making sure to hit `tab` when you are filling in the column names 👇"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Put code in me\n",
-    "\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "#### Part 3: Copilot Documentation\n",
-    "\n",
-    "If you've enabled \"docs following cursor\" in the Copilot, the Copilot will automatically update with the documentation of the identifier underneath your cursor.\n",
-    "\n",
-    "![Autosearch](https://www.kite.com/kite-public/kite-jlab-autosearch.gif)\n",
-    "\n",
-    "**Step 3a**<br/>\n",
-    "Try it yourself! Just click around in the code cells of this notebook and see the Copilot update automatically."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "#### The End\n",
-    "\n",
-    "Now you know everything you need to know about Kite's JupyterLab plugin. Kite is under active development and we expect to ship improvements and more features in the near future.\n",
-    "\n",
-    "In the meantime, if you experience bugs or have feature requests, feel free to open an issue in our [public GitHub repo](https://github.com/kiteco/issue-tracker).\n",
-    "\n",
-    "Happy coding!"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.8.2"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
diff --git a/scripts/.gitignore b/scripts/.gitignore
new file mode 100644
index 0000000..f68c005
--- /dev/null
+++ b/scripts/.gitignore
@@ -0,0 +1,3 @@
+gridsearch/*
+gridbest/*
+cross/*
\ No newline at end of file
diff --git a/scripts/experiment.template b/scripts/experiment.template
new file mode 100644
index 0000000..a8a4311
--- /dev/null
+++ b/scripts/experiment.template
@@ -0,0 +1,16 @@
+#!/bin/bash
+### Job name
+#PBS -N <experiment>-<data>-<model>-<kernel>
+### Maximum wall-clock time for the job
+#PBS -l walltime=96:00:00
+### Job queue selection
+#PBS -q workq
+### Merge stderr into the main output
+#PBS -j oe
+### Resources
+#PBS -l select=2:ncpus=2:mem=16Gb
+### Export environment variables
+#PBS -V
+### Executable and its parameters (stop if the project folder cannot be entered)
+cd "<folder>" || exit 1
+python experiment.py -H galgo -e <experiment> -m <model> -d <data> -S tanveer -k <kernel> -n 1
\ No newline at end of file
diff --git a/scripts/genall.sh b/scripts/genall.sh
new file mode 100755
index 0000000..a687db8
--- /dev/null
+++ b/scripts/genall.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+# Build the job scripts for every experiment / model / kernel combination.
+# Run it from the folder that holds genjobs.sh: it is called as ./genjobs.sh.
+for experiment in gridsearch gridbest cross; do
+	echo "*** Building $experiment experiments"
+	for model in stree odte bagging adaBoost; do
+		for kernel in linear poly rbf; do ./genjobs.sh "$experiment" "$model" "$kernel"; done
+	done
+done
\ No newline at end of file
diff --git a/scripts/genjobs.sh b/scripts/genjobs.sh
new file mode 100755
index 0000000..25782ea
--- /dev/null
+++ b/scripts/genjobs.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+# Generate one PBS job script per dataset listed in ../datasets.txt.
+# Usage: ./genjobs.sh <experiment> <model> <kernel>   (run from the scripts/ folder)
+# Output: <experiment>/<model>/<kernel>/experiment_<dataset>.sh, a copy of
+# experiment.template with its <...> placeholders filled in.
+if [ -z "$1" ] || [ -z "$2" ] || [ -z "$3" ]; then
+	echo "Hay que seleccionar:"
+	echo " - el tipo de experimento {gridsearch, gridbest, cross}"
+	echo " - el modelo {stree, adaBoost, bagging, odte}"
+	echo " - el kernel {linear, poly, rbf, any}"
+	exit 1
+fi
+# Exact matches only: a substring test would also accept e.g. "grid" or "s".
+if [[ ! "$1" =~ ^(gridsearch|gridbest|cross)$ ]]; then
+	echo "Hay que seleccionar el tipo de experimento {gridsearch, gridbest, cross}"
+	exit 1
+elif [[ ! "$2" =~ ^(stree|adaBoost|bagging|odte)$ ]]; then
+	echo "Hay que seleccionar el modelo {stree, adaBoost, bagging, odte}"
+	exit 1
+elif [[ ! "$3" =~ ^(linear|poly|rbf|any)$ ]]; then
+	echo "Hay que seleccionar el kernel {linear, poly, rbf, any}"
+	exit 1
+fi
+[ -r experiment.template ] || { echo "No se encuentra experiment.template" >&2; exit 1; }
+out_dir="$1/$2/$3"; mkdir -p "$out_dir" || exit 1
+export experiment="$1" model="$2" kernel="$3" folder="$(pwd)/.."
+# Drop the table header by pattern instead of assuming exactly 49 dataset rows.
+grep -Ev '^(\(#\)|===)' ../datasets.txt | cut -d " " -f 2 | while read -r data; do
+	# Values reach perl through %ENV, so their contents are never re-interpolated.
+	data="$data" perl -pe 's/<(experiment|model|kernel|folder|data)>/$ENV{$1}/g' \
+		experiment.template > "$out_dir/experiment_$data.sh"
+done
\ No newline at end of file
diff --git a/scripts/interactive.sh b/scripts/interactive.sh
new file mode 100755
index 0000000..568fc25
--- /dev/null
+++ b/scripts/interactive.sh
@@ -0,0 +1,2 @@
+#!/bin/bash
+qsub -I -l select=2:ncpus=8:mem=16Gb  # -I: interactive job; -l: resource request
\ No newline at end of file
diff --git a/scripts/launchsome.sh b/scripts/launchsome.sh
new file mode 100755
index 0000000..9bf893b
--- /dev/null
+++ b/scripts/launchsome.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+# Submit (qsub) the job scripts that genjobs.sh generated for some datasets.
+# Usage: ./launchsome.sh <experiment> <model> <kernel> <names-file> [dry-run]
+if [ -z "$1" ] || [ -z "$2" ] || [ -z "$3" ] || [ -z "$4" ]; then
+	echo "Hay que seleccionar:"
+	echo " - el tipo de experimento {gridsearch, gridbest, cross}"
+	echo " - el modelo {stree, adaBoost, bagging, odte}"
+	echo " - el kernel {linear, poly, rbf, any}"
+	echo " - el archivo con nombres de datasets"
+	echo "opcionalmente al final: dry-run"
+	exit 1
+fi
+# Exact matches only: a substring test would also accept e.g. "grid" or "s".
+if [[ ! "$1" =~ ^(gridsearch|gridbest|cross)$ ]]; then
+	echo "Hay que seleccionar el tipo de experimento {gridsearch, gridbest, cross}"
+	exit 1
+elif [[ ! "$2" =~ ^(stree|adaBoost|bagging|odte)$ ]]; then
+	echo "Hay que seleccionar el modelo {stree, adaBoost, bagging, odte}"
+	exit 1
+elif [[ ! "$3" =~ ^(linear|poly|rbf|any)$ ]]; then
+	echo "Hay que seleccionar el kernel {linear, poly, rbf, any}"
+	exit 1
+fi
+# Read the names before changing directory so relative and absolute paths both work.
+# Header rows of a datasets.txt-style table are dropped by pattern, not by a fixed count.
+names="$(grep -Ev '^(\(#\)|===)' "$4" | cut -d " " -f 2)"
+# Stop if the job folder is missing instead of calling qsub from the wrong place.
+cd "$1/$2/$3" || exit 1
+counter=0
+for a in $names; do
+	echo "launch experiment_$a.sh"
+	if [ "$5" = "dry-run" ]; then
+		echo "not launched"
+	else
+		qsub "experiment_$a.sh"
+	fi
+	counter=$((counter + 1))
+done
+[ "$5" = "dry-run" ] && prefix="Not launched" || prefix="Launched"
+echo "$prefix $counter jobs"
\ No newline at end of file
diff --git a/scripts/notebook.sh b/scripts/notebook.sh
new file mode 100755
index 0000000..65c658d
--- /dev/null
+++ b/scripts/notebook.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+NOTEBOOKPORT="${NOTEBOOKPORT:-1234}"  # port for tunnel and Jupyter; export NOTEBOOKPORT to override 1234
+ssh -N -f -R "$NOTEBOOKPORT:localhost:$NOTEBOOKPORT" Ricardo.Montanana@galgo.uclm.es  # background reverse tunnel
+jupyter lab --port="$NOTEBOOKPORT" --no-browser  # serve JupyterLab on the tunnelled port
\ No newline at end of file
diff --git a/scripts/rmscripts.sh b/scripts/rmscripts.sh
new file mode 100755
index 0000000..76ae1d3
--- /dev/null
+++ b/scripts/rmscripts.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+# Delete every regular file under the three experiment folders (paths are
+# relative to the current directory); the directories themselves are kept.
+find gridsearch gridbest cross -type f -exec rm {} +
\ No newline at end of file
diff --git a/test_arff.py b/test_arff.py
deleted file mode 100644
index eab873c..0000000
--- a/test_arff.py
+++ /dev/null
@@ -1,48 +0,0 @@
-import os
-import time
-import numpy as np
-import pandas as pd
-from scipy.io import arff
-from stree import Stree
-from sklearn.model_selection import cross_val_score
-from sklearn.model_selection import train_test_split
-
-folder = (
-    "/Volumes/Datos/OneDrive - Universidad de Castilla-La Mancha/"
-    "Doctorado2019/Compartida/FuentesDescargados/data-4/"
-)
-
-name = "yeast"
-random_state = 1
-
-file_name = os.path.join(folder, name, f"{name}.arff")
-data, meta = arff.loadarff(file_name)
-df = pd.DataFrame(data)
-y = df["clase"].to_numpy().astype(np.int16)
-df.drop(columns="clase", inplace=True)
-X = df.to_numpy().astype(np.float16)
-print(f"Xshape {X.shape} Xtype {X.dtype}")
-print(f"yshape {y.shape} ytype {y.dtype}")
-
-X_train, X_test, y_train, y_test = train_test_split(
-    X, y, test_size=0.3, random_state=random_state
-)
-
-clf = Stree(
-    random_state=random_state,
-    C=1e5,
-    max_iter=1e5,
-    kernel="poly",
-    degree=5,
-    gamma=0.8,
-)
-now = time.time()
-scores = cross_val_score(clf, X, y, cv=5)
-print(f"Accuracy for {name}: {scores.mean():.2f} (+/- {scores.std() * 2:.2f})")
-print(f"Took : {time.time() - now:.2f} seconds")
-print(f"Score one tree all samples .: {clf.fit(X, y).score(X, y):.4f}")
-print(
-    f"Score one tree train/test .: "
-    f"{clf.fit(X_train, y_train).score(X_test, y_test):.4f}"
-)
-print("*" * 80)