15 Commits

Author SHA1 Message Date
9039a634cf Exclude macos-latest with python 3.11 (no torch) 2022-11-13 22:14:01 +01:00
5b5d385b4c Fix uppercase mistake in filename 2022-11-13 20:04:26 +01:00
6ebcc31c36 Add bayesclass to requirements 2022-11-13 18:34:54 +01:00
cd2d803ff5 Update requirements 2022-11-13 18:10:42 +01:00
6aec5b2a97 Add tests to excel in report datasets 2022-11-13 17:44:45 +01:00
f1b9dc1fef Add excel to report dataset 2022-11-13 14:46:41 +01:00
2e6f49de8e Add discretize key to .env.dist 2022-11-12 19:38:14 +01:00
2d61cd11c2 refactor Discretization in datasets 2022-11-12 19:37:46 +01:00
4b442a46f2 Add Discretizer to Datasets 2022-11-10 11:47:01 +01:00
feaf85d0b8 Add Dataset load return a pandas dataframe 2022-11-04 18:40:50 +01:00
0d87e670f7 Disable sonar quality gate in CI
Update base score for Arff STree
2022-11-01 16:53:22 +01:00
1e83db7956 Fix lint errors and update version info 2022-11-01 13:22:53 +01:00
8cf823e843 Add custom seeds to .env 2022-11-01 12:24:50 +01:00
97718e6e82 Add Language and language version to reports 2022-11-01 02:07:24 +01:00
Ricardo Montañana Gómez
5532beb88a Merge pull request #3 from Doctorado-ML/discretiz
Add Arff data source for experiments
Add consistent comparative results to reports
2022-10-25 16:55:04 +02:00
45 changed files with 1029 additions and 68 deletions

View File

@@ -4,3 +4,5 @@ n_folds=5
model=ODTE model=ODTE
stratified=0 stratified=0
source_data=Tanveer source_data=Tanveer
seeds=[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]
discretize=0

View File

@@ -1,2 +1,3 @@
[flake8] [flake8]
exclude = .git,__init__.py exclude = .git,__init__.py
ignore = E203, W503

View File

@@ -8,7 +8,7 @@ jobs:
name: Build name: Build
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- uses: actions/checkout@v2 - uses: actions/checkout@v3
with: with:
fetch-depth: 0 fetch-depth: 0
- run: echo "project_version=$(git describe --tags --abbrev=0)" >> $GITHUB_ENV - run: echo "project_version=$(git describe --tags --abbrev=0)" >> $GITHUB_ENV
@@ -22,7 +22,8 @@ jobs:
-Dsonar.python.version=3.10 -Dsonar.python.version=3.10
# If you wish to fail your job when the Quality Gate is red, uncomment the # If you wish to fail your job when the Quality Gate is red, uncomment the
# following lines. This would typically be used to fail a deployment. # following lines. This would typically be used to fail a deployment.
- uses: sonarsource/sonarqube-quality-gate-action@master #- uses: sonarsource/sonarqube-quality-gate-action@master
timeout-minutes: 5 # timeout-minutes: 5
env: # env:
SONAR_TOKEN: ${{ secrets.SONAR_TOKEN }} # SONAR_TOKEN: ${{ secrets.SONAR_TOKEN }}
# SONAR_HOST_URL: ${{ secrets.SONAR_HOST_URL }}

View File

@@ -13,10 +13,13 @@ jobs:
strategy: strategy:
matrix: matrix:
os: [macos-latest, ubuntu-latest] os: [macos-latest, ubuntu-latest]
python: ["3.10"] python: ["3.10", "3.11"]
exclude:
- os: macos-latest
python: "3.11"
steps: steps:
- uses: actions/checkout@v2 - uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python }} - name: Set up Python ${{ matrix.python }}
uses: actions/setup-python@v2 uses: actions/setup-python@v2
with: with:

View File

@@ -1,8 +1,10 @@
import os import os
import pandas as pd import pandas as pd
import numpy as np
from scipy.io import arff from scipy.io import arff
from .Utils import Files from .Utils import Files
from .Arguments import EnvData from .Arguments import EnvData
from mdlp.discretization import MDLP
class Diterator: class Diterator:
@@ -24,14 +26,18 @@ class DatasetsArff:
def folder(): def folder():
return "datasets" return "datasets"
def load(self, name, class_name): def load(self, name, class_name, dataframe):
file_name = os.path.join(self.folder(), self.dataset_names(name)) file_name = os.path.join(self.folder(), self.dataset_names(name))
data = arff.loadarff(file_name) data = arff.loadarff(file_name)
df = pd.DataFrame(data[0]) df = pd.DataFrame(data[0])
df = df.dropna() df.dropna(axis=0, how="any", inplace=True)
X = df.drop(class_name, axis=1).to_numpy() X = df.drop(class_name, axis=1)
self.features = X.columns
self.class_name = class_name
y, _ = pd.factorize(df[class_name]) y, _ = pd.factorize(df[class_name])
return X, y df[class_name] = y
X = X.to_numpy()
return df if dataframe else (X, y)
class DatasetsTanveer: class DatasetsTanveer:
@@ -43,7 +49,7 @@ class DatasetsTanveer:
def folder(): def folder():
return "data" return "data"
def load(self, name, _): def load(self, name, *args):
file_name = os.path.join(self.folder(), self.dataset_names(name)) file_name = os.path.join(self.folder(), self.dataset_names(name))
data = pd.read_csv( data = pd.read_csv(
file_name, file_name,
@@ -64,7 +70,7 @@ class DatasetsSurcov:
def folder(): def folder():
return "datasets" return "datasets"
def load(self, name, _): def load(self, name, *args):
file_name = os.path.join(self.folder(), self.dataset_names(name)) file_name = os.path.join(self.folder(), self.dataset_names(name))
data = pd.read_csv( data = pd.read_csv(
file_name, file_name,
@@ -80,15 +86,19 @@ class DatasetsSurcov:
class Datasets: class Datasets:
def __init__(self, dataset_name=None): def __init__(self, dataset_name=None):
envData = EnvData.load() envData = EnvData.load()
class_name = getattr( class_name = getattr(
__import__(__name__), __import__(__name__),
f"Datasets{envData['source_data']}", f"Datasets{envData['source_data']}",
) )
self.load = (
self.load_discretized
if envData["discretize"] == "1"
else self.load_continuous
)
self.dataset = class_name() self.dataset = class_name()
self.class_names = [] self.class_names = []
self.load_names() self._load_names()
if dataset_name is not None: if dataset_name is not None:
try: try:
class_name = self.class_names[ class_name = self.class_names[
@@ -99,7 +109,7 @@ class Datasets:
raise ValueError(f"Unknown dataset: {dataset_name}") raise ValueError(f"Unknown dataset: {dataset_name}")
self.data_sets = [dataset_name] self.data_sets = [dataset_name]
def load_names(self): def _load_names(self):
file_name = os.path.join(self.dataset.folder(), Files.index) file_name = os.path.join(self.dataset.folder(), Files.index)
default_class = "class" default_class = "class"
with open(file_name) as f: with open(file_name) as f:
@@ -115,12 +125,61 @@ class Datasets:
self.data_sets = result self.data_sets = result
self.class_names = class_names self.class_names = class_names
def load(self, name): def get_attributes(self, name):
class Attributes:
pass
X, y = self.load_continuous(name)
attr = Attributes()
values, counts = np.unique(y, return_counts=True)
comp = ""
sep = ""
for count in counts:
comp += f"{sep}{count/sum(counts)*100:5.2f}%"
sep = "/ "
attr.balance = comp
attr.classes = len(np.unique(y))
attr.samples = X.shape[0]
attr.features = X.shape[1]
return attr
def get_features(self):
    """Return the ``features`` attribute stored on the wrapped dataset loader."""
    # NOTE(review): in the visible code only DatasetsArff.load assigns
    # ``features``; confirm the other loaders set it before relying on this.
    return self.dataset.features
def get_class_name(self):
    """Return the ``class_name`` attribute stored on the wrapped dataset loader."""
    # NOTE(review): in the visible code only DatasetsArff.load assigns
    # ``class_name``; confirm the other loaders set it before relying on this.
    return self.dataset.class_name
def load_continuous(self, name, dataframe=False):
try: try:
class_name = self.class_names[self.data_sets.index(name)] class_name = self.class_names[self.data_sets.index(name)]
return self.dataset.load(name, class_name) return self.dataset.load(name, class_name, dataframe)
except (ValueError, FileNotFoundError): except (ValueError, FileNotFoundError):
raise ValueError(f"Unknown dataset: {name}") raise ValueError(f"Unknown dataset: {name}")
def discretize(self, X, y):
    """Discretize the features with the supervised MDLP method
    (Fayyad and Irani).

    Parameters
    ----------
    X : np.ndarray
        feature matrix of shape (n_samples, n_features)
    y : np.ndarray
        label vector of shape (n_samples,)

    Returns
    -------
    tuple of np.ndarray
        the discretized features and the labels, both cast to int
    """
    mdlp = MDLP()
    binned = mdlp.fit_transform(X, y)
    features = binned.astype(int)
    labels = y.astype(int)
    return features, labels
def load_discretized(self, name, dataframe=False):
    """Load a dataset and discretize its features.

    Parameters
    ----------
    name : str
        name of the dataset, forwarded to ``load_continuous``
    dataframe : bool
        when True return a single pandas DataFrame whose last column is
        the class; otherwise return the ``(X, y)`` tuple

    Returns
    -------
    pandas.DataFrame or tuple (X, y) of numpy.ndarray
    """
    X, y = self.load_continuous(name)
    X, y = self.discretize(X, y)
    if not dataframe:
        # The tuple is returned explicitly: the former
        # ``return dataset if dataframe else X, y`` parsed as
        # ``((dataset if dataframe else X), y)`` and therefore never
        # returned a bare DataFrame.
        return X, y
    dataset = pd.DataFrame(X, columns=self.get_features())
    dataset[self.get_class_name()] = y
    return dataset
def __iter__(self) -> Diterator: def __iter__(self) -> Diterator:
return Diterator(self.data_sets) return Diterator(self.data_sets)

View File

@@ -1,4 +1,5 @@
import os import os
import sys
import json import json
import random import random
import warnings import warnings
@@ -15,10 +16,13 @@ from sklearn.model_selection import (
from .Utils import Folders, Files, NO_RESULTS from .Utils import Folders, Files, NO_RESULTS
from .Datasets import Datasets from .Datasets import Datasets
from .Models import Models from .Models import Models
from .Arguments import EnvData
class Randomized: class Randomized:
seeds = [57, 31, 1714, 17, 23, 79, 83, 97, 7, 1] @staticmethod
def seeds():
return json.loads(EnvData.load()["seeds"])
class BestResults: class BestResults:
@@ -154,7 +158,7 @@ class Experiment:
self.platform = platform self.platform = platform
self.progress_bar = progress_bar self.progress_bar = progress_bar
self.folds = folds self.folds = folds
self.random_seeds = Randomized.seeds self.random_seeds = Randomized.seeds()
self.results = [] self.results = []
self.duration = 0 self.duration = 0
self._init_experiment() self._init_experiment()
@@ -162,6 +166,10 @@ class Experiment:
def get_output_file(self): def get_output_file(self):
return self.output_file return self.output_file
@staticmethod
def get_python_version():
return "{}.{}".format(sys.version_info.major, sys.version_info.minor)
def _build_classifier(self, random_state, hyperparameters): def _build_classifier(self, random_state, hyperparameters):
self.model = Models.get_model(self.model_name, random_state) self.model = Models.get_model(self.model_name, random_state)
clf = self.model clf = self.model
@@ -193,7 +201,7 @@ class Experiment:
shuffle=True, random_state=random_state, n_splits=self.folds shuffle=True, random_state=random_state, n_splits=self.folds
) )
clf = self._build_classifier(random_state, hyperparameters) clf = self._build_classifier(random_state, hyperparameters)
self.version = clf.version() if hasattr(clf, "version") else "-" self.version = Models.get_version(self.model_name, clf)
with warnings.catch_warnings(): with warnings.catch_warnings():
warnings.filterwarnings("ignore") warnings.filterwarnings("ignore")
res = cross_validate( res = cross_validate(
@@ -243,6 +251,8 @@ class Experiment:
output["duration"] = self.duration output["duration"] = self.duration
output["seeds"] = self.random_seeds output["seeds"] = self.random_seeds
output["platform"] = self.platform output["platform"] = self.platform
output["language_version"] = self.get_python_version()
output["language"] = "Python"
output["results"] = self.results output["results"] = self.results
with open(self.output_file, "w") as f: with open(self.output_file, "w") as f:
json.dump(output, f) json.dump(output, f)
@@ -301,7 +311,7 @@ class GridSearch:
self.progress_bar = progress_bar self.progress_bar = progress_bar
self.folds = folds self.folds = folds
self.platform = platform self.platform = platform
self.random_seeds = Randomized.seeds self.random_seeds = Randomized.seeds()
self.grid_file = os.path.join( self.grid_file = os.path.join(
Folders.results, Files.grid_input(score_name, model_name) Folders.results, Files.grid_input(score_name, model_name)
) )

View File

@@ -8,9 +8,12 @@ from sklearn.ensemble import (
) )
from sklearn.svm import SVC from sklearn.svm import SVC
from stree import Stree from stree import Stree
from bayesclass import TAN
from wodt import Wodt from wodt import Wodt
from odte import Odte from odte import Odte
from xgboost import XGBClassifier from xgboost import XGBClassifier
import sklearn
import xgboost
class Models: class Models:
@@ -18,6 +21,7 @@ class Models:
def define_models(random_state): def define_models(random_state):
return { return {
"STree": Stree(random_state=random_state), "STree": Stree(random_state=random_state),
"TAN": TAN(random_state=random_state),
"Cart": DecisionTreeClassifier(random_state=random_state), "Cart": DecisionTreeClassifier(random_state=random_state),
"ExtraTree": ExtraTreeClassifier(random_state=random_state), "ExtraTree": ExtraTreeClassifier(random_state=random_state),
"Wodt": Wodt(random_state=random_state), "Wodt": Wodt(random_state=random_state),
@@ -89,3 +93,15 @@ class Models:
nodes, leaves = result.nodes_leaves() nodes, leaves = result.nodes_leaves()
depth = result.depth_ if hasattr(result, "depth_") else 0 depth = result.depth_ if hasattr(result, "depth_") else 0
return nodes, leaves, depth return nodes, leaves, depth
@staticmethod
def get_version(name, clf):
if hasattr(clf, "version"):
return clf.version()
if name in ["Cart", "ExtraTree", "RandomForest", "GBC", "SVC"]:
return sklearn.__version__
elif name.startswith("Bagging") or name.startswith("AdaBoost"):
return sklearn.__version__
elif name == "XGBoost":
return xgboost.__version__
return "Error"

View File

@@ -1,4 +1,5 @@
import os import os
import sys
from operator import itemgetter from operator import itemgetter
import math import math
import json import json
@@ -17,6 +18,7 @@ from .Utils import (
TextColor, TextColor,
NO_RESULTS, NO_RESULTS,
) )
from ._version import __version__
class BestResultsEver: class BestResultsEver:
@@ -33,7 +35,7 @@ class BestResultsEver:
] ]
self.data["Arff"]["accuracy"] = [ self.data["Arff"]["accuracy"] = [
"STree_default (linear-ovo)", "STree_default (linear-ovo)",
21.9765, 22.063496,
] ]
def get_name_value(self, key, score): def get_name_value(self, key, score):
@@ -196,7 +198,8 @@ class Report(BaseReport):
self._compare_totals = {} self._compare_totals = {}
self.header_line("*") self.header_line("*")
self.header_line( self.header_line(
f" Report {self.data['model']} ver. {self.data['version']}" f" {self.data['model']} ver. {self.data['version']}"
f" {self.data['language']} ver. {self.data['language_version']}"
f" with {self.data['folds']} Folds " f" with {self.data['folds']} Folds "
f"cross validation and {len(self.data['seeds'])} random seeds. " f"cross validation and {len(self.data['seeds'])} random seeds. "
f"{self.data['date']} {self.data['time']}" f"{self.data['date']} {self.data['time']}"
@@ -347,7 +350,8 @@ class Excel(BaseReport):
def get_title(self): def get_title(self):
return ( return (
f" Report {self.data['model']} ver. {self.data['version']}" f" {self.data['model']} ver. {self.data['version']}"
f" {self.data['language']} ver. {self.data['language_version']}"
f" with {self.data['folds']} Folds " f" with {self.data['folds']} Folds "
f"cross validation and {len(self.data['seeds'])} random seeds. " f"cross validation and {len(self.data['seeds'])} random seeds. "
f"{self.data['date']} {self.data['time']}" f"{self.data['date']} {self.data['time']}"
@@ -564,37 +568,251 @@ class Excel(BaseReport):
self.sheet.set_row(c, 20) self.sheet.set_row(c, 20)
self.sheet.set_row(0, 25) self.sheet.set_row(0, 25)
self.sheet.freeze_panes(6, 1) self.sheet.freeze_panes(6, 1)
self.sheet.hide_gridlines() self.sheet.hide_gridlines(2)
if self.close: if self.close:
self.book.close() self.book.close()
class ReportDatasets: class ReportDatasets:
row = 6
# alternate lines colors
color1 = "#DCE6F1"
color2 = "#FDE9D9"
color3 = "#B1A0C7"
def __init__(self, excel=False, book=None):
    """Prepare the datasets report.

    Parameters
    ----------
    excel : bool
        when truthy an Excel worksheet named "Datasets" is also produced
    book : workbook or None
        existing workbook to add the worksheet to; when None (and
        ``excel`` is set) a new workbook is created and will be closed
        by this object
    """
    self.excel = excel
    self.env = EnvData().load()
    self.close = False
    self.output = True
    self.header_text = f"Datasets used in benchmark ver. {__version__}"
    if not excel:
        return
    self.max_length = 0
    if book is not None:
        # the sheet goes into a caller-owned book: no console output
        # and the caller is responsible for closing the book
        self.book = book
        self.output = False
    else:
        self.excel_file_name = Files.datasets_report_excel
        self.book = xlsxwriter.Workbook(
            self.excel_file_name, {"nan_inf_to_errors": True}
        )
        self.set_properties(self.get_title())
        self.close = True
    self.sheet = self.book.add_worksheet("Datasets")
def set_properties(self, title):
    """Store the document metadata in the workbook, using ``title`` as
    the document title."""
    metadata = dict(
        title=title,
        subject="Machine learning results",
        author="Ricardo Montañana Gómez",
        manager="Dr. J. A. Gámez, Dr. J. M. Puerta",
        company="UCLM",
        comments="Created with Python and XlsxWriter",
    )
    self.book.set_properties(metadata)
@staticmethod @staticmethod
def report(): def get_python_version():
return "{}.{}".format(sys.version_info.major, sys.version_info.minor)
def get_title(self):
    """Build the workbook title from the benchmark version, the Python
    version and the folds / discretize / stratified settings in the
    environment data."""
    env = self.env
    pieces = (
        f" Benchmark ver. {__version__} - ",
        f" Python ver. {self.get_python_version()}",
        f" with {env['n_folds']} Folds cross validation ",
        f" Discretization: {env['discretize']} ",
        f"Stratification: {env['stratified']}",
    )
    return "".join(pieces)
def get_file_name(self):
    """Return the name of the Excel file created by this report."""
    # ``excel_file_name`` is assigned in __init__ only when excel output was
    # requested and no external workbook was supplied.
    return self.excel_file_name
def header(self):
    """Write the title, the settings summary (rows 1-4) and the column
    headings (row 5) of the Datasets worksheet."""

    def boxed(align, font_size, bg_color):
        # bordered, bold, vertically centred cell format
        return self.book.add_format(
            {
                "border": 1,
                "bold": 1,
                "align": align,
                "valign": "vcenter",
                "font_size": font_size,
                "bg_color": bg_color,
            }
        )

    title_format = boxed("center", 18, self.color3)
    score_format = boxed("center", 16, self.color1)
    label_format = boxed("right", 16, self.color1)
    value_format = boxed("left", 16, self.color1)
    self.sheet.merge_range(0, 0, 0, 4, self.header_text, title_format)
    self.sheet.merge_range(
        1, 0, 4, 0, f" Default score {self.env['score']}", score_format
    )
    # one (label, value) pair per worksheet row, starting at row 1
    settings = (
        ("Cross validation", f"{self.env['n_folds']} Folds"),
        ("Stratified", str(self.env["stratified"] == "1")),
        ("Discretized", str(self.env["discretize"] == "1")),
        ("Seeds", f"{self.env['seeds']}"),
    )
    for row, (label, value) in enumerate(settings, start=1):
        self.sheet.merge_range(row, 1, row, 3, label, label_format)
        self.sheet.write(row, 4, value, value_format)
    # the seeds text shares column 4 with Balance, so it takes part in
    # the column width computation
    self.update_max_length(len(self.env["seeds"]) + 1)
    columns = [
        ("Dataset", 30),
        ("Samples", 10),
        ("Features", 10),
        ("Classes", 10),
        ("Balance", 50),
    ]
    heading_format = self.book.add_format(
        {
            "bold": True,
            "font_size": 14,
            "bg_color": self.color3,
            "border": 1,
        }
    )
    for position, (caption, width) in enumerate(columns):
        self.sheet.write(5, position, caption, heading_format)
        self.sheet.set_column(position, position, width)
def footer(self):
    """Apply the final worksheet layout and close the workbook when this
    report created it."""
    # set Balance column width to max length
    self.sheet.set_column(4, 4, self.max_length)
    # keep the header rows (0-5) and the Dataset column fixed while scrolling
    self.sheet.freeze_panes(6, 1)
    # XlsxWriter option 2: hide gridlines both on screen and when printed
    self.sheet.hide_gridlines(2)
    # self.close is True only when __init__ created the workbook itself
    if self.close:
        self.book.close()
def print_line(self, result):
    """Write one dataset row (name, samples, features, classes, balance)
    at the current row of the worksheet and advance the row counter."""
    font_size = 14
    integer = self.book.add_format(
        {"num_format": "#,###", "font_size": font_size, "border": 1}
    )
    normal = self.book.add_format({"font_size": font_size, "border": 1})
    # alternate the background colour between even and odd rows
    background = self.color1 if self.row % 2 == 0 else self.color2
    normal.set_bg_color(background)
    integer.set_bg_color(background)
    cells = (
        (result.dataset, normal),
        (result.samples, integer),
        (result.features, integer),
        (result.classes, normal),
        (result.balance, normal),
    )
    for offset, (content, cell_format) in enumerate(cells):
        self.sheet.write(self.row, offset, content, cell_format)
    self.update_max_length(len(result.balance))
    self.row += 1
def update_max_length(self, value):
    """Remember ``value`` as the maximum length seen so far when it
    exceeds the stored one."""
    self.max_length = max(self.max_length, value)
def report(self):
data_sets = Datasets() data_sets = Datasets()
color_line = TextColor.LINE1 color_line = TextColor.LINE1
if self.excel:
self.header()
if self.output:
print(color_line, end="") print(color_line, end="")
print(self.header_text)
print("")
print(f"{'Dataset':30s} Sampl. Feat. Cls Balance") print(f"{'Dataset':30s} Sampl. Feat. Cls Balance")
print("=" * 30 + " ===== ====== === " + "=" * 40) print("=" * 30 + " ====== ===== === " + "=" * 60)
for dataset in data_sets: for dataset in data_sets:
X, y = data_sets.load(dataset) attributes = data_sets.get_attributes(dataset)
attributes.dataset = dataset
if self.excel:
self.print_line(attributes)
color_line = ( color_line = (
TextColor.LINE2 TextColor.LINE2
if color_line == TextColor.LINE1 if color_line == TextColor.LINE1
else TextColor.LINE1 else TextColor.LINE1
) )
values, counts = np.unique(y, return_counts=True) if self.output:
comp = ""
sep = ""
for count in counts:
comp += f"{sep}{count/sum(counts)*100:5.2f}%"
sep = "/ "
print(color_line, end="") print(color_line, end="")
print( print(
f"{dataset:30s} {X.shape[0]:6,d} {X.shape[1]:5,d} " f"{dataset:30s} {attributes.samples:6,d} "
f"{len(np.unique(y)):3d} {comp:40s}" f"{attributes.features:5,d} {attributes.classes:3d} "
f"{attributes.balance:40s}"
) )
if self.excel:
self.footer()
class SQL(BaseReport): class SQL(BaseReport):
@@ -1066,7 +1284,12 @@ class Benchmark:
k = Excel(file_name=file_name, book=book) k = Excel(file_name=file_name, book=book)
k.report() k.report()
sheet.freeze_panes(6, 1) sheet.freeze_panes(6, 1)
sheet.hide_gridlines() sheet.hide_gridlines(2)
def add_datasets_sheet():
    """Append the "Datasets" worksheet to the enclosing workbook."""
    # Add datasets sheet
    datasets_report = ReportDatasets(excel=True, book=book)
    datasets_report.report()
def exreport_output(): def exreport_output():
file_name = os.path.join( file_name = os.path.join(
@@ -1094,6 +1317,7 @@ class Benchmark:
footer() footer()
models_files() models_files()
exreport_output() exreport_output()
add_datasets_sheet()
book.close() book.close()

View File

@@ -1,6 +1,8 @@
import os import os
import sys
import subprocess import subprocess
PYTHON_VERSION = "{}.{}".format(sys.version_info.major, sys.version_info.minor)
NO_RESULTS = "** No results found **" NO_RESULTS = "** No results found **"
NO_ENV = "File .env not found" NO_ENV = "File .env not found"
@@ -25,6 +27,7 @@ class Files:
exreport_pdf = "Rplots.pdf" exreport_pdf = "Rplots.pdf"
benchmark_r = "benchmark.r" benchmark_r = "benchmark.r"
dot_env = ".env" dot_env = ".env"
datasets_report_excel = "ReportDatasets.xlsx"
@staticmethod @staticmethod
def exreport_output(score): def exreport_output(score):

View File

@@ -1,10 +1,16 @@
from .Datasets import Datasets, DatasetsSurcov, DatasetsTanveer, DatasetsArff from .Datasets import (
Datasets,
DatasetsSurcov,
DatasetsTanveer,
DatasetsArff,
)
from .Experiments import Experiment from .Experiments import Experiment
from .Results import Report, Summary from .Results import Report, Summary
from ._version import __version__
__author__ = "Ricardo Montañana Gómez" __author__ = "Ricardo Montañana Gómez"
__copyright__ = "Copyright 2020-2022, Ricardo Montañana Gómez" __copyright__ = "Copyright 2020-2023, Ricardo Montañana Gómez"
__license__ = "MIT License" __license__ = "MIT License"
__author_email__ = "ricardo.montanana@alu.uclm.es" __author_email__ = "ricardo.montanana@alu.uclm.es"
__all__ = ["Experiment", "Datasets", "Report", "Summary"] __all__ = ["Experiment", "Datasets", "Report", "Summary", __version__]

1
benchmark/_version Normal file
View File

@@ -0,0 +1 @@
__version__ = "0.7.1"

View File

@@ -1 +1 @@
__version__ = "0.1.1" __version__ = "0.2.0"

View File

@@ -21,7 +21,11 @@ def main(args_test=None):
if args.grid: if args.grid:
args.best = None args.best = None
if args.file is None and args.best is None and args.grid is None: if args.file is None and args.best is None and args.grid is None:
ReportDatasets.report() report = ReportDatasets(args.excel)
report.report()
if args.excel:
is_test = args_test is not None
Files.open(report.get_file_name(), is_test)
else: else:
if args.best is not None or args.grid is not None: if args.best is not None or args.grid is not None:
report = ReportBest(args.score, args.model, args.best, args.grid) report = ReportBest(args.score, args.model, args.best, args.grid)

View File

@@ -5,3 +5,5 @@ model=ODTE
stratified=0 stratified=0
# Source of data Tanveer/Surcov # Source of data Tanveer/Surcov
source_data=Tanveer source_data=Tanveer
seeds=[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]
discretize=0

View File

@@ -4,3 +4,5 @@ n_folds=5
model=ODTE model=ODTE
stratified=0 stratified=0
source_data=Arff source_data=Arff
seeds=[271, 314, 171]
discretize=1

View File

@@ -5,3 +5,5 @@ model=ODTE
stratified=0 stratified=0
# Source of data Tanveer/Surcov # Source of data Tanveer/Surcov
source_data=Tanveer source_data=Tanveer
seeds=[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]
discretize=0

View File

@@ -5,3 +5,5 @@ model=ODTE
stratified=0 stratified=0
# Source of data Tanveer/Surcov # Source of data Tanveer/Surcov
source_data=Surcov source_data=Surcov
seeds=[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]
discretize=0

1
benchmark/tests/.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
ReportDatasets.xlsx

View File

@@ -89,6 +89,15 @@ class BenchmarkTest(TestBase):
self.assertTrue(os.path.exists(benchmark.get_tex_file())) self.assertTrue(os.path.exists(benchmark.get_tex_file()))
self.check_file_file(benchmark.get_tex_file(), "exreport_tex") self.check_file_file(benchmark.get_tex_file(), "exreport_tex")
@staticmethod
def generate_excel_sheet(test, sheet, file_name):
    """Dump every non-empty cell of ``sheet`` to ``test_files/file_name``
    as ``row;col;"value"`` lines.

    ``test`` is accepted but not used.
    """
    target = os.path.join("test_files", file_name)
    with open(target, "w") as output:
        for row in range(1, sheet.max_row + 1):
            for col in range(1, sheet.max_column + 1):
                content = sheet.cell(row=row, column=col).value
                if content is None:
                    continue
                print(f'{row};{col};"{content}"', file=output)
def test_excel_output(self): def test_excel_output(self):
benchmark = Benchmark("accuracy", visualize=False) benchmark = Benchmark("accuracy", visualize=False)
benchmark.compile_results() benchmark.compile_results()
@@ -101,6 +110,3 @@ class BenchmarkTest(TestBase):
for sheet_name in book.sheetnames: for sheet_name in book.sheetnames:
sheet = book[sheet_name] sheet = book[sheet_name]
self.check_excel_sheet(sheet, f"exreport_excel_{sheet_name}") self.check_excel_sheet(sheet, f"exreport_excel_{sheet_name}")
# ExcelTest.generate_excel_sheet(
# self, sheet, f"exreport_excel_{sheet_name}"
# )

View File

@@ -23,7 +23,12 @@ class DatasetTest(TestBase):
def test_Randomized(self): def test_Randomized(self):
expected = [57, 31, 1714, 17, 23, 79, 83, 97, 7, 1] expected = [57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]
self.assertSequenceEqual(Randomized.seeds, expected) self.assertSequenceEqual(Randomized.seeds(), expected)
def test_Randomized_3_seeds(self):
    """Randomized.seeds() must return the seeds of the active env file."""
    # switch the environment to the .env.arff fixture
    self.set_env(".env.arff")
    expected = [271, 314, 171]
    self.assertSequenceEqual(Randomized.seeds(), expected)
def test_Datasets_iterator(self): def test_Datasets_iterator(self):
test = { test = {

View File

@@ -15,6 +15,8 @@ from odte import Odte
from xgboost import XGBClassifier from xgboost import XGBClassifier
from .TestBase import TestBase from .TestBase import TestBase
from ..Models import Models from ..Models import Models
import xgboost
import sklearn
class ModelTest(TestBase): class ModelTest(TestBase):
@@ -33,6 +35,38 @@ class ModelTest(TestBase):
for key, value in test.items(): for key, value in test.items():
self.assertIsInstance(Models.get_model(key), value) self.assertIsInstance(Models.get_model(key), value)
def test_Models_version(self):
    """Models.get_version must use the classifier's own ``version`` when
    it has one and the providing library's ``__version__`` otherwise."""
    # Local import: the library versions are patched only for the
    # duration of each check, so the fake values cannot leak into other
    # tests (the previous code overwrote them globally).
    from unittest import mock

    def ver_stree():
        return "1.2.3"

    def ver_wodt():
        return "h.j.k"

    def ver_odte():
        return "4.5.6"

    test = {
        "STree": [ver_stree, "1.2.3"],
        "Wodt": [ver_wodt, "h.j.k"],
        "ODTE": [ver_odte, "4.5.6"],
        "RandomForest": [None, "7.8.9"],
        "BaggingStree": [None, "x.y.z"],
        "AdaBoostStree": [None, "w.x.z"],
        "XGBoost": [None, "10.11.12"],
    }
    for key, value in test.items():
        clf = Models.get_model(key)
        if key in ["STree", "Wodt", "ODTE"]:
            # these models report their version through the classifier
            clf.version = value[0]
            self.assertEqual(Models.get_version(key, clf), value[1])
            continue
        library = xgboost if key == "XGBoost" else sklearn
        with mock.patch.object(library, "__version__", value[1]):
            self.assertEqual(Models.get_version(key, clf), value[1])
def test_bogus_Model_Version(self):
    """An unknown model name without a classifier must yield "Error"."""
    self.assertEqual(Models.get_version("unknown", None), "Error")
def test_BaggingStree(self): def test_BaggingStree(self):
clf = Models.get_model("BaggingStree") clf = Models.get_model("BaggingStree")
self.assertIsInstance(clf, BaggingClassifier) self.assertIsInstance(clf, BaggingClassifier)

View File

@@ -178,6 +178,8 @@ class UtilTest(TestBase):
"model": "ODTE", "model": "ODTE",
"stratified": "0", "stratified": "0",
"source_data": "Tanveer", "source_data": "Tanveer",
"seeds": "[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]",
"discretize": "0",
} }
computed = EnvData().load() computed = EnvData().load()
self.assertDictEqual(computed, expected) self.assertDictEqual(computed, expected)

View File

@@ -3,6 +3,8 @@
"title": "Gridsearched hyperparams v022.1b random_init", "title": "Gridsearched hyperparams v022.1b random_init",
"model": "ODTE", "model": "ODTE",
"version": "0.3.2", "version": "0.3.2",
"language_version": "3.11x",
"language": "Python",
"stratified": false, "stratified": false,
"folds": 5, "folds": 5,
"date": "2022-04-20", "date": "2022-04-20",

View File

@@ -3,6 +3,8 @@
"title": "Test default paramters with RandomForest", "title": "Test default paramters with RandomForest",
"model": "RandomForest", "model": "RandomForest",
"version": "-", "version": "-",
"language_version": "3.11x",
"language": "Python",
"stratified": false, "stratified": false,
"folds": 5, "folds": 5,
"date": "2022-01-14", "date": "2022-01-14",

View File

@@ -3,6 +3,8 @@
"model": "STree", "model": "STree",
"stratified": false, "stratified": false,
"folds": 5, "folds": 5,
"language_version": "3.11x",
"language": "Python",
"date": "2021-09-30", "date": "2021-09-30",
"time": "11:42:07", "time": "11:42:07",
"duration": 624.2505249977112, "duration": 624.2505249977112,

View File

@@ -1,6 +1,8 @@
{ {
"score_name": "accuracy", "score_name": "accuracy",
"model": "STree", "model": "STree",
"language": "Python",
"language_version": "3.11x",
"stratified": false, "stratified": false,
"folds": 5, "folds": 5,
"date": "2021-10-27", "date": "2021-10-27",

View File

@@ -1,6 +1,8 @@
{ {
"score_name": "accuracy", "score_name": "accuracy",
"model": "STree", "model": "STree",
"language_version": "3.11x",
"language": "Python",
"stratified": false, "stratified": false,
"folds": 5, "folds": 5,
"date": "2021-11-01", "date": "2021-11-01",

View File

@@ -1,6 +1,6 @@
import os import os
from openpyxl import load_workbook from openpyxl import load_workbook
from ...Utils import Folders from ...Utils import Folders, Files
from ..TestBase import TestBase from ..TestBase import TestBase
@@ -43,6 +43,15 @@ class BeReportTest(TestBase):
self.assertEqual(stderr.getvalue(), "") self.assertEqual(stderr.getvalue(), "")
self.check_output_file(stdout, "report_datasets") self.check_output_file(stdout, "report_datasets")
def test_be_report_datasets_excel(self):
    """be_report with the excel flag must print the datasets report and
    create a workbook with a "Datasets" sheet."""
    stdout, stderr = self.execute_script("be_report", ["-x", "1"])
    self.assertEqual(stderr.getvalue(), "")
    # the console output is compared with the same fixture as the
    # non-excel datasets report test
    self.check_output_file(stdout, "report_datasets")
    # the workbook is looked up in the current working directory
    file_name = os.path.join(os.getcwd(), Files.datasets_report_excel)
    book = load_workbook(file_name)
    sheet = book["Datasets"]
    self.check_excel_sheet(sheet, "exreport_excel_Datasets")
def test_be_report_best(self): def test_be_report_best(self):
stdout, stderr = self.execute_script( stdout, stderr = self.execute_script(
"be_report", ["-s", "accuracy", "-m", "STree", "-b", "1"] "be_report", ["-s", "accuracy", "-m", "STree", "-b", "1"]

View File

@@ -1,5 +1,5 @@
************************************************************************************************************************ ************************************************************************************************************************
* Report STree ver. 1.2.4 with 5 Folds cross validation and 10 random seeds. 2022-05-09 00:15:25 * * STree ver. 1.2.4 Python ver. 3.11x with 5 Folds cross validation and 10 random seeds. 2022-05-09 00:15:25 *
* test * * test *
* Random seeds: [57, 31, 1714, 17, 23, 79, 83, 97, 7, 1] Stratified: False * * Random seeds: [57, 31, 1714, 17, 23, 79, 83, 97, 7, 1] Stratified: False *
* Execution took 0.80 seconds, 0.00 hours, on iMac27 * * Execution took 0.80 seconds, 0.00 hours, on iMac27 *

View File

@@ -1,5 +1,5 @@
************************************************************************************************************************ ************************************************************************************************************************
* Report STree ver. 1.2.4 with 5 Folds cross validation and 10 random seeds. 2022-05-08 20:14:43 * * STree ver. 1.2.4 Python ver. 3.11x with 5 Folds cross validation and 10 random seeds. 2022-05-08 20:14:43 *
* test * * test *
* Random seeds: [57, 31, 1714, 17, 23, 79, 83, 97, 7, 1] Stratified: False * * Random seeds: [57, 31, 1714, 17, 23, 79, 83, 97, 7, 1] Stratified: False *
* Execution took 0.48 seconds, 0.00 hours, on iMac27 * * Execution took 0.48 seconds, 0.00 hours, on iMac27 *

View File

@@ -1,5 +1,5 @@
************************************************************************************************************************ ************************************************************************************************************************
* Report STree ver. 1.2.4 with 5 Folds cross validation and 10 random seeds. 2022-05-08 19:38:28 * * STree ver. 1.2.4 Python ver. 3.11x with 5 Folds cross validation and 10 random seeds. 2022-05-08 19:38:28 *
* test * * test *
* Random seeds: [57, 31, 1714, 17, 23, 79, 83, 97, 7, 1] Stratified: False * * Random seeds: [57, 31, 1714, 17, 23, 79, 83, 97, 7, 1] Stratified: False *
* Execution took 0.06 seconds, 0.00 hours, on iMac27 * * Execution took 0.06 seconds, 0.00 hours, on iMac27 *

View File

@@ -1,5 +1,5 @@
************************************************************************************************************************ ************************************************************************************************************************
* Report STree ver. 1.2.4 with 5 Folds cross validation and 10 random seeds. 2022-05-09 00:21:06 * * STree ver. 1.2.4 Python ver. 3.11x with 5 Folds cross validation and 10 random seeds. 2022-05-09 00:21:06 *
* test * * test *
* Random seeds: [57, 31, 1714, 17, 23, 79, 83, 97, 7, 1] Stratified: False * * Random seeds: [57, 31, 1714, 17, 23, 79, 83, 97, 7, 1] Stratified: False *
* Execution took 0.89 seconds, 0.00 hours, on iMac27 * * Execution took 0.89 seconds, 0.00 hours, on iMac27 *

View File

@@ -1,4 +1,4 @@
1;1;" Report STree ver. 1.2.3 with 5 Folds cross validation and 10 random seeds. 2021-09-30 11:42:07" 1;1;" STree ver. 1.2.3 Python ver. 3.11x with 5 Folds cross validation and 10 random seeds. 2021-09-30 11:42:07"
2;1;" With gridsearched hyperparameters" 2;1;" With gridsearched hyperparameters"
3;1;" Score is accuracy" 3;1;" Score is accuracy"
3;2;" Execution time" 3;2;" Execution time"

View File

@@ -1,4 +1,4 @@
1;1;" Report ODTE ver. 0.3.2 with 5 Folds cross validation and 10 random seeds. 2022-04-20 10:52:20" 1;1;" ODTE ver. 0.3.2 Python ver. 3.11x with 5 Folds cross validation and 10 random seeds. 2022-04-20 10:52:20"
2;1;" Gridsearched hyperparams v022.1b random_init" 2;1;" Gridsearched hyperparams v022.1b random_init"
3;1;" Score is accuracy" 3;1;" Score is accuracy"
3;2;" Execution time" 3;2;" Execution time"

View File

@@ -1,4 +1,4 @@
1;1;" Report STree ver. 1.2.3 with 5 Folds cross validation and 10 random seeds. 2021-10-27 09:40:40" 1;1;" STree ver. 1.2.3 Python ver. 3.11x with 5 Folds cross validation and 10 random seeds. 2021-10-27 09:40:40"
2;1;" default A" 2;1;" default A"
3;1;" Score is accuracy" 3;1;" Score is accuracy"
3;2;" Execution time" 3;2;" Execution time"

View File

@@ -1,4 +1,4 @@
1;1;" Report STree ver. 1.2.3 with 5 Folds cross validation and 10 random seeds. 2021-09-30 11:42:07" 1;1;" STree ver. 1.2.3 Python ver. 3.11x with 5 Folds cross validation and 10 random seeds. 2021-09-30 11:42:07"
2;1;" With gridsearched hyperparameters" 2;1;" With gridsearched hyperparameters"
3;1;" Score is accuracy" 3;1;" Score is accuracy"
3;2;" Execution time" 3;2;" Execution time"

View File

@@ -0,0 +1,25 @@
1;1;"Datasets used in benchmark ver. 0.2.0"
2;1;" Default score accuracy"
2;2;"Cross validation"
2;5;"5 Folds"
3;2;"Stratified"
3;5;"False"
4;2;"Discretized"
4;5;"False"
5;2;"Seeds"
5;5;"[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]"
6;1;"Dataset"
6;2;"Samples"
6;3;"Features"
6;4;"Classes"
6;5;"Balance"
7;1;"balance-scale"
7;2;"625"
7;3;"4"
7;4;"3"
7;5;" 7.84%/ 46.08%/ 46.08%"
8;1;"balloons"
8;2;"16"
8;3;"4"
8;4;"2"
8;5;"56.25%/ 43.75%"

View File

@@ -1,4 +1,4 @@
1;1;" Report ODTE ver. 0.3.2 with 5 Folds cross validation and 10 random seeds. 2022-04-20 10:52:20" 1;1;" ODTE ver. 0.3.2 Python ver. 3.11x with 5 Folds cross validation and 10 random seeds. 2022-04-20 10:52:20"
2;1;" Gridsearched hyperparams v022.1b random_init" 2;1;" Gridsearched hyperparams v022.1b random_init"
3;1;" Score is accuracy" 3;1;" Score is accuracy"
3;2;" Execution time" 3;2;" Execution time"

View File

@@ -1,4 +1,4 @@
1;1;" Report RandomForest ver. - with 5 Folds cross validation and 10 random seeds. 2022-01-14 12:39:30" 1;1;" RandomForest ver. - Python ver. 3.11x with 5 Folds cross validation and 10 random seeds. 2022-01-14 12:39:30"
2;1;" Test default paramters with RandomForest" 2;1;" Test default paramters with RandomForest"
3;1;" Score is accuracy" 3;1;" Score is accuracy"
3;2;" Execution time" 3;2;" Execution time"

View File

@@ -1,4 +1,4 @@
1;1;" Report STree ver. 1.2.3 with 5 Folds cross validation and 10 random seeds. 2021-09-30 11:42:07" 1;1;" STree ver. 1.2.3 Python ver. 3.11x with 5 Folds cross validation and 10 random seeds. 2021-09-30 11:42:07"
2;1;" With gridsearched hyperparameters" 2;1;" With gridsearched hyperparameters"
3;1;" Score is accuracy" 3;1;" Score is accuracy"
3;2;" Execution time" 3;2;" Execution time"

View File

@@ -1,5 +1,5 @@
************************************************************************************************************************ ************************************************************************************************************************
* Report STree ver. 1.2.3 with 5 Folds cross validation and 10 random seeds. 2021-09-30 11:42:07 * * STree ver. 1.2.3 Python ver. 3.11x with 5 Folds cross validation and 10 random seeds. 2021-09-30 11:42:07 *
* With gridsearched hyperparameters * * With gridsearched hyperparameters *
* Random seeds: [57, 31, 1714, 17, 23, 79, 83, 97, 7, 1] Stratified: False * * Random seeds: [57, 31, 1714, 17, 23, 79, 83, 97, 7, 1] Stratified: False *
* Execution took 624.25 seconds, 0.17 hours, on iMac27 * * Execution took 624.25 seconds, 0.17 hours, on iMac27 *

View File

@@ -1,5 +1,5 @@
************************************************************************************************************************ ************************************************************************************************************************
* Report STree ver. 1.2.3 with 5 Folds cross validation and 10 random seeds. 2021-09-30 11:42:07 * * STree ver. 1.2.3 Python ver. 3.11x with 5 Folds cross validation and 10 random seeds. 2021-09-30 11:42:07 *
* With gridsearched hyperparameters * * With gridsearched hyperparameters *
* Random seeds: [57, 31, 1714, 17, 23, 79, 83, 97, 7, 1] Stratified: False * * Random seeds: [57, 31, 1714, 17, 23, 79, 83, 97, 7, 1] Stratified: False *
* Execution took 624.25 seconds, 0.17 hours, on iMac27 * * Execution took 624.25 seconds, 0.17 hours, on iMac27 *

View File

@@ -1,4 +1,6 @@
Dataset Sampl. Feat. Cls Balance Datasets used in benchmark ver. 0.2.0
============================== ===== ====== === ========================================
Dataset Sampl. Feat. Cls Balance
============================== ====== ===== === ============================================================
balance-scale 625 4 3 7.84%/ 46.08%/ 46.08% balance-scale 625 4 3 7.84%/ 46.08%/ 46.08%
balloons 16 4 2 56.25%/ 43.75% balloons 16 4 2 56.25%/ 43.75%

View File

@@ -2,7 +2,10 @@ pandas
scikit-learn scikit-learn
scipy scipy
odte odte
cython
mdlp-discretization
mufs mufs
bayesclass @ git+ssh://git@github.com/doctorado-ml/bayesclass.git
xlsxwriter xlsxwriter
openpyxl openpyxl
tqdm tqdm

526
weka_test.ipynb Normal file
View File

@@ -0,0 +1,526 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "0e48f7d2-7481-4eca-9c38-56d21c203093",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"DEBUG:weka.core.jvm:Adding bundled jars\n",
"DEBUG:weka.core.jvm:Classpath=['/Users/rmontanana/miniconda3/envs/pyweka/lib/python3.10/site-packages/javabridge/jars/rhino-1.7R4.jar', '/Users/rmontanana/miniconda3/envs/pyweka/lib/python3.10/site-packages/javabridge/jars/runnablequeue.jar', '/Users/rmontanana/miniconda3/envs/pyweka/lib/python3.10/site-packages/javabridge/jars/cpython.jar', '/Users/rmontanana/miniconda3/envs/pyweka/lib/python3.10/site-packages/weka/lib/python-weka-wrapper.jar', '/Users/rmontanana/miniconda3/envs/pyweka/lib/python3.10/site-packages/weka/lib/weka.jar']\n",
"DEBUG:weka.core.jvm:MaxHeapSize=default\n",
"DEBUG:weka.core.jvm:Package support disabled\n",
"WARNING: An illegal reflective access operation has occurred\n",
"WARNING: Illegal reflective access by weka.core.WekaPackageClassLoaderManager (file:/Users/rmontanana/miniconda3/envs/pyweka/lib/python3.10/site-packages/weka/lib/weka.jar) to method java.lang.ClassLoader.defineClass(java.lang.String,byte[],int,int,java.security.ProtectionDomain)\n",
"WARNING: Please consider reporting this to the maintainers of weka.core.WekaPackageClassLoaderManager\n",
"WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations\n",
"WARNING: All illegal access operations will be denied in a future release\n"
]
}
],
"source": [
"import weka.core.jvm as jvm\n",
"jvm.start()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "2ac4e479-3818-4562-a967-bb303d8dd573",
"metadata": {},
"outputs": [],
"source": [
"from weka.core.converters import Loader\n",
"data_dir = \"/Users/rmontanana/Code/discretizbench/datasets/\"\n",
"loader = Loader(classname=\"weka.core.converters.ArffLoader\")\n",
"data = loader.load_file(data_dir + \"iris.arff\")\n",
"data.class_is_last()\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "ceb9f912-db42-4cbc-808f-48b5a9d89d44",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"@relation iris\n",
"\n",
"@attribute sepallength numeric\n",
"@attribute sepalwidth numeric\n",
"@attribute petallength numeric\n",
"@attribute petalwidth numeric\n",
"@attribute class {Iris-setosa,Iris-versicolor,Iris-virginica}\n",
"\n",
"@data\n",
"5.1,3.5,1.4,0.2,Iris-setosa\n",
"4.9,3,1.4,0.2,Iris-setosa\n",
"4.7,3.2,1.3,0.2,Iris-setosa\n",
"4.6,3.1,1.5,0.2,Iris-setosa\n",
"5,3.6,1.4,0.2,Iris-setosa\n",
"5.4,3.9,1.7,0.4,Iris-setosa\n",
"4.6,3.4,1.4,0.3,Iris-setosa\n",
"5,3.4,1.5,0.2,Iris-setosa\n",
"4.4,2.9,1.4,0.2,Iris-setosa\n",
"4.9,3.1,1.5,0.1,Iris-setosa\n",
"5.4,3.7,1.5,0.2,Iris-setosa\n",
"4.8,3.4,1.6,0.2,Iris-setosa\n",
"4.8,3,1.4,0.1,Iris-setosa\n",
"4.3,3,1.1,0.1,Iris-setosa\n",
"5.8,4,1.2,0.2,Iris-setosa\n",
"5.7,4.4,1.5,0.4,Iris-setosa\n",
"5.4,3.9,1.3,0.4,Iris-setosa\n",
"5.1,3.5,1.4,0.3,Iris-setosa\n",
"5.7,3.8,1.7,0.3,Iris-setosa\n",
"5.1,3.8,1.5,0.3,Iris-setosa\n",
"5.4,3.4,1.7,0.2,Iris-setosa\n",
"5.1,3.7,1.5,0.4,Iris-setosa\n",
"4.6,3.6,1,0.2,Iris-setosa\n",
"5.1,3.3,1.7,0.5,Iris-setosa\n",
"4.8,3.4,1.9,0.2,Iris-setosa\n",
"5,3,1.6,0.2,Iris-setosa\n",
"5,3.4,1.6,0.4,Iris-setosa\n",
"5.2,3.5,1.5,0.2,Iris-setosa\n",
"5.2,3.4,1.4,0.2,Iris-setosa\n",
"4.7,3.2,1.6,0.2,Iris-setosa\n",
"4.8,3.1,1.6,0.2,Iris-setosa\n",
"5.4,3.4,1.5,0.4,Iris-setosa\n",
"5.2,4.1,1.5,0.1,Iris-setosa\n",
"5.5,4.2,1.4,0.2,Iris-setosa\n",
"4.9,3.1,1.5,0.1,Iris-setosa\n",
"5,3.2,1.2,0.2,Iris-setosa\n",
"5.5,3.5,1.3,0.2,Iris-setosa\n",
"4.9,3.1,1.5,0.1,Iris-setosa\n",
"4.4,3,1.3,0.2,Iris-setosa\n",
"5.1,3.4,1.5,0.2,Iris-setosa\n",
"5,3.5,1.3,0.3,Iris-setosa\n",
"4.5,2.3,1.3,0.3,Iris-setosa\n",
"4.4,3.2,1.3,0.2,Iris-setosa\n",
"5,3.5,1.6,0.6,Iris-setosa\n",
"5.1,3.8,1.9,0.4,Iris-setosa\n",
"4.8,3,1.4,0.3,Iris-setosa\n",
"5.1,3.8,1.6,0.2,Iris-setosa\n",
"4.6,3.2,1.4,0.2,Iris-setosa\n",
"5.3,3.7,1.5,0.2,Iris-setosa\n",
"5,3.3,1.4,0.2,Iris-setosa\n",
"7,3.2,4.7,1.4,Iris-versicolor\n",
"6.4,3.2,4.5,1.5,Iris-versicolor\n",
"6.9,3.1,4.9,1.5,Iris-versicolor\n",
"5.5,2.3,4,1.3,Iris-versicolor\n",
"6.5,2.8,4.6,1.5,Iris-versicolor\n",
"5.7,2.8,4.5,1.3,Iris-versicolor\n",
"6.3,3.3,4.7,1.6,Iris-versicolor\n",
"4.9,2.4,3.3,1,Iris-versicolor\n",
"6.6,2.9,4.6,1.3,Iris-versicolor\n",
"5.2,2.7,3.9,1.4,Iris-versicolor\n",
"5,2,3.5,1,Iris-versicolor\n",
"5.9,3,4.2,1.5,Iris-versicolor\n",
"6,2.2,4,1,Iris-versicolor\n",
"6.1,2.9,4.7,1.4,Iris-versicolor\n",
"5.6,2.9,3.6,1.3,Iris-versicolor\n",
"6.7,3.1,4.4,1.4,Iris-versicolor\n",
"5.6,3,4.5,1.5,Iris-versicolor\n",
"5.8,2.7,4.1,1,Iris-versicolor\n",
"6.2,2.2,4.5,1.5,Iris-versicolor\n",
"5.6,2.5,3.9,1.1,Iris-versicolor\n",
"5.9,3.2,4.8,1.8,Iris-versicolor\n",
"6.1,2.8,4,1.3,Iris-versicolor\n",
"6.3,2.5,4.9,1.5,Iris-versicolor\n",
"6.1,2.8,4.7,1.2,Iris-versicolor\n",
"6.4,2.9,4.3,1.3,Iris-versicolor\n",
"6.6,3,4.4,1.4,Iris-versicolor\n",
"6.8,2.8,4.8,1.4,Iris-versicolor\n",
"6.7,3,5,1.7,Iris-versicolor\n",
"6,2.9,4.5,1.5,Iris-versicolor\n",
"5.7,2.6,3.5,1,Iris-versicolor\n",
"5.5,2.4,3.8,1.1,Iris-versicolor\n",
"5.5,2.4,3.7,1,Iris-versicolor\n",
"5.8,2.7,3.9,1.2,Iris-versicolor\n",
"6,2.7,5.1,1.6,Iris-versicolor\n",
"5.4,3,4.5,1.5,Iris-versicolor\n",
"6,3.4,4.5,1.6,Iris-versicolor\n",
"6.7,3.1,4.7,1.5,Iris-versicolor\n",
"6.3,2.3,4.4,1.3,Iris-versicolor\n",
"5.6,3,4.1,1.3,Iris-versicolor\n",
"5.5,2.5,4,1.3,Iris-versicolor\n",
"5.5,2.6,4.4,1.2,Iris-versicolor\n",
"6.1,3,4.6,1.4,Iris-versicolor\n",
"5.8,2.6,4,1.2,Iris-versicolor\n",
"5,2.3,3.3,1,Iris-versicolor\n",
"5.6,2.7,4.2,1.3,Iris-versicolor\n",
"5.7,3,4.2,1.2,Iris-versicolor\n",
"5.7,2.9,4.2,1.3,Iris-versicolor\n",
"6.2,2.9,4.3,1.3,Iris-versicolor\n",
"5.1,2.5,3,1.1,Iris-versicolor\n",
"5.7,2.8,4.1,1.3,Iris-versicolor\n",
"6.3,3.3,6,2.5,Iris-virginica\n",
"5.8,2.7,5.1,1.9,Iris-virginica\n",
"7.1,3,5.9,2.1,Iris-virginica\n",
"6.3,2.9,5.6,1.8,Iris-virginica\n",
"6.5,3,5.8,2.2,Iris-virginica\n",
"7.6,3,6.6,2.1,Iris-virginica\n",
"4.9,2.5,4.5,1.7,Iris-virginica\n",
"7.3,2.9,6.3,1.8,Iris-virginica\n",
"6.7,2.5,5.8,1.8,Iris-virginica\n",
"7.2,3.6,6.1,2.5,Iris-virginica\n",
"6.5,3.2,5.1,2,Iris-virginica\n",
"6.4,2.7,5.3,1.9,Iris-virginica\n",
"6.8,3,5.5,2.1,Iris-virginica\n",
"5.7,2.5,5,2,Iris-virginica\n",
"5.8,2.8,5.1,2.4,Iris-virginica\n",
"6.4,3.2,5.3,2.3,Iris-virginica\n",
"6.5,3,5.5,1.8,Iris-virginica\n",
"7.7,3.8,6.7,2.2,Iris-virginica\n",
"7.7,2.6,6.9,2.3,Iris-virginica\n",
"6,2.2,5,1.5,Iris-virginica\n",
"6.9,3.2,5.7,2.3,Iris-virginica\n",
"5.6,2.8,4.9,2,Iris-virginica\n",
"7.7,2.8,6.7,2,Iris-virginica\n",
"6.3,2.7,4.9,1.8,Iris-virginica\n",
"6.7,3.3,5.7,2.1,Iris-virginica\n",
"7.2,3.2,6,1.8,Iris-virginica\n",
"6.2,2.8,4.8,1.8,Iris-virginica\n",
"6.1,3,4.9,1.8,Iris-virginica\n",
"6.4,2.8,5.6,2.1,Iris-virginica\n",
"7.2,3,5.8,1.6,Iris-virginica\n",
"7.4,2.8,6.1,1.9,Iris-virginica\n",
"7.9,3.8,6.4,2,Iris-virginica\n",
"6.4,2.8,5.6,2.2,Iris-virginica\n",
"6.3,2.8,5.1,1.5,Iris-virginica\n",
"6.1,2.6,5.6,1.4,Iris-virginica\n",
"7.7,3,6.1,2.3,Iris-virginica\n",
"6.3,3.4,5.6,2.4,Iris-virginica\n",
"6.4,3.1,5.5,1.8,Iris-virginica\n",
"6,3,4.8,1.8,Iris-virginica\n",
"6.9,3.1,5.4,2.1,Iris-virginica\n",
"6.7,3.1,5.6,2.4,Iris-virginica\n",
"6.9,3.1,5.1,2.3,Iris-virginica\n",
"5.8,2.7,5.1,1.9,Iris-virginica\n",
"6.8,3.2,5.9,2.3,Iris-virginica\n",
"6.7,3.3,5.7,2.5,Iris-virginica\n",
"6.7,3,5.2,2.3,Iris-virginica\n",
"6.3,2.5,5,1.9,Iris-virginica\n",
"6.5,3,5.2,2,Iris-virginica\n",
"6.2,3.4,5.4,2.3,Iris-virginica\n",
"5.9,3,5.1,1.8,Iris-virginica\n"
]
}
],
"source": [
"print(data)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "ded59d25-c34c-4fb8-a35f-1162f1218414",
"metadata": {},
"outputs": [],
"source": [
"from weka.classifiers import Classifier\n",
"cls = Classifier(classname=\"weka.classifiers.trees.J48\", options=[\"-C\", \"0.3\"])"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "4c82f2ae-4071-4571-9a19-433b98463143",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['-C', '0.3', '-M', '2']\n"
]
}
],
"source": [
"print(cls.options)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "4c5c7893-ebbe-407d-872c-fd0bf41f8dc8",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"weka.classifiers.trees.J48 -C 0.3 -M 2\n"
]
}
],
"source": [
"print(cls.to_commandline())"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "7b73c18d-e0b0-469d-8a60-03bae8e01128",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1: label index=0.0, class distribution=[0.99688403 0.00188598 0.00122999]\n",
"2: label index=0.0, class distribution=[0.99487322 0.00310305 0.00202373]\n",
"3: label index=0.0, class distribution=[0.99487322 0.00310305 0.00202373]\n",
"4: label index=0.0, class distribution=[0.99487322 0.00310305 0.00202373]\n",
"5: label index=0.0, class distribution=[0.99688403 0.00188598 0.00122999]\n",
"6: label index=0.0, class distribution=[0.99688403 0.00188598 0.00122999]\n",
"7: label index=0.0, class distribution=[0.99688403 0.00188598 0.00122999]\n",
"8: label index=0.0, class distribution=[0.99688403 0.00188598 0.00122999]\n",
"9: label index=0.0, class distribution=[0.96326708 0.02223308 0.01449983]\n",
"10: label index=0.0, class distribution=[0.99487322 0.00310305 0.00202373]\n",
"11: label index=0.0, class distribution=[0.99688403 0.00188598 0.00122999]\n",
"12: label index=0.0, class distribution=[0.99688403 0.00188598 0.00122999]\n",
"13: label index=0.0, class distribution=[0.99487322 0.00310305 0.00202373]\n",
"14: label index=0.0, class distribution=[0.99487322 0.00310305 0.00202373]\n",
"15: label index=0.0, class distribution=[0.9382677 0.03162683 0.03010547]\n",
"16: label index=0.0, class distribution=[0.9382677 0.03162683 0.03010547]\n",
"17: label index=0.0, class distribution=[0.99688403 0.00188598 0.00122999]\n",
"18: label index=0.0, class distribution=[0.99688403 0.00188598 0.00122999]\n",
"19: label index=0.0, class distribution=[0.9382677 0.03162683 0.03010547]\n",
"20: label index=0.0, class distribution=[0.99688403 0.00188598 0.00122999]\n",
"21: label index=0.0, class distribution=[0.99688403 0.00188598 0.00122999]\n",
"22: label index=0.0, class distribution=[0.99688403 0.00188598 0.00122999]\n",
"23: label index=0.0, class distribution=[0.99688403 0.00188598 0.00122999]\n",
"24: label index=0.0, class distribution=[0.99487322 0.00310305 0.00202373]\n",
"25: label index=0.0, class distribution=[0.99688403 0.00188598 0.00122999]\n",
"26: label index=0.0, class distribution=[0.99487322 0.00310305 0.00202373]\n",
"27: label index=0.0, class distribution=[0.99688403 0.00188598 0.00122999]\n",
"28: label index=0.0, class distribution=[0.99688403 0.00188598 0.00122999]\n",
"29: label index=0.0, class distribution=[0.99688403 0.00188598 0.00122999]\n",
"30: label index=0.0, class distribution=[0.99487322 0.00310305 0.00202373]\n",
"31: label index=0.0, class distribution=[0.99487322 0.00310305 0.00202373]\n",
"32: label index=0.0, class distribution=[0.99688403 0.00188598 0.00122999]\n",
"33: label index=0.0, class distribution=[0.99688403 0.00188598 0.00122999]\n",
"34: label index=0.0, class distribution=[0.99688403 0.00188598 0.00122999]\n",
"35: label index=0.0, class distribution=[0.99487322 0.00310305 0.00202373]\n",
"36: label index=0.0, class distribution=[0.99487322 0.00310305 0.00202373]\n",
"37: label index=0.0, class distribution=[0.99688403 0.00188598 0.00122999]\n",
"38: label index=0.0, class distribution=[0.99487322 0.00310305 0.00202373]\n",
"39: label index=0.0, class distribution=[0.99487322 0.00310305 0.00202373]\n",
"40: label index=0.0, class distribution=[0.99688403 0.00188598 0.00122999]\n",
"41: label index=0.0, class distribution=[0.99688403 0.00188598 0.00122999]\n",
"42: label index=0.0, class distribution=[0.96326708 0.02223308 0.01449983]\n",
"43: label index=0.0, class distribution=[0.99487322 0.00310305 0.00202373]\n",
"44: label index=0.0, class distribution=[0.99688403 0.00188598 0.00122999]\n",
"45: label index=0.0, class distribution=[0.99688403 0.00188598 0.00122999]\n",
"46: label index=0.0, class distribution=[0.99487322 0.00310305 0.00202373]\n",
"47: label index=0.0, class distribution=[0.99688403 0.00188598 0.00122999]\n",
"48: label index=0.0, class distribution=[0.99487322 0.00310305 0.00202373]\n",
"49: label index=0.0, class distribution=[0.99688403 0.00188598 0.00122999]\n",
"50: label index=0.0, class distribution=[0.99487322 0.00310305 0.00202373]\n",
"51: label index=1.0, class distribution=[0.00545355 0.97466198 0.01988447]\n",
"52: label index=1.0, class distribution=[0.00545355 0.97466198 0.01988447]\n",
"53: label index=1.0, class distribution=[0.010867 0.52425197 0.46488102]\n",
"54: label index=1.0, class distribution=[0.00725727 0.94287877 0.04986396]\n",
"55: label index=1.0, class distribution=[0.00228744 0.97269152 0.02502104]\n",
"56: label index=1.0, class distribution=[0.00308382 0.98338244 0.01353374]\n",
"57: label index=1.0, class distribution=[0.00545355 0.97466198 0.01988447]\n",
"58: label index=1.0, class distribution=[0.00725727 0.94287877 0.04986396]\n",
"59: label index=1.0, class distribution=[0.00228744 0.97269152 0.02502104]\n",
"60: label index=1.0, class distribution=[0.00725727 0.94287877 0.04986396]\n",
"61: label index=1.0, class distribution=[0.00725727 0.94287877 0.04986396]\n",
"62: label index=1.0, class distribution=[0.00732671 0.98195521 0.01071808]\n",
"63: label index=1.0, class distribution=[0.00308382 0.98338244 0.01353374]\n",
"64: label index=1.0, class distribution=[0.00308382 0.98338244 0.01353374]\n",
"65: label index=1.0, class distribution=[0.00308382 0.98338244 0.01353374]\n",
"66: label index=1.0, class distribution=[0.00545355 0.97466198 0.01988447]\n",
"67: label index=1.0, class distribution=[0.00732671 0.98195521 0.01071808]\n",
"68: label index=1.0, class distribution=[0.00308382 0.98338244 0.01353374]\n",
"69: label index=1.0, class distribution=[0.00228744 0.97269152 0.02502104]\n",
"70: label index=1.0, class distribution=[0.00308382 0.98338244 0.01353374]\n",
"71: label index=2.0, class distribution=[0.00920087 0.06127297 0.92952615]\n",
"72: label index=1.0, class distribution=[0.00308382 0.98338244 0.01353374]\n",
"73: label index=2.0, class distribution=[0.00409632 0.47019227 0.5257114 ]\n",
"74: label index=1.0, class distribution=[0.00308382 0.98338244 0.01353374]\n",
"75: label index=1.0, class distribution=[0.00228744 0.97269152 0.02502104]\n",
"76: label index=1.0, class distribution=[0.00545355 0.97466198 0.01988447]\n",
"77: label index=2.0, class distribution=[0.00409632 0.47019227 0.5257114 ]\n",
"78: label index=1.0, class distribution=[0.010867 0.52425197 0.46488102]\n",
"79: label index=1.0, class distribution=[0.00308382 0.98338244 0.01353374]\n",
"80: label index=1.0, class distribution=[0.00308382 0.98338244 0.01353374]\n",
"81: label index=1.0, class distribution=[0.00725727 0.94287877 0.04986396]\n",
"82: label index=1.0, class distribution=[0.00725727 0.94287877 0.04986396]\n",
"83: label index=1.0, class distribution=[0.00308382 0.98338244 0.01353374]\n",
"84: label index=1.0, class distribution=[0.02353491 0.65433551 0.32212958]\n",
"85: label index=1.0, class distribution=[0.01727259 0.943168 0.03955941]\n",
"86: label index=1.0, class distribution=[0.06513736 0.90310001 0.03176263]\n",
"87: label index=1.0, class distribution=[0.00545355 0.97466198 0.01988447]\n",
"88: label index=1.0, class distribution=[0.00228744 0.97269152 0.02502104]\n",
"89: label index=1.0, class distribution=[0.00732671 0.98195521 0.01071808]\n",
"90: label index=1.0, class distribution=[0.00725727 0.94287877 0.04986396]\n",
"91: label index=1.0, class distribution=[0.00725727 0.94287877 0.04986396]\n",
"92: label index=1.0, class distribution=[0.00732671 0.98195521 0.01071808]\n",
"93: label index=1.0, class distribution=[0.00308382 0.98338244 0.01353374]\n",
"94: label index=1.0, class distribution=[0.00725727 0.94287877 0.04986396]\n",
"95: label index=1.0, class distribution=[0.00308382 0.98338244 0.01353374]\n",
"96: label index=1.0, class distribution=[0.00732671 0.98195521 0.01071808]\n",
"97: label index=1.0, class distribution=[0.00308382 0.98338244 0.01353374]\n",
"98: label index=1.0, class distribution=[0.00228744 0.97269152 0.02502104]\n",
"99: label index=1.0, class distribution=[0.00725727 0.94287877 0.04986396]\n",
"100: label index=1.0, class distribution=[0.00308382 0.98338244 0.01353374]\n",
"101: label index=2.0, class distribution=[0.00102485 0.02817698 0.97079816]\n",
"102: label index=2.0, class distribution=[0.01274667 0.02829538 0.95895795]\n",
"103: label index=2.0, class distribution=[0.00102485 0.02817698 0.97079816]\n",
"104: label index=2.0, class distribution=[0.00139749 0.01280739 0.98579512]\n",
"105: label index=2.0, class distribution=[0.00102485 0.02817698 0.97079816]\n",
"106: label index=2.0, class distribution=[0.00102485 0.02817698 0.97079816]\n",
"107: label index=1.0, class distribution=[0.00725727 0.94287877 0.04986396]\n",
"108: label index=2.0, class distribution=[0.00139749 0.01280739 0.98579512]\n",
"109: label index=2.0, class distribution=[0.00139749 0.01280739 0.98579512]\n",
"110: label index=2.0, class distribution=[0.00431289 0.0395258 0.95616131]\n",
"111: label index=2.0, class distribution=[0.00102485 0.02817698 0.97079816]\n",
"112: label index=2.0, class distribution=[0.00139749 0.01280739 0.98579512]\n",
"113: label index=2.0, class distribution=[0.00102485 0.02817698 0.97079816]\n",
"114: label index=2.0, class distribution=[0.01274667 0.02829538 0.95895795]\n",
"115: label index=2.0, class distribution=[0.01274667 0.02829538 0.95895795]\n",
"116: label index=2.0, class distribution=[0.00102485 0.02817698 0.97079816]\n",
"117: label index=2.0, class distribution=[0.00102485 0.02817698 0.97079816]\n",
"118: label index=2.0, class distribution=[0.00431289 0.0395258 0.95616131]\n",
"119: label index=2.0, class distribution=[0.00139749 0.01280739 0.98579512]\n",
"120: label index=1.0, class distribution=[0.02353491 0.65433551 0.32212958]\n",
"121: label index=2.0, class distribution=[0.00102485 0.02817698 0.97079816]\n",
"122: label index=2.0, class distribution=[0.01274667 0.02829538 0.95895795]\n",
"123: label index=2.0, class distribution=[0.00139749 0.01280739 0.98579512]\n",
"124: label index=2.0, class distribution=[0.00139749 0.01280739 0.98579512]\n",
"125: label index=2.0, class distribution=[0.00102485 0.02817698 0.97079816]\n",
"126: label index=2.0, class distribution=[0.00102485 0.02817698 0.97079816]\n",
"127: label index=2.0, class distribution=[0.00139749 0.01280739 0.98579512]\n",
"128: label index=2.0, class distribution=[0.00920087 0.06127297 0.92952615]\n",
"129: label index=2.0, class distribution=[0.00139749 0.01280739 0.98579512]\n",
"130: label index=1.0, class distribution=[0.010867 0.52425197 0.46488102]\n",
"131: label index=2.0, class distribution=[0.00139749 0.01280739 0.98579512]\n",
"132: label index=2.0, class distribution=[0.00431289 0.0395258 0.95616131]\n",
"133: label index=2.0, class distribution=[0.00139749 0.01280739 0.98579512]\n",
"134: label index=2.0, class distribution=[0.00409632 0.47019227 0.5257114 ]\n",
"135: label index=1.0, class distribution=[0.02353491 0.65433551 0.32212958]\n",
"136: label index=2.0, class distribution=[0.00102485 0.02817698 0.97079816]\n",
"137: label index=2.0, class distribution=[0.00431289 0.0395258 0.95616131]\n",
"138: label index=2.0, class distribution=[0.00102485 0.02817698 0.97079816]\n",
"139: label index=2.0, class distribution=[0.00920087 0.06127297 0.92952615]\n",
"140: label index=2.0, class distribution=[0.00102485 0.02817698 0.97079816]\n",
"141: label index=2.0, class distribution=[0.00102485 0.02817698 0.97079816]\n",
"142: label index=2.0, class distribution=[0.00102485 0.02817698 0.97079816]\n",
"143: label index=2.0, class distribution=[0.01274667 0.02829538 0.95895795]\n",
"144: label index=2.0, class distribution=[0.00102485 0.02817698 0.97079816]\n",
"145: label index=2.0, class distribution=[0.00102485 0.02817698 0.97079816]\n",
"146: label index=2.0, class distribution=[0.00102485 0.02817698 0.97079816]\n",
"147: label index=2.0, class distribution=[0.00139749 0.01280739 0.98579512]\n",
"148: label index=2.0, class distribution=[0.00102485 0.02817698 0.97079816]\n",
"149: label index=2.0, class distribution=[0.00431289 0.0395258 0.95616131]\n",
"150: label index=2.0, class distribution=[0.00920087 0.06127297 0.92952615]\n"
]
}
],
"source": [
"from weka.classifiers import Classifier\n",
"cls = Classifier(classname=\"weka.classifiers.bayes.BayesNet\", options=[\"-Q\", \"weka.classifiers.bayes.net.search.local.TAN\"])\n",
"cls.build_classifier(data)\n",
"\n",
"for index, inst in enumerate(data):\n",
" pred = cls.classify_instance(inst)\n",
" dist = cls.distribution_for_instance(inst)\n",
" print(str(index+1) + \": label index=\" + str(pred) + \", class distribution=\" + str(dist))"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "0b74f00a-15b3-4177-bb8c-e02ed1a3fd38",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Bayes Network Classifier\n",
"Using ADTree\n",
"#attributes=5 #classindex=4\n",
"Network structure (nodes followed by parents)\n",
"sepallength(3): class \n",
"sepalwidth(3): class petalwidth \n",
"petallength(3): class sepallength \n",
"petalwidth(3): class petallength \n",
"class(3): \n",
"LogScore Bayes: -484.0749140715054\n",
"LogScore BDeu: -653.8524681760015\n",
"LogScore MDL: -654.6252712234647\n",
"LogScore ENTROPY: -499.2955771064808\n",
"LogScore AIC: -561.2955771064808\n",
"\n"
]
},
{
"ename": "OSError",
"evalue": "[Errno 63] File name too long: '<?xml version=\"1.0\"?>\\n<!-- DTD for the XMLBIF 0.3 format -->\\n<!DOCTYPE BIF [\\n\\t<!ELEMENT BIF ( NETWORK )*>\\n\\t <!ATTLIST BIF VERSION CDATA #REQUIRED>\\n\\t<!ELEMENT NETWORK ( NAME, ( PROPERTY | VARIABLE | DEFINITION )* )>\\n\\t<!ELEMENT NAME (#PCDATA)>\\n\\t<!ELEMENT VARIABLE ( NAME, ( OUTCOME | PROPERTY )* ) >\\n\\t <!ATTLIST VARIABLE TYPE (nature|decision|utility) \"nature\">\\n\\t<!ELEMENT OUTCOME (#PCDATA)>\\n\\t<!ELEMENT DEFINITION ( FOR | GIVEN | TABLE | PROPERTY )* >\\n\\t<!ELEMENT FOR (#PCDATA)>\\n\\t<!ELEMENT GIVEN (#PCDATA)>\\n\\t<!ELEMENT TABLE (#PCDATA)>\\n\\t<!ELEMENT PROPERTY (#PCDATA)>\\n]>\\n\\n\\n<BIF VERSION=\"0.3\">\\n<NETWORK>\\n<NAME>iris-weka.filters.supervised.attribute.Discretize-Rfirst-last-precision6-weka.filters.unsupervised.attribute.ReplaceMissingValues</NAME>\\n<VARIABLE TYPE=\"nature\">\\n<NAME>sepallength</NAME>\\n<OUTCOME>&apos;\\\\&apos;(-inf-5.55]\\\\&apos;&apos;</OUTCOME>\\n<OUTCOME>&apos;\\\\&apos;(5.55-6.15]\\\\&apos;&apos;</OUTCOME>\\n<OUTCOME>&apos;\\\\&apos;(6.15-inf)\\\\&apos;&apos;</OUTCOME>\\n</VARIABLE>\\n<VARIABLE TYPE=\"nature\">\\n<NAME>sepalwidth</NAME>\\n<OUTCOME>&apos;\\\\&apos;(-inf-2.95]\\\\&apos;&apos;</OUTCOME>\\n<OUTCOME>&apos;\\\\&apos;(2.95-3.35]\\\\&apos;&apos;</OUTCOME>\\n<OUTCOME>&apos;\\\\&apos;(3.35-inf)\\\\&apos;&apos;</OUTCOME>\\n</VARIABLE>\\n<VARIABLE TYPE=\"nature\">\\n<NAME>petallength</NAME>\\n<OUTCOME>&apos;\\\\&apos;(-inf-2.45]\\\\&apos;&apos;</OUTCOME>\\n<OUTCOME>&apos;\\\\&apos;(2.45-4.75]\\\\&apos;&apos;</OUTCOME>\\n<OUTCOME>&apos;\\\\&apos;(4.75-inf)\\\\&apos;&apos;</OUTCOME>\\n</VARIABLE>\\n<VARIABLE TYPE=\"nature\">\\n<NAME>petalwidth</NAME>\\n<OUTCOME>&apos;\\\\&apos;(-inf-0.8]\\\\&apos;&apos;</OUTCOME>\\n<OUTCOME>&apos;\\\\&apos;(0.8-1.75]\\\\&apos;&apos;</OUTCOME>\\n<OUTCOME>&apos;\\\\&apos;(1.75-inf)\\\\&apos;&apos;</OUTCOME>\\n</VARIABLE>\\n<VARIABLE 
TYPE=\"nature\">\\n<NAME>class</NAME>\\n<OUTCOME>Iris-setosa</OUTCOME>\\n<OUTCOME>Iris-versicolor</OUTCOME>\\n<OUTCOME>Iris-virginica</OUTCOME>\\n</VARIABLE>\\n<DEFINITION>\\n<FOR>sepallength</FOR>\\n<GIVEN>class</GIVEN>\\n<TABLE>\\n0.9223300970873787 0.06796116504854369 0.009708737864077669 \\n0.22330097087378642 0.4563106796116505 0.32038834951456313 \\n0.02912621359223301 0.20388349514563106 0.7669902912621359 \\n</TABLE>\\n</DEFINITION>\\n<DEFINITION>\\n<FOR>sepalwidth</FOR>\\n<GIVEN>class</GIVEN>\\n<GIVEN>petalwidth</GIVEN>\\n<TABLE>\\n0.04854368932038835 0.3592233009708738 0.5922330097087378 \\n0.3333333333333333 0.3333333333333333 0.3333333333333333 \\n0.3333333333333333 0.3333333333333333 0.3333333333333333 \\n0.3333333333333333 0.3333333333333333 0.3333333333333333 \\n0.6831683168316832 0.2871287128712871 0.0297029702970297 \\n0.2 0.6 0.2 \\n0.3333333333333333 0.3333333333333333 0.3333333333333333 \\n0.6923076923076923 0.23076923076923078 0.07692307692307693 \\n0.3763440860215054 0.5053763440860215 0.11827956989247312 \\n</TABLE>\\n</DEFINITION>\\n<DEFINITION>\\n<FOR>petallength</FOR>\\n<GIVEN>class</GIVEN>\\n<GIVEN>sepallength</GIVEN>\\n<TABLE>\\n0.979381443298969 0.010309278350515464 0.010309278350515464 \\n0.7777777777777778 0.1111111111111111 0.1111111111111111 \\n0.3333333333333333 0.3333333333333333 0.3333333333333333 \\n0.04 0.92 0.04 \\n0.02040816326530612 0.8775510204081632 0.10204081632653061 \\n0.02857142857142857 0.7142857142857143 0.2571428571428571 \\n0.2 0.6 0.2 \\n0.043478260869565216 0.043478260869565216 0.9130434782608695 \\n0.012345679012345678 0.012345679012345678 0.9753086419753086 \\n</TABLE>\\n</DEFINITION>\\n<DEFINITION>\\n<FOR>petalwidth</FOR>\\n<GIVEN>class</GIVEN>\\n<GIVEN>petallength</GIVEN>\\n<TABLE>\\n0.9805825242718447 0.009708737864077669 0.009708737864077669 \\n0.3333333333333333 0.3333333333333333 0.3333333333333333 \\n0.3333333333333333 0.3333333333333333 0.3333333333333333 \\n0.3333333333333333 0.3333333333333333 
0.3333333333333333 \\n0.01098901098901099 0.978021978021978 0.01098901098901099 \\n0.06666666666666667 0.7333333333333333 0.2 \\n0.3333333333333333 0.3333333333333333 0.3333333333333333 \\n0.2 0.6 0.2 \\n0.009900990099009901 0.0891089108910891 0.900990099009901 \\n</TABLE>\\n</DEFINITION>\\n<DEFINITION>\\n<FOR>class</FOR>\\n<TABLE>\\n0.3333333333333333 0.3333333333333333 0.3333333333333333 \\n</TABLE>\\n</DEFINITION>\\n</NETWORK>\\n</BIF>\\n'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mOSError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn [13], line 9\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;28mcls\u001b[39m)\n\u001b[1;32m 8\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mweka\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mplot\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mgraph\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mgraph\u001b[39;00m \u001b[38;5;66;03m# NB: pygraphviz and PIL are required\u001b[39;00m\n\u001b[0;32m----> 9\u001b[0m \u001b[43mgraph\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mplot_dot_graph\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mcls\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgraph\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m~/miniconda3/envs/pyweka/lib/python3.10/site-packages/weka/plot/graph.py:49\u001b[0m, in \u001b[0;36mplot_dot_graph\u001b[0;34m(graph, filename)\u001b[0m\n\u001b[1;32m 46\u001b[0m logger\u001b[38;5;241m.\u001b[39merror(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPIL is not installed, cannot display graph plot!\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 47\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m\n\u001b[0;32m---> 49\u001b[0m agraph \u001b[38;5;241m=\u001b[39m \u001b[43mAGraph\u001b[49m\u001b[43m(\u001b[49m\u001b[43mgraph\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 50\u001b[0m agraph\u001b[38;5;241m.\u001b[39mlayout(prog\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdot\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 51\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m filename \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n",
"File \u001b[0;32m~/miniconda3/envs/pyweka/lib/python3.10/site-packages/pygraphviz/agraph.py:157\u001b[0m, in \u001b[0;36mAGraph.__init__\u001b[0;34m(self, thing, filename, data, string, handle, name, strict, directed, **attr)\u001b[0m\n\u001b[1;32m 154\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_owns_handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[1;32m 155\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m filename \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 156\u001b[0m \u001b[38;5;66;03m# load new graph from file (creates self.handle)\u001b[39;00m\n\u001b[0;32m--> 157\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilename\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 158\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m string \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 159\u001b[0m \u001b[38;5;66;03m# load new graph from string (creates self.handle)\u001b[39;00m\n\u001b[1;32m 160\u001b[0m \u001b[38;5;66;03m# get the charset from the string to properly encode it for\u001b[39;00m\n\u001b[1;32m 161\u001b[0m \u001b[38;5;66;03m# writing to the temporary file in from_string()\u001b[39;00m\n\u001b[1;32m 162\u001b[0m match \u001b[38;5;241m=\u001b[39m re\u001b[38;5;241m.\u001b[39msearch(\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcharset\u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124ms*=\u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124ms*\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m([^\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m]+)\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m'\u001b[39m, string)\n",
"File \u001b[0;32m~/miniconda3/envs/pyweka/lib/python3.10/site-packages/pygraphviz/agraph.py:1243\u001b[0m, in \u001b[0;36mAGraph.read\u001b[0;34m(self, path)\u001b[0m\n\u001b[1;32m 1233\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mread\u001b[39m(\u001b[38;5;28mself\u001b[39m, path):\n\u001b[1;32m 1234\u001b[0m \u001b[38;5;124;03m\"\"\"Read graph from dot format file on path.\u001b[39;00m\n\u001b[1;32m 1235\u001b[0m \n\u001b[1;32m 1236\u001b[0m \u001b[38;5;124;03m path can be a file name or file handle\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1241\u001b[0m \n\u001b[1;32m 1242\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m-> 1243\u001b[0m fh \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_get_fh\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpath\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1244\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1245\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_close_handle()\n",
"File \u001b[0;32m~/miniconda3/envs/pyweka/lib/python3.10/site-packages/pygraphviz/agraph.py:1791\u001b[0m, in \u001b[0;36mAGraph._get_fh\u001b[0;34m(self, path, mode)\u001b[0m\n\u001b[1;32m 1789\u001b[0m fh \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39mpopen(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbzcat \u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m+\u001b[39m path) \u001b[38;5;66;03m# probably not portable\u001b[39;00m\n\u001b[1;32m 1790\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1791\u001b[0m fh \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmode\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1792\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(path, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mwrite\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m 1793\u001b[0m \u001b[38;5;66;03m# Note, mode of file handle is unchanged.\u001b[39;00m\n\u001b[1;32m 1794\u001b[0m fh \u001b[38;5;241m=\u001b[39m path\n",
"\u001b[0;31mOSError\u001b[0m: [Errno 63] File name too long: '<?xml version=\"1.0\"?>\\n<!-- DTD for the XMLBIF 0.3 format -->\\n<!DOCTYPE BIF [\\n\\t<!ELEMENT BIF ( NETWORK )*>\\n\\t <!ATTLIST BIF VERSION CDATA #REQUIRED>\\n\\t<!ELEMENT NETWORK ( NAME, ( PROPERTY | VARIABLE | DEFINITION )* )>\\n\\t<!ELEMENT NAME (#PCDATA)>\\n\\t<!ELEMENT VARIABLE ( NAME, ( OUTCOME | PROPERTY )* ) >\\n\\t <!ATTLIST VARIABLE TYPE (nature|decision|utility) \"nature\">\\n\\t<!ELEMENT OUTCOME (#PCDATA)>\\n\\t<!ELEMENT DEFINITION ( FOR | GIVEN | TABLE | PROPERTY )* >\\n\\t<!ELEMENT FOR (#PCDATA)>\\n\\t<!ELEMENT GIVEN (#PCDATA)>\\n\\t<!ELEMENT TABLE (#PCDATA)>\\n\\t<!ELEMENT PROPERTY (#PCDATA)>\\n]>\\n\\n\\n<BIF VERSION=\"0.3\">\\n<NETWORK>\\n<NAME>iris-weka.filters.supervised.attribute.Discretize-Rfirst-last-precision6-weka.filters.unsupervised.attribute.ReplaceMissingValues</NAME>\\n<VARIABLE TYPE=\"nature\">\\n<NAME>sepallength</NAME>\\n<OUTCOME>&apos;\\\\&apos;(-inf-5.55]\\\\&apos;&apos;</OUTCOME>\\n<OUTCOME>&apos;\\\\&apos;(5.55-6.15]\\\\&apos;&apos;</OUTCOME>\\n<OUTCOME>&apos;\\\\&apos;(6.15-inf)\\\\&apos;&apos;</OUTCOME>\\n</VARIABLE>\\n<VARIABLE TYPE=\"nature\">\\n<NAME>sepalwidth</NAME>\\n<OUTCOME>&apos;\\\\&apos;(-inf-2.95]\\\\&apos;&apos;</OUTCOME>\\n<OUTCOME>&apos;\\\\&apos;(2.95-3.35]\\\\&apos;&apos;</OUTCOME>\\n<OUTCOME>&apos;\\\\&apos;(3.35-inf)\\\\&apos;&apos;</OUTCOME>\\n</VARIABLE>\\n<VARIABLE TYPE=\"nature\">\\n<NAME>petallength</NAME>\\n<OUTCOME>&apos;\\\\&apos;(-inf-2.45]\\\\&apos;&apos;</OUTCOME>\\n<OUTCOME>&apos;\\\\&apos;(2.45-4.75]\\\\&apos;&apos;</OUTCOME>\\n<OUTCOME>&apos;\\\\&apos;(4.75-inf)\\\\&apos;&apos;</OUTCOME>\\n</VARIABLE>\\n<VARIABLE TYPE=\"nature\">\\n<NAME>petalwidth</NAME>\\n<OUTCOME>&apos;\\\\&apos;(-inf-0.8]\\\\&apos;&apos;</OUTCOME>\\n<OUTCOME>&apos;\\\\&apos;(0.8-1.75]\\\\&apos;&apos;</OUTCOME>\\n<OUTCOME>&apos;\\\\&apos;(1.75-inf)\\\\&apos;&apos;</OUTCOME>\\n</VARIABLE>\\n<VARIABLE 
TYPE=\"nature\">\\n<NAME>class</NAME>\\n<OUTCOME>Iris-setosa</OUTCOME>\\n<OUTCOME>Iris-versicolor</OUTCOME>\\n<OUTCOME>Iris-virginica</OUTCOME>\\n</VARIABLE>\\n<DEFINITION>\\n<FOR>sepallength</FOR>\\n<GIVEN>class</GIVEN>\\n<TABLE>\\n0.9223300970873787 0.06796116504854369 0.009708737864077669 \\n0.22330097087378642 0.4563106796116505 0.32038834951456313 \\n0.02912621359223301 0.20388349514563106 0.7669902912621359 \\n</TABLE>\\n</DEFINITION>\\n<DEFINITION>\\n<FOR>sepalwidth</FOR>\\n<GIVEN>class</GIVEN>\\n<GIVEN>petalwidth</GIVEN>\\n<TABLE>\\n0.04854368932038835 0.3592233009708738 0.5922330097087378 \\n0.3333333333333333 0.3333333333333333 0.3333333333333333 \\n0.3333333333333333 0.3333333333333333 0.3333333333333333 \\n0.3333333333333333 0.3333333333333333 0.3333333333333333 \\n0.6831683168316832 0.2871287128712871 0.0297029702970297 \\n0.2 0.6 0.2 \\n0.3333333333333333 0.3333333333333333 0.3333333333333333 \\n0.6923076923076923 0.23076923076923078 0.07692307692307693 \\n0.3763440860215054 0.5053763440860215 0.11827956989247312 \\n</TABLE>\\n</DEFINITION>\\n<DEFINITION>\\n<FOR>petallength</FOR>\\n<GIVEN>class</GIVEN>\\n<GIVEN>sepallength</GIVEN>\\n<TABLE>\\n0.979381443298969 0.010309278350515464 0.010309278350515464 \\n0.7777777777777778 0.1111111111111111 0.1111111111111111 \\n0.3333333333333333 0.3333333333333333 0.3333333333333333 \\n0.04 0.92 0.04 \\n0.02040816326530612 0.8775510204081632 0.10204081632653061 \\n0.02857142857142857 0.7142857142857143 0.2571428571428571 \\n0.2 0.6 0.2 \\n0.043478260869565216 0.043478260869565216 0.9130434782608695 \\n0.012345679012345678 0.012345679012345678 0.9753086419753086 \\n</TABLE>\\n</DEFINITION>\\n<DEFINITION>\\n<FOR>petalwidth</FOR>\\n<GIVEN>class</GIVEN>\\n<GIVEN>petallength</GIVEN>\\n<TABLE>\\n0.9805825242718447 0.009708737864077669 0.009708737864077669 \\n0.3333333333333333 0.3333333333333333 0.3333333333333333 \\n0.3333333333333333 0.3333333333333333 0.3333333333333333 \\n0.3333333333333333 0.3333333333333333 
0.3333333333333333 \\n0.01098901098901099 0.978021978021978 0.01098901098901099 \\n0.06666666666666667 0.7333333333333333 0.2 \\n0.3333333333333333 0.3333333333333333 0.3333333333333333 \\n0.2 0.6 0.2 \\n0.009900990099009901 0.0891089108910891 0.900990099009901 \\n</TABLE>\\n</DEFINITION>\\n<DEFINITION>\\n<FOR>class</FOR>\\n<TABLE>\\n0.3333333333333333 0.3333333333333333 0.3333333333333333 \\n</TABLE>\\n</DEFINITION>\\n</NETWORK>\\n</BIF>\\n'"
]
}
],
"source": [
"import xml.etree.ElementTree as ET\n",
"\n",
"from weka.classifiers import Classifier\n",
"import weka.plot.graph as graph # NB: pygraphviz and PIL are required\n",
"\n",
"\n",
"def xmlbif_to_dot(xmlbif):\n",
"    \"\"\"Return a graphviz dot digraph with the structure of an XMLBIF network.\n",
"\n",
"    Every <VARIABLE> becomes a node and every <GIVEN> parent listed in a\n",
"    <DEFINITION> becomes an edge pointing to that definition's <FOR> variable.\n",
"    Probability tables are ignored, only the structure is kept.\n",
"    \"\"\"\n",
"    root = ET.fromstring(xmlbif)\n",
"    lines = [\"digraph BayesNet {\"]\n",
"    # declare the nodes explicitly so that unconnected variables are drawn too\n",
"    for variable in root.iter(\"VARIABLE\"):\n",
"        lines.append(f'  \"{variable.findtext(\"NAME\")}\";')\n",
"    for definition in root.iter(\"DEFINITION\"):\n",
"        child = definition.findtext(\"FOR\")\n",
"        for parent in definition.iter(\"GIVEN\"):\n",
"            lines.append(f'  \"{parent.text}\" -> \"{child}\";')\n",
"    lines.append(\"}\")\n",
"    return \"\\n\".join(lines)\n",
"\n",
"\n",
"cls = Classifier(classname=\"weka.classifiers.bayes.BayesNet\", options=[\"-Q\", \"weka.classifiers.bayes.net.search.local.TAN\"])\n",
"cls.build_classifier(data)\n",
"\n",
"print(cls)\n",
"\n",
"# BayesNet returns its graph as XMLBIF, not dot. Passing it straight to\n",
"# plot_dot_graph makes pygraphviz treat the XML text as a file name\n",
"# (OSError: File name too long), so convert it to dot first.\n",
"graph.plot_dot_graph(xmlbif_to_dot(cls.graph))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3f59f200-4f23-4add-86ae-6df1494ede82",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}