Using ml_utils functions from the jmspack package

Demonstrating the usage of the following ml_utils functions and objects:

  • plot_decision_boundary()

  • plot_cv_indices()

  • plot_learning_curve()

  • dict_of_models

  • multi_roc_auc_plot()

  • optimize_model()

  • plot_confusion_matrix()

  • summary_performance_metrics_classification()

[1]:
# Change to the repository root so the local jmspack package is importable
import os
tmp = os.getcwd()
os.chdir(tmp.split("jmspack")[0] + "jmspack")
[2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from jmspack.ml_utils import (
    plot_decision_boundary,
    plot_cv_indices,
    plot_learning_curve,
    dict_of_models,
    multi_roc_auc_plot,
    optimize_model,
    plot_confusion_matrix,
    summary_performance_metrics_classification,
)

from jmspack.utils import JmsColors
from sklearn.model_selection import (
    TimeSeriesSplit,
    KFold,
    ShuffleSplit,
    StratifiedKFold,
    GroupShuffleSplit,
    GroupKFold,
    StratifiedShuffleSplit
)
[3]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
[4]:
# Return to the original working directory
os.chdir(tmp)
[5]:
if "jms_style_sheet" in plt.style.available:
    _ = plt.style.use("jms_style_sheet")
[6]:
df = sns.load_dataset("iris")
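
For reference, iris contains 150 samples, four numeric features, and a three-level species target; a quick sanity check (not part of the original notebook):

# Quick look at the dataset dimensions and target levels
print(df.shape)                   # (150, 5)
print(df["species"].unique())     # ['setosa' 'versicolor' 'virginica']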

plot_decision_boundary

[7]:
# Use only the first two features so the decision boundary can be drawn in 2D
feature_list = df.columns.tolist()[0:2]
target = "species"
X = df[feature_list]
y = df[target]
[8]:
fig, ax = plot_decision_boundary(X=X, y=y)
_images/ml_utils_10_0.png
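
plot_decision_boundary shades the regions a classifier assigns to each class over the two chosen features. As an illustration of the underlying meshgrid-and-predict pattern (a sketch only, not jmspack's actual internals), using a KNN classifier:

# Hypothetical sketch of the meshgrid-and-predict pattern behind such plots;
# the estimator and grid resolution are illustrative assumptions.
from sklearn.neighbors import KNeighborsClassifier

codes = y.astype("category").cat.codes
knn = KNeighborsClassifier(n_neighbors=5).fit(X, codes)

x_min, x_max = X.iloc[:, 0].min() - 0.5, X.iloc[:, 0].max() + 0.5
y_min, y_max = X.iloc[:, 1].min() - 0.5, X.iloc[:, 1].max() + 0.5
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 200),
                     np.linspace(y_min, y_max, 200))
# Predict a class for every grid point and shade the resulting regions
zz = knn.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
plt.contourf(xx, yy, zz, alpha=0.3)
plt.scatter(X.iloc[:, 0], X.iloc[:, 1], c=codes, edgecolor="k")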

plot_cv_indices

[9]:
n_points = X.shape[0]
n_splits = 5
# Random, sorted group labels to demonstrate the group-aware splitters
uneven_groups = np.sort(np.random.randint(0, 10, n_points))
[10]:
cvs = [
    TimeSeriesSplit,
    KFold,
    ShuffleSplit,
    StratifiedKFold,
    GroupShuffleSplit,
    GroupKFold,
    StratifiedShuffleSplit,
]
[11]:
# Visualise the first two splitters (TimeSeriesSplit and KFold)
for cv in cvs[0:2]:
    fig, ax = plot_cv_indices(cv(n_splits), X, y.astype("category").cat.codes, uneven_groups, n_splits)
_images/ml_utils_14_0.png
_images/ml_utils_14_1.png
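
Each row in these plots is one split, with colours marking which samples land in the train or test set (group-aware splitters such as GroupKFold additionally use the groups argument). The same information can be inspected directly from a splitter, for example:

# Print the raw test indices that plot_cv_indices visualises (illustrative only)
for fold, (train_idx, test_idx) in enumerate(KFold(n_splits).split(X, y)):
    print(f"fold {fold}: {len(train_idx)} train / {len(test_idx)} test, "
          f"first test index = {test_idx[0]}")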

plot_learning_curve

[12]:
fig, ax = plot_learning_curve(X=X, y=y.astype("category").cat.codes)
_images/ml_utils_16_0.png
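
A learning curve plots training and cross-validation scores against training-set size, which helps diagnose over- and under-fitting. plot_learning_curve wraps this idea; an equivalent sketch using scikit-learn directly (the estimator and cv settings here are assumptions, not jmspack's defaults):

# Equivalent sketch with scikit-learn's learning_curve
from sklearn.model_selection import learning_curve

sizes, train_scores, valid_scores = learning_curve(
    LogisticRegression(max_iter=1000),
    X,
    y.astype("category").cat.codes,
    cv=5,
    train_sizes=np.linspace(0.1, 1.0, 5),
)
plt.plot(sizes, train_scores.mean(axis=1), label="train")
plt.plot(sizes, valid_scores.mean(axis=1), label="cross-validation")
plt.xlabel("training set size")
plt.ylabel("score")
_ = plt.legend()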

dict_of_models

[13]:
dict_of_models
[13]:
[{'label': 'Logistic Regression', 'model': LogisticRegression()},
 {'label': 'Gradient Boosting', 'model': GradientBoostingClassifier()},
 {'label': 'K_Neighbors Classifier',
  'model': KNeighborsClassifier(n_neighbors=3)},
 {'label': 'SVM Classifier (linear)',
  'model': SVC(C=0.025, kernel='linear', probability=True)},
 {'label': 'SVM Classifier (Radial Basis Function; RBF)',
  'model': SVC(C=1, gamma=2, probability=True)},
 {'label': 'Gaussian Process Classifier',
  'model': GaussianProcessClassifier(kernel=1**2 * RBF(length_scale=1))},
 {'label': 'Decision Tree (depth=5)',
  'model': DecisionTreeClassifier(max_depth=5)},
 {'label': 'Random Forest Classifier(depth=5)',
  'model': RandomForestClassifier(max_depth=5, max_features=1, n_estimators=10)},
 {'label': 'Multilayer Perceptron (MLP) Classifier',
  'model': MLPClassifier(alpha=1, max_iter=1000)},
 {'label': 'AdaBoost Classifier', 'model': AdaBoostClassifier()},
 {'label': 'Naive Bayes (Gaussian) Classifier', 'model': GaussianNB()},
 {'label': 'Quadratic Discriminant Analysis Classifier',
  'model': QuadraticDiscriminantAnalysis()}]
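
dict_of_models is a plain list of dicts with 'label' and 'model' keys, so it can be iterated over directly. For example, a quick cross-validated comparison of all candidates (illustrative, not part of the original notebook):

# Illustrative loop over dict_of_models: cross-validate each candidate model
from sklearn.model_selection import cross_val_score

codes = y.astype("category").cat.codes
for entry in dict_of_models:
    scores = cross_val_score(entry["model"], X, codes, cv=5)
    print(f"{entry['label']}: mean accuracy = {scores.mean():.3f}")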

multi_roc_auc_plot

[14]:
# Restrict to two classes (versicolor and virginica) for a binary ROC analysis
X = df.loc[df[target].isin(df[target].unique().tolist()[1:3]), feature_list]
y = df.loc[df[target].isin(df[target].unique().tolist()[1:3]), target]
[15]:
fig, ax = multi_roc_auc_plot(X=X, y=y.astype("category").cat.codes)
_images/ml_utils_21_0.png
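
Each curve above is an ROC curve: the true-positive rate against the false-positive rate as the decision threshold varies, with the area under it (AUC) summarising discrimination. A single-model sketch with scikit-learn, fit and evaluated on the same data purely for illustration:

# Illustrative single-model ROC curve; multi_roc_auc_plot repeats this idea
# across several classifiers.
from sklearn.metrics import roc_curve, auc

codes = y.astype("category").cat.codes
probs = LogisticRegression(max_iter=1000).fit(X, codes).predict_proba(X)[:, 1]
fpr, tpr, _ = roc_curve(codes, probs)
plt.plot(fpr, tpr, label=f"AUC = {auc(fpr, tpr):.3f}")
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
_ = plt.legend()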

optimize_model

[16]:
# Switch to all four features, keeping the binary class subset
feature_list = df.columns.tolist()[0:4]
X = df.loc[df[target].isin(df[target].unique().tolist()[1:3]), feature_list]
y = df.loc[df[target].isin(df[target].unique().tolist()[1:3]), target].astype("category").cat.codes
[17]:
(
    optimized_estimator,
    feature_ranking,
    feature_selected,
    feature_importance,
    optimal_params,
) = optimize_model(X=X, y=y)
Fitting estimator with 4 features.
Fitting estimator with 3 features.

- Sizes :
- X shape = (100, 4)
- y shape = (100,)
- X_train shape = (70, 4)
- X_test shape = (30, 4)
- y_train shape = (70,)
- y_test shape = (30,)

- Model info :
- Optimal Parameters = {'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 1, 'max_features': 'log2', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 40, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}
- Selected feature list = ['sepal_length', 'petal_length']
- Accuracy score on test set = 86.7%
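
optimize_model returns the fitted estimator alongside the feature-selection and grid-search artefacts. Assuming the returned estimator is already fitted on the selected feature subset (as the printed test-set accuracy suggests; verify against your version of jmspack), it can be reused directly:

# Assumes optimized_estimator was fitted on the selected feature subset
preds = optimized_estimator.predict(X[feature_selected])
print(f"training-set accuracy = {(preds == y).mean():.3f}")
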
[18]:
optimal_params
[18]:
                          optimal_parameters
bootstrap                               True
ccp_alpha                                0.0
class_weight                            None
criterion                               gini
max_depth                                  1
max_features                            log2
max_leaf_nodes                          None
max_samples                             None
min_impurity_decrease                    0.0
min_samples_leaf                           1
min_samples_split                          2
min_weight_fraction_leaf                 0.0
n_estimators                              40
n_jobs                                  None
oob_score                              False
random_state                            None
verbose                                    0
warm_start                             False

plot_confusion_matrix

[19]:
clf = LogisticRegression()
X = df.loc[df[target].isin(df[target].unique().tolist()[1:3]), feature_list]
y = df.loc[df[target].isin(df[target].unique().tolist()[1:3]), target].astype("category").cat.codes

# Manual 60/40 train/test split via random sampling
train_set_percentage = 0.6
train_X = X.sample(frac=train_set_percentage, random_state=69420)
train_y = y.loc[train_X.index]
test_X = X.drop(train_X.index, axis=0)
test_y = y.drop(train_X.index, axis=0)
[20]:
_ = clf.fit(X=train_X, y=train_y)

pred_y = clf.predict(test_X)
cf_matrix = confusion_matrix(test_y, pred_y)

fig, ax = plot_confusion_matrix(cf=cf_matrix, title="Confusion Matrix classifying flower species")
_images/ml_utils_28_0.png
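
With the 0/1 label encoding used here, scikit-learn's confusion_matrix lays the counts out as [[TN, FP], [FN, TP]], so the four cells feeding the summary metrics in the next section can be unpacked directly:

# Unpack the four cells of the binary confusion matrix
tn, fp, fn, tp = cf_matrix.ravel()
print(f"TN={tn}, FP={fp}, FN={fn}, TP={tp}")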

summary_performance_metrics_classification

[21]:
metrics_df = summary_performance_metrics_classification(model=clf, X_test=test_X, y_true=test_y)
metrics_df
[21]:
                                          scores
TN                                            17
FP                                             1
FN                                             0
TP                                            22
Accuracy                                   0.975
Balanced Accuracy                          0.972
Prevalence                                  0.55
Sensitivity                                  1.0
Specificity                                0.944
PPV                                        0.957
NPV                                          1.0
auc                                        0.995
Mean AUC (CI 5%-95%)  0.995 (95% CI 0.995-0.995)
F1                                         0.978
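
Several of these metrics follow directly from the four confusion-matrix cells, so they can be double-checked by hand:

# Recompute a few metrics from the confusion-matrix cells as a sanity check
tn, fp, fn, tp = cf_matrix.ravel()
sensitivity = tp / (tp + fn)                       # 22 / 22 = 1.0
specificity = tn / (tn + fp)                       # 17 / 18 ≈ 0.944
ppv = tp / (tp + fp)                               # 22 / 23 ≈ 0.957
f1 = 2 * ppv * sensitivity / (ppv + sensitivity)   # ≈ 0.978
print(sensitivity, specificity, ppv, f1)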