Source code for jmspack.ml_utils

r"""Submodule ml_utils.py includes the following functions:

- plot_decision_boundary(): Generate a simple plot of the decision boundary of a classifier.

- plot_cv_indices(): Visualise the train and test indices generated by a cross-validation method.

- plot_learning_curve(): Plot the learning curve of an estimator as samples increase to evaluate overfitting.

- dict_of_models: A list of dictionaries of useful classification models.

- multi_roc_auc_plot(): A utility to plot the ROC curves of multiple classifiers (suggested for use in conjunction with dict_of_models).

- optimize_model(): A utility to run a grid search and Recursive Feature Elimination (RFE) on a classifier and return a model with the best parameters.

- plot_confusion_matrix(): Visualise a confusion matrix.

- summary_performance_metrics_classification(): A utility to return a selection of regularly used classification performance metrics.

- RMSE(): Root Mean Squared Error.

"""
import warnings
from typing import Union

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn.linear_model
from matplotlib.colors import ListedColormap
from matplotlib.patches import Patch
from sklearn import metrics
from sklearn.base import BaseEstimator
from sklearn.base import ClassifierMixin
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import learning_curve
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from jmspack.utils import JmsColors


def plot_decision_boundary(
    X: pd.DataFrame,
    y: pd.Series,
    clf: ClassifierMixin = sklearn.linear_model.LogisticRegression(),
    title: str = "Decision Boundary Logistic Regression",
    legend_title: str = "Legend",
    h: float = 0.05,
    figsize: tuple = (11.7, 8.27),
):
    """Generate a simple plot of the decision boundary of a classifier.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Classifier vector, where n_samples is the number of samples and
        n_features is the number of features.
    y : array-like, shape (n_samples)
        Target relative to X for classification. Datatype should be integers.
    clf : scikit-learn algorithm
        An object that has the `predict` and `predict_proba` methods.
    h : float (default: 0.05)
        Step size in the mesh.
    title : string
        Title for the plot.
    legend_title : string
        Legend title for the plot.
    figsize : tuple (default: (11.7, 8.27))
        Width and height of the figure in inches.

    Returns
    -------
    boundaries: Figure
        Properties of the figure can be changed later, e.g. use
        `boundaries.axes[0].set_ylim(0,100)` to change ylim.
    ax: Axes
        The axes associated with the boundaries Figure.

    Examples
    --------
    >>> import seaborn as sns
    >>> from sklearn.svm import SVC
    >>> data = sns.load_dataset("iris")
    >>> # the target may be a string/object series; it is converted to category codes internally
    >>> y = data["species"]
    >>> X = data[["sepal_length", "sepal_width"]]
    >>> clf = SVC(kernel="rbf", gamma=2, C=1, probability=True)
    >>> _ = plot_decision_boundary(X=X, y=y, clf=clf, title="Decision Boundary", legend_title="Species")

    """
    if X.shape[1] != 2:
        raise ValueError("X must contain exactly two features.")

    if not (
        pd.api.types.is_integer_dtype(y)
        or pd.api.types.is_object_dtype(y)
        or pd.api.types.is_categorical_dtype(y)
    ):
        raise TypeError(
            "The target variable y can only have the following dtype: [int, object, category]."
        )

    label_0 = X.columns.tolist()[0]
    label_1 = X.columns.tolist()[1]

    X = X.copy()
    y = y.copy()

    X = X.values
    y = y.astype("category").cat.codes.values

    # full_col_list = list(sns.color_palette("husl", len(np.unique(y))))
    full_col_list = list(sns.color_palette())

    if len(np.unique(y)) > len(full_col_list):
        raise ValueError(
            "More labels in the data than colors in the color list. Either reduce "
            "the number of labels or expand the color list."
        )

    sub_col_list = full_col_list[0 : len(np.unique(y))]
    cmap_bold = ListedColormap(sub_col_list)

    # Try to include a mapping in a later release (+ show categorical labels in the legend)
    _ = clf.fit(X, y)

    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max]x[y_min, y_max].
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    Z_proba = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])
    Z_max = Z_proba.max(axis=1)  # Take the class with the highest probability
    Z_max = Z_max.reshape(xx.shape)

    # Put the result into a color plot
    boundaries, ax = plt.subplots(figsize=figsize)
    _ = ax.contour(xx, yy, Z, cmap=cmap_bold)
    _ = ax.scatter(
        xx, yy, s=(Z_max**2 / h), c=Z, cmap=cmap_bold, alpha=1, edgecolors="none"
    )

    # Plot also the training points
    training = ax.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold, edgecolors="black")

    _ = plt.xlim(xx.min(), xx.max())
    _ = plt.ylim(yy.min(), yy.max())
    _ = plt.title(title)
    _ = plt.subplots_adjust(right=0.8)
    _ = plt.xlabel(label_0)
    _ = plt.ylabel(label_1)

    # Add legend colors
    leg1 = plt.legend(
        *training.legend_elements(),
        frameon=False,
        fontsize=12,
        borderaxespad=0,
        bbox_to_anchor=(1, 0.5),
        handlelength=2,
        handletextpad=1,
        title=legend_title,
    )

    # Add legend sizes
    l1 = plt.scatter([], [], c="black", s=0.4**2 / h, edgecolors="none")
    l2 = plt.scatter([], [], c="black", s=0.6**2 / h, edgecolors="none")
    l3 = plt.scatter([], [], c="black", s=0.8**2 / h, edgecolors="none")
    l4 = plt.scatter([], [], c="black", s=1**2 / h, edgecolors="none")

    labels = ["0.4", "0.6", "0.8", "1"]
    _ = plt.legend(
        [l1, l2, l3, l4],
        labels,
        frameon=False,
        fontsize=12,
        borderaxespad=0,
        bbox_to_anchor=(1, 1),
        handlelength=2,
        handletextpad=1,
        title="Probabilities",
        scatterpoints=1,
    )
    _ = plt.gca().add_artist(leg1)

    return boundaries, ax

def plot_cv_indices(cv, X, y, group, n_splits, lw=10, figsize=(6, 3)):
    """Create an example plot for indices of a cross-validation object.

    Parameters
    ----------
    cv : cross-validation generator
        A scikit-learn cross-validation object with a split method.
    X : array-like
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.
    y : array-like
        Target relative to X for classification or regression.
    group : array-like
        Group relative to X for classification or regression.
    n_splits : int
        Number of splits in the cross-validation object.
    lw : int
        Line width for the plots.
    figsize : tuple
        Width and height of the figure in inches.

    Returns
    -------
    fig: matplotlib.figure.Figure
        Properties of the figure can be changed later, e.g. use
        `fig.axes[0].set_ylim(0,100)` to change ylim.
    ax: matplotlib.axes._subplots.AxesSubplot
        The axes associated with the fig Figure.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.model_selection import GroupKFold
    >>> import matplotlib.pyplot as plt
    >>> from jmspack.ml_utils import plot_cv_indices
    >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    >>> y = np.array([1, 2, 1, 2])
    >>> groups = np.array([0, 0, 2, 2])
    >>> group_kfold = GroupKFold(n_splits=2)
    >>> _ = plot_cv_indices(cv=group_kfold, X=X, y=y, group=groups, n_splits=2, lw=10, figsize=(6, 3))
    >>> _ = plt.show()

    """
    # set plotting options
    cmap_data = plt.cm.Paired
    cmap_cv = plt.cm.coolwarm
    fig, ax = plt.subplots(figsize=figsize)

    # Generate the training/testing visualizations for each CV split
    for ii, (tr, tt) in enumerate(cv.split(X=X, y=y, groups=group)):
        # Fill in indices with the training/test groups
        indices = np.array([np.nan] * len(X))
        indices[tt] = 1
        indices[tr] = 0

        # Visualize the results
        ax.scatter(
            range(len(indices)),
            [ii + 0.5] * len(indices),
            c=indices,
            marker="_",
            lw=lw,
            cmap=cmap_cv,
            vmin=-0.2,
            vmax=1.2,
        )

    # Plot the data classes and groups at the end
    ax.scatter(
        range(len(X)), [ii + 1.5] * len(X), c=y, marker="_", lw=lw, cmap=cmap_data
    )
    ax.scatter(
        range(len(X)), [ii + 2.5] * len(X), c=group, marker="_", lw=lw, cmap=cmap_data
    )

    # Formatting
    yticklabels = list(range(n_splits)) + ["class", "group"]
    ax.set(
        yticks=np.arange(n_splits + 2) + 0.5,
        yticklabels=yticklabels,
        xlabel="Sample index",
        ylabel="CV iteration",
        ylim=[n_splits + 2.2, -0.2],
        xlim=[0, len(X)],
    )
    ax.set_title("{}".format(type(cv).__name__), fontsize=15)

    ax.legend(
        [Patch(color=cmap_cv(0.8)), Patch(color=cmap_cv(0.02))],
        ["Testing set", "Training set"],
        loc=(1.02, 0.8),
    )
    # Make the legend fit
    plt.tight_layout()
    fig.subplots_adjust(right=0.7)

    return fig, ax

def plot_learning_curve(
    X: pd.DataFrame,
    y: pd.Series,
    estimator: BaseEstimator = sklearn.linear_model.LogisticRegression(),
    title: str = "Learning Curve Logistic Regression",
    groups: Union[None, np.ndarray] = None,
    cross_color: str = JmsColors.PURPLE,
    test_color: str = JmsColors.YELLOW,
    scoring: str = "accuracy",
    ylim: Union[None, tuple] = None,
    cv: Union[None, int] = None,
    n_jobs: int = -1,
    train_sizes: np.ndarray = np.linspace(0.1, 1.0, 40),
    figsize: tuple = (10, 5),
):
    """Generate a simple plot of the test and training learning curve.

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.
    title : string
        Title for the chart.
    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.
    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.
    groups : array-like, optional
        Group labels for the samples, passed on to
        `sklearn.model_selection.learning_curve`.
    cross_color : string
        Color used for the cross-validation score curve.
    test_color : string
        Color used for the training score curve.
    scoring : string
        Signifies a scoring to evaluate the cross-validation.
    ylim : tuple, shape (ymin, ymax), optional
        Defines minimum and maximum y-values plotted.
    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:
          - None, to use scikit-learn's default cross-validation splitter,
          - integer, to specify the number of folds,
          - :term:`CV splitter`,
          - An iterable yielding (train, test) splits as arrays of indices.

        For integer/None inputs, if ``y`` is binary or multiclass,
        :class:`StratifiedKFold` is used. If the estimator is not a classifier
        or if ``y`` is neither binary nor multiclass, :class:`KFold` is used.

        Refer to the :ref:`User Guide <cross_validation>` for the various
        cross-validators that can be used here.
    n_jobs : int or None, optional (default=-1)
        Number of jobs to run in parallel.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.
    train_sizes : array-like, shape (n_ticks,), dtype float or int
        Relative or absolute numbers of training examples that will be used to
        generate the learning curve. If the dtype is float, it is regarded as a
        fraction of the maximum size of the training set (that is determined
        by the selected validation method), i.e. it has to be within (0, 1].
        Otherwise it is interpreted as absolute sizes of the training sets.
        Note that for classification the number of samples usually has to be
        big enough to contain at least one sample from each class.
        (default: np.linspace(0.1, 1.0, 40))
    figsize : tuple (default: (10, 5))
        Width and height of the figure in inches.

    Returns
    -------
    fig: matplotlib.figure.Figure
        Properties of the figure can be changed later, e.g. use
        `fig.axes[0].set_ylim(0,100)` to change ylim.
    ax: matplotlib.axes._subplots.AxesSubplot
        The axes associated with the fig Figure.
    """
    fig, ax = plt.subplots(figsize=figsize)
    _ = plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    _ = plt.xlabel("Training examples")
    _ = plt.ylabel(scoring)
    train_sizes, train_scores, test_scores = learning_curve(
        estimator,
        X,
        y,
        groups=groups,
        cv=cv,
        scoring=scoring,
        n_jobs=n_jobs,
        train_sizes=train_sizes,
        random_state=42,
    )
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    _ = plt.grid()

    _ = plt.fill_between(
        train_sizes,
        train_scores_mean - train_scores_std,
        train_scores_mean + train_scores_std,
        alpha=0.1,
        color=test_color,
    )
    _ = plt.fill_between(
        train_sizes,
        test_scores_mean - test_scores_std,
        test_scores_mean + test_scores_std,
        alpha=0.1,
        color=cross_color,
    )
    _ = plt.plot(
        train_sizes, train_scores_mean, "o-", color=test_color, label="Training score"
    )
    _ = plt.plot(
        train_sizes,
        test_scores_mean,
        "o-",
        color=cross_color,
        label="Cross-validation score",
    )

    _ = plt.legend(loc="best")
    return fig, ax
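
# plot_learning_curve has no doctest of its own, so here is a hypothetical, commented-out
# usage sketch (assuming seaborn's "iris" dataset and the default LogisticRegression
# estimator; the variable names are illustrative only):
#
#     import seaborn as sns
#     data = sns.load_dataset("iris").loc[lambda df: df["species"].isin(["setosa", "virginica"])]
#     X = data[["sepal_length", "sepal_width"]]
#     y = data["species"].replace({"virginica": 0, "setosa": 1})
#     fig, ax = plot_learning_curve(X=X, y=y, cv=5, scoring="roc_auc", title="Learning Curve")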

# a list of dictionaries of commonly used classification models
dict_of_models = [
    {
        "label": "Logistic Regression",
        "model": LogisticRegression(solver="lbfgs"),
    },
    {
        "label": "Gradient Boosting",
        "model": GradientBoostingClassifier(),
    },
    {
        "label": "K_Neighbors Classifier",
        "model": KNeighborsClassifier(3),
    },
    {
        "label": "SVM Classifier (linear)",
        "model": SVC(kernel="linear", C=0.025, probability=True),
    },
    {
        "label": "SVM Classifier (Radial Basis Function; RBF)",
        "model": SVC(kernel="rbf", gamma=2, C=1, probability=True),
    },
    {
        "label": "Gaussian Process Classifier",
        "model": GaussianProcessClassifier(1.0 * RBF(1.0)),
    },
    {
        "label": "Decision Tree (depth=5)",
        "model": DecisionTreeClassifier(max_depth=5),
    },
    {
        "label": "Random Forest Classifier(depth=5)",
        "model": RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    },
    {
        "label": "Multilayer Perceptron (MLP) Classifier",
        "model": MLPClassifier(alpha=1, max_iter=1000),
    },
    {
        "label": "AdaBoost Classifier",
        "model": AdaBoostClassifier(),
    },
    {
        "label": "Naive Bayes (Gaussian) Classifier",
        "model": GaussianNB(),
    },
    {
        "label": "Quadratic Discriminant Analysis Classifier",
        "model": QuadraticDiscriminantAnalysis(),
    },
]
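
# dict_of_models is a list of {"label": ..., "model": ...} dictionaries, so it can be looped
# over directly. A minimal, commented-out sketch (assuming X and y are a feature frame and a
# binary target as in the doctest examples) of fitting every entry and collecting test accuracies:
#
#     from sklearn.model_selection import train_test_split
#     X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
#     scores = {
#         entry["label"]: entry["model"].fit(X_train, y_train).score(X_test, y_test)
#         for entry in dict_of_models
#     }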

def multi_roc_auc_plot(
    X: pd.DataFrame,
    y: pd.Series,
    models: list = dict_of_models,
    figsize: tuple = (7, 7),
):
    """Plot the ROC curves of multiple classifiers.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Classifier vector, where n_samples is the number of samples and
        n_features is the number of features.
    y : array-like, shape (n_samples)
        Target relative to X for classification. Datatype should be integers.
    models : list
        A list of dictionaries containing the model and the label to be used
        in the plot.
    figsize : tuple (default: (7, 7))
        Width and height of the figure in inches.

    Returns
    -------
    fig: matplotlib.figure.Figure
        Properties of the figure can be changed later, e.g. use
        `fig.axes[0].set_ylim(0,100)` to change ylim.
    ax: matplotlib.axes._subplots.AxesSubplot
        The axes associated with the fig Figure.

    Examples
    --------
    >>> import seaborn as sns
    >>> from jmspack.ml_utils import multi_roc_auc_plot, dict_of_models
    >>> data = (
    ...     sns.load_dataset("iris")
    ...     .loc[lambda df: df["species"].isin(["setosa", "virginica"])]
    ...     .replace({"virginica": 0, "setosa": 1})
    ... )
    >>> y = data["species"]
    >>> X = data[["sepal_length", "sepal_width"]]
    >>> _ = multi_roc_auc_plot(X=X, y=y, models=dict_of_models, figsize=(7, 7))

    """
    # scale the data and create training and test sets of the data
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42
    )
    fig, ax = plt.subplots(figsize=figsize)
    # The for loop below iterates through the models list
    for m in models:
        model = m["model"]  # select the model
        model.fit(X_train, y_train)  # train the model
        # Compute the false positive rate and the true positive rate
        fpr, tpr, thresholds = metrics.roc_curve(
            y_test, model.predict_proba(X_test)[:, 1]
        )
        # Calculate the area under the curve to display on the plot
        auc_score = metrics.roc_auc_score(
            y_test, model.predict(X_test), average="macro"
        )
        # Now, plot the computed values
        plt.plot(fpr, tpr, label="%s ROC (area = %0.2f)" % (m["label"], auc_score))
    # Custom settings for the plot
    _ = plt.plot([0, 1], [0, 1], c="grey", ls="--")
    _ = plt.xlim([0.0, 1.0])
    _ = plt.ylim([0.0, 1.05])
    _ = plt.xlabel("1-Specificity (False Positive Rate)")
    _ = plt.ylabel("Sensitivity (True Positive Rate)")
    _ = plt.title("Receiver Operating Characteristics")
    _ = plt.legend(loc="lower right")
    # plt.show()  # Display
    return fig, ax

def optimize_model(
    X: pd.DataFrame,
    y: pd.Series,
    estimator: BaseEstimator = sklearn.ensemble.RandomForestClassifier(),
    grid_params_dict: dict = {
        "max_depth": [1, 2, 3, 4, 5, 10],
        "n_estimators": [10, 20, 30, 40, 50],
        "max_features": ["log2", "sqrt"],
        "criterion": ["gini", "entropy"],
    },
    gridsearch_kwargs: dict = {"scoring": "roc_auc", "cv": 3, "n_jobs": -2},
    rfe_kwargs: dict = {"n_features_to_select": 2, "verbose": 1},
):
    """A utility to run a grid search and Recursive Feature Elimination on a classifier and return a model with the best parameters.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Classifier vector, where n_samples is the number of samples and
        n_features is the number of features.
    y : array-like, shape (n_samples)
        Target relative to X for classification. Datatype should be integers.
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.
    grid_params_dict : dict
        The parameter grid passed to the grid search (the `param_grid` of `GridSearchCV`).
    gridsearch_kwargs : dict
        Additional keyword arguments passed to `GridSearchCV` (e.g. scoring, cv, n_jobs).
    rfe_kwargs : dict
        Keyword arguments passed to the Recursive Feature Elimination (`RFE`).

    Returns
    -------
    optimized_estimator: sklearn estimator
        The optimized estimator.
    feature_ranking: pandas Series
        The feature ranking (high = dropped early on).
    feature_selected: list
        A list of the selected features.
    feature_importance: pandas Series
        The importance per feature.
    optimal_parameters: pandas DataFrame
        A dataframe with the optimal parameters.

    Examples
    --------
    >>> import seaborn as sns
    >>> from sklearn.ensemble import RandomForestClassifier
    >>> from jmspack.ml_utils import optimize_model
    >>> data = (
    ...     sns.load_dataset("iris")
    ...     .loc[lambda df: df["species"].isin(["setosa", "virginica"])]
    ...     .replace({"virginica": 0, "setosa": 1})
    ... )
    >>> y = data["species"]
    >>> X = data[["sepal_length", "sepal_width"]]
    >>> model = RandomForestClassifier()
    >>> (
    ...     optimized_estimator,
    ...     feature_ranking,
    ...     feature_selected,
    ...     feature_importance,
    ...     optimal_parameters,
    ... ) = optimize_model(X=X, y=y, estimator=model)

    """
    # Perform a 70% training and 30% test data split
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.3,
        # stratify=y,
        random_state=42,
    )

    # Instantiate grid_dt
    grid_dt = GridSearchCV(
        estimator=estimator, param_grid=grid_params_dict, **gridsearch_kwargs
    )

    # Optimize hyperparameters
    _ = grid_dt.fit(X_train, y_train)

    # Extract the best estimator
    optimized_estimator = grid_dt.best_estimator_

    # Create the RFE with the optimized estimator
    rfe = RFE(estimator=optimized_estimator, **rfe_kwargs)

    # Fit the eliminator to the data
    _ = rfe.fit(X_train, y_train)

    # create a series with the feature ranking (high = dropped early on)
    feature_ranking = pd.DataFrame(
        data=dict(zip(X.columns, rfe.ranking_)), index=np.arange(0, len(X.columns))
    )
    feature_ranking = feature_ranking.loc[0, :].sort_values()

    # create a list with the selected features
    feature_selected = X.columns[rfe.support_].to_list()

    # create a series with the importance per feature
    feature_importance = pd.Series(
        dict(zip(X.columns, optimized_estimator.feature_importances_.round(2)))
    )

    # Calculate the test set accuracy
    # acc = metrics.accuracy_score(y_test, rfe.predict(X_test))

    print("\n- Sizes :")
    print(f"- X shape = {X.shape}")
    print(f"- y shape = {y.shape}")
    print(f"- X_train shape = {X_train.shape}")
    print(f"- X_test shape = {X_test.shape}")
    print(f"- y_train shape = {y_train.shape}")
    print(f"- y_test shape = {y_test.shape}")
    print("\n- Model info :")
    print(f"- Optimal Parameters = {optimized_estimator.get_params()}")
    print(f"- Selected feature list = {feature_selected}")
    # print("- Accuracy score on test set = {0:.1%}".format(acc))

    return (
        optimized_estimator,
        feature_ranking,
        feature_selected,
        feature_importance,
        pd.DataFrame(optimized_estimator.get_params(), index=["optimal_parameters"]),
    )

def plot_confusion_matrix(
    cf,
    group_names=None,
    categories="auto",
    count=True,
    percent=True,
    cbar=True,
    xyticks=True,
    xyplotlabels=True,
    sum_stats=True,
    figsize: tuple = (7, 5),
    cmap="Blues",
    title=None,
):
    """Make a pretty plot of an sklearn confusion matrix cf using a Seaborn heatmap visualization.

    Parameters
    ----------
    cf:
        Confusion matrix to be passed in.
    group_names:
        List of strings that represent the labels row by row to be shown in each square.
    categories:
        List of strings containing the categories to be displayed on the x,y axis. Default is 'auto'.
    count:
        If True, show the raw number in the confusion matrix. Default is True.
    percent:
        If True, show the proportions for each category. Default is True.
    cbar:
        If True, show the color bar. The cbar values are based off the values in the confusion matrix.
        Default is True.
    xyticks:
        If True, show x and y ticks. Default is True.
    xyplotlabels:
        If True, show 'True Label' and 'Predicted Label' on the figure. Default is True.
    sum_stats:
        If True, display summary statistics below the figure. Default is True.
    figsize:
        Tuple representing the figure size. Default is (7, 5).
    cmap:
        Colormap of the values displayed from matplotlib.pyplot.cm. Default is 'Blues'.
        See http://matplotlib.org/examples/color/colormaps_reference.html
    title:
        Title for the heatmap. Default is None.

    Returns
    -------
    fig: matplotlib.figure.Figure
        Properties of the figure can be changed later, e.g. use
        `fig.axes[0].set_ylim(0,100)` to change ylim.
    ax: matplotlib.axes._subplots.AxesSubplot
        The axes associated with the fig Figure.

    Examples
    --------
    >>> import seaborn as sns
    >>> from sklearn.metrics import confusion_matrix
    >>> from jmspack.ml_utils import plot_confusion_matrix
    >>> y_true = ["cat", "dog", "cat", "cat", "dog", "bird"]
    >>> y_pred = ["cat", "cat", "cat", "dog", "bird", "bird"]
    >>> cf = confusion_matrix(y_true, y_pred, labels=["cat", "dog", "bird"])
    >>> _ = plot_confusion_matrix(cf, figsize=(7, 5))

    """
    fig, ax = plt.subplots(figsize=figsize)

    # CODE TO GENERATE TEXT INSIDE EACH SQUARE
    blanks = ["" for i in range(cf.size)]

    if group_names and len(group_names) == cf.size:
        group_labels = ["{}\n".format(value) for value in group_names]
    else:
        group_labels = blanks

    if count:
        group_counts = ["{0:0.0f}\n".format(value) for value in cf.flatten()]
    else:
        group_counts = blanks

    if percent:
        group_percentages = [
            "{0:.2%}".format(value) for value in cf.flatten() / np.sum(cf)
        ]
    else:
        group_percentages = blanks

    box_labels = [
        f"{v1}{v2}{v3}".strip()
        for v1, v2, v3 in zip(group_labels, group_counts, group_percentages)
    ]
    box_labels = np.asarray(box_labels).reshape(cf.shape[0], cf.shape[1])

    # CODE TO GENERATE SUMMARY STATISTICS & TEXT FOR SUMMARY STATS
    if sum_stats:
        # Accuracy is the sum of the diagonal divided by the total observations
        accuracy = np.trace(cf) / float(np.sum(cf))

        # if it is a binary confusion matrix, show some more stats
        if len(cf) == 2:
            # Metrics for binary confusion matrices
            precision = cf[1, 1] / sum(cf[:, 1])
            recall = cf[1, 1] / sum(cf[1, :])
            f1_score = 2 * precision * recall / (precision + recall)
            stats_text = "\n\nAccuracy={:0.3f}\nPrecision={:0.3f}\nRecall={:0.3f}\nF1 Score={:0.3f}".format(
                accuracy, precision, recall, f1_score
            )
        else:
            stats_text = "\n\nAccuracy={:0.3f}".format(accuracy)
    else:
        stats_text = ""

    if not xyticks:
        # Do not show categories if xyticks is False
        categories = False

    # MAKE THE HEATMAP VISUALIZATION
    _ = sns.heatmap(
        cf,
        annot=box_labels,
        fmt="",
        cmap=cmap,
        cbar=cbar,
        xticklabels=categories,
        yticklabels=categories,
    )

    if xyplotlabels:
        _ = plt.ylabel("True label")
        _ = plt.xlabel("Predicted label" + stats_text)
    else:
        _ = plt.xlabel(stats_text)

    if title:
        _ = plt.title(title)

    return fig, ax

def _bootstrap_auc(
    model, X_test, y_true, use_probabilities, bootstraps, fold_size, random_state
):
    """Internal function to bootstrap the AUC.

    Originates from the AI in healthcare specialization of coursera.
    https://www.coursera.org/specializations/ai-healthcare

    Parameters
    ----------
    model:
        The fitted sklearn model.
    X_test: pd.DataFrame
        The predictors used to match to y_true.
    y_true: pd.Series
        The actual binary targets.
    use_probabilities: bool
        If True, bootstrap the AUC based on predicted probabilities;
        otherwise use the binary predictions.
    bootstraps: int
        The number of bootstraps.
    fold_size: int
        The number of samples drawn in each bootstrap.
    random_state: int
        Seed used for the bootstrap sampling.

    Returns
    -------
    list
    """
    if use_probabilities:
        y_pred_proba = model.predict_proba(X_test)[:, 1]
        df = pd.DataFrame({"y": y_true, "pred": y_pred_proba})
    else:
        y_pred = model.predict(X_test)
        df = pd.DataFrame({"y": y_true, "pred": y_pred})

    statistics = np.zeros(bootstraps)

    df_pos = df[df.y == 1]
    df_neg = df[df.y == 0]
    prevalence = len(df_pos) / len(df)
    # get positive examples for stratified sampling
    for i in range(bootstraps):
        # stratified sampling of positive and negative examples
        pos_sample = df_pos.sample(
            n=int(fold_size * prevalence), replace=True, random_state=random_state
        )
        neg_sample = df_neg.sample(
            n=int(fold_size * (1 - prevalence)),
            replace=True,
            random_state=random_state + 1,
        )

        y_sample = np.concatenate([pos_sample.y.values, neg_sample.y.values])
        pred_sample = np.concatenate([pos_sample.pred.values, neg_sample.pred.values])

        if use_probabilities:
            fpr, tpr, thresholds = metrics.roc_curve(y_sample, pred_sample, pos_label=1)
            score = metrics.auc(fpr, tpr)
        else:
            score = metrics.roc_auc_score(y_sample, pred_sample)

        statistics[i] = score

    mean = statistics.mean()
    max_ = np.quantile(statistics, 0.95)
    min_ = np.quantile(statistics, 0.05)

    return [f"{mean:.3f} (95% CI {min_:.3f}-{max_:.3f})"]

def summary_performance_metrics_classification(
    model, X_test, y_true, bootstraps=100, fold_size=1000, random_state=69420
):
    """Summary of different evaluation metrics specific to a single class classification learning problem.

    Parameters
    ----------
    model: sklearn.model
        A fitted sklearn model with predict() and predict_proba() methods.
    X_test: pd.DataFrame
        A data frame used to predict the target values (y_pred).
    y_true: pd.Series or np.array
        Binary true values.
    bootstraps: int
        The number of bootstraps used for the AUC confidence interval.
    fold_size: int
        The number of samples drawn in each bootstrap.
    random_state: int
        Seed used for the bootstrap sampling.

    Returns
    -------
    summary_df: pd.DataFrame
        A dataframe with the summary of the metrics.

    Notes
    -----
    The function returns the following metrics:

    - true positive (TP): The model classifies the example as positive, and the actual label is also positive.
    - false positive (FP): The model classifies the example as positive, but the actual label is negative.
    - true negative (TN): The model classifies the example as negative, and the actual label is also negative.
    - false negative (FN): The model classifies the example as negative, but the label is actually positive.
    - accuracy: The fraction of predictions the model got right.
    - balanced accuracy: The average of sensitivity and specificity.
    - prevalence: The proportion of positive examples (where y=1).
    - sensitivity: The probability that our test outputs positive given that the case is actually positive.
    - specificity: The probability that the test outputs negative given that the case is actually negative.
    - positive predictive value: The proportion of positive predictions that are true positives.
    - negative predictive value: The proportion of negative predictions that are true negatives.
    - auc: A measure of goodness of fit.
    - bootstrapped auc: The bootstrap estimates the uncertainty by resampling the dataset with replacement.
    - F1: The harmonic mean of the precision and recall, where an F1 score reaches its best value at 1
      (perfect precision and recall) and worst at 0.

    Examples
    --------
    >>> import seaborn as sns
    >>> from sklearn.ensemble import RandomForestClassifier
    >>> from jmspack.ml_utils import summary_performance_metrics_classification
    >>> data = (
    ...     sns.load_dataset("iris")
    ...     .loc[lambda df: df["species"].isin(["setosa", "virginica"])]
    ...     .replace({"virginica": 0, "setosa": 1})
    ... )
    >>> y = data["species"]
    >>> X = data[["sepal_length", "sepal_width"]]
    >>> model = RandomForestClassifier()
    >>> model.fit(X=X, y=y)
    >>> summary_df = summary_performance_metrics_classification(model=model, X_test=X, y_true=y)

    """
    y_pred = model.predict(X_test)

    # check if the fitted model has the "predict_proba" attribute
    if "predict_proba" in dir(model):
        # check whether the fitted model has the "probability" attribute
        if "probability" in dir(model):
            # and that it is set to True (this can be the case for SVC)
            if model.probability:
                predict_proba_bool = True
                y_pred_proba = model.predict_proba(X_test)[:, 1]
                # auc
                fpr, tpr, thresholds = metrics.roc_curve(
                    y_true, y_pred_proba, pos_label=1
                )
                auc_score = metrics.auc(fpr, tpr)
            else:
                predict_proba_bool = False
                warnings.warn(
                    f"The classifier {model.__class__} does have the 'predict_proba' method, however it does not have"
                    f" the 'probability' parameter set to True, hence model evaluation metrics will be based on "
                    f"binary predictions"
                )
                auc_score = metrics.roc_auc_score(y_true, y_pred)
        else:
            # the model has "predict_proba" and no "probability" flag, so probabilities can be used directly
            predict_proba_bool = True
            y_pred_proba = model.predict_proba(X_test)[:, 1]
            # auc
            fpr, tpr, thresholds = metrics.roc_curve(y_true, y_pred_proba, pos_label=1)
            auc_score = metrics.auc(fpr, tpr)
    else:
        # the model has no "predict_proba" attribute, so probabilities are not used
        predict_proba_bool = False
        warnings.warn(
            f"The classifier {model.__class__} does not have the 'predict_proba' method, hence "
            f"model evaluation metrics will be based on binary predictions"
        )
        auc_score = metrics.roc_auc_score(y_true, y_pred)

    # bootstrapped auc
    bootstrap_auc_metric = _bootstrap_auc(
        model,
        X_test,
        y_true,
        use_probabilities=predict_proba_bool,
        bootstraps=bootstraps,
        fold_size=fold_size,
        random_state=random_state,
    )

    # TP, TN, FP, FN
    confusion_matrix_metric = metrics.confusion_matrix(y_true, y_pred)
    TN = confusion_matrix_metric[0][0]
    FP = confusion_matrix_metric[0][1]
    FN = confusion_matrix_metric[1][0]
    TP = confusion_matrix_metric[1][1]

    # accuracy
    accuracy_score_metric = metrics.accuracy_score(y_true, y_pred)

    # balanced accuracy
    balanced_accuracy_score_metric = metrics.balanced_accuracy_score(y_true, y_pred)

    # prevalence
    prevalence = np.mean(y_true == 1)

    # sensitivity
    sensitivity = TP / (TP + FN)

    # specificity
    specificity = TN / (TN + FP)

    # positive predictive value
    PPV = TP / (TP + FP)

    # negative predictive value
    NPV = TN / (TN + FN)

    # F1
    f1 = metrics.f1_score(y_true, y_pred)

    df_metrics = pd.DataFrame(
        {
            "TN": TN,
            "FP": FP,
            "FN": FN,
            "TP": TP,
            "Accuracy": accuracy_score_metric,
            "Balanced Accuracy": balanced_accuracy_score_metric,
            "Prevalence": prevalence,
            "Sensitivity": sensitivity,
            "Specificity": specificity,
            "PPV": PPV,
            "NPV": NPV,
            "auc": auc_score,
            "Mean AUC (CI 5%-95%)": bootstrap_auc_metric,
            "F1": f1,
        },
        index=["scores"],
    )

    return df_metrics.round(3)
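
# Worked example of the binary metrics defined in the Notes above (illustrative numbers only):
# for a confusion matrix with TN=50, FP=10, FN=5 and TP=35,
#     sensitivity = TP / (TP + FN) = 35 / 40  = 0.875
#     specificity = TN / (TN + FP) = 50 / 60  ≈ 0.833
#     PPV         = TP / (TP + FP) = 35 / 45  ≈ 0.778
#     NPV         = TN / (TN + FN) = 50 / 55  ≈ 0.909
#     accuracy    = (TP + TN) / (TP + TN + FP + FN) = 85 / 100 = 0.85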

def RMSE(true, pred):
    """Root Mean Squared Error.

    Parameters
    ----------
    true: pd.Series
        The actual values.
    pred: pd.Series
        The predicted values.

    Returns
    -------
    float

    Examples
    --------
    >>> import pandas as pd
    >>> from jmspack.ml_utils import RMSE
    >>> true = pd.Series([1, 2, 5, 4, 5])
    >>> pred = pd.Series([1, 2, 3, 4, 5])
    >>> RMSE(true, pred)
    """
    return np.sqrt(mean_squared_error(y_true=true, y_pred=pred))