Using frequentist_statistics functions from the jmspack package

Showing the usage of the following frequentist_statistics functions

  • normal_check()

  • correlation_analysis()

  • correlations_as_sample_increases()

  • multiple_univariate_OLSs()

  • potential_for_change_index()

[1]:
import os
tmp = os.getcwd()
os.chdir(tmp.split("jmspack")[0] + "jmspack")
[2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from jmspack.frequentist_statistics import (normal_check,
#                                             permute_test,
                                            correlation_analysis,
                                            correlations_as_sample_increases,
                                            multiple_univariate_OLSs,
                                            potential_for_change_index
                                           )
from jmspack.utils import apply_scaling
import pingouin as pg
import statsmodels.api as sm
[3]:
os.chdir(tmp)
[4]:
if "jms_style_sheet" in plt.style.available:
    _ = plt.style.use("jms_style_sheet")
[5]:
df = sns.load_dataset("iris")#.head(10)

normal_check()

compare the distribution of numeric variables to a normal distribution using the Kolmogrov-Smirnov test

[6]:
normal_check(df)
[6]:
feature p-value normality
0 sepal_length 0.170584 True
1 sepal_width 0.064491 True
2 petal_length 0.000011 False
3 petal_width 0.000200 False

correlation_analysis()

Run correlations for numerical features and return output in different formats

[7]:
feature1 = "sepal_length"
feature2 = "petal_length"
[8]:
X = df[feature1]
y = df[feature2]
[9]:
output = correlation_analysis(data=df,
                             check_norm=True,
                              dropna = 'pairwise',
                                permutation_test = False,
                                n_permutations = 10,
                                random_state=69420)
[10]:
output["summary"].round(4)
[10]:
analysis feature1 feature2 r-value p-value stat-sign N
0 Pearson sepal_length sepal_width -0.1176 0.1519 False 150
1 Spearman Rank sepal_length petal_length 0.8819 0.0000 True 150
2 Spearman Rank sepal_length petal_width 0.8343 0.0000 True 150
3 Spearman Rank sepal_width petal_length -0.3096 0.0001 True 150
4 Spearman Rank sepal_width petal_width -0.2890 0.0003 True 150
5 Spearman Rank petal_length petal_width 0.9377 0.0000 True 150

correlations_as_sample_increases()

Run correlations for subparts of the data to check robustness

[11]:
summary, fig = correlations_as_sample_increases(data=df.select_dtypes("number"),
                                 feature1=feature1,
                                feature2=feature2,
                                starting_N = 10,
                                step = 10,
                                method='pearson',
                                random_state=42,
                                bootstrap = True,
                                bootstrap_per_N = 20,
                                plot = True,
                                addition_to_title = '',
                                alpha = 0.05)
findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.
findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.
_images/frequentist_statistics_15_1.png

multiple_univariate_OLSs()

Tmp

[12]:
features_list = df.select_dtypes(float).columns.tolist()
[13]:
target = "species_cat_codes"
[14]:
df=df.assign(**{target: lambda x: x["species"].astype("category").cat.codes})
[15]:
X = df[features_list]
y = df[target]
[16]:
OLS_df = multiple_univariate_OLSs(X=X, y=y, features_list=features_list)
OLS_df
[16]:
coef std err t P>|t| [0.025 0.975] rsquared rsquared_adj
sepal_length 0.7742 0.051 15.292 0.0 0.674 0.874 0.612402 0.609783
sepal_width -0.8019 0.140 -5.739 0.0 -1.078 -0.526 0.182037 0.176510
petal_length 0.4404 0.012 36.632 0.0 0.417 0.464 0.900667 0.899996
petal_width 1.0281 0.026 39.910 0.0 0.977 1.079 0.914983 0.914408
[17]:
_ = sns.lmplot(data=df,
               x=features_list[3],
              y=target)
_images/frequentist_statistics_22_0.png

potential_for_change_index()

Calculate the potential for change index based on either variants of the r-squared (from linear regression) or the r-value (pearson correlation)

[18]:
df.head()
[18]:
sepal_length sepal_width petal_length petal_width species species_cat_codes
0 5.1 3.5 1.4 0.2 setosa 0
1 4.9 3.0 1.4 0.2 setosa 0
2 4.7 3.2 1.3 0.2 setosa 0
3 4.6 3.1 1.5 0.2 setosa 0
4 5.0 3.6 1.4 0.2 setosa 0
[19]:
pot_df = potential_for_change_index(data=df.select_dtypes("number"),
                                    features_list=features_list,
                                    target=target,
                                    minimum_measure = "min",
                                    centrality_measure = "median",
                                    maximum_measure = "max",
                                    weight_measure = "rsquared_adj",
                                    scale_data = True,
                                    pci_heatmap = True,
                                    pci_heatmap_figsize = (1, 3)
                                   )
_images/frequentist_statistics_25_0.png
[20]:
pot_df.round(3)
[20]:
PCI min median max rsquared_adj P>|t|
sepal_length 0.356 0.0 0.417 1.0 0.610 0.0
petal_length 0.389 0.0 0.568 1.0 0.900 0.0
petal_width 0.457 0.0 0.500 1.0 0.914 0.0
sepal_width 0.074 0.0 0.417 1.0 0.177 0.0
[21]:
pot_df = potential_for_change_index(data=df.select_dtypes("number"),
                                    features_list=features_list,
                                    target=target,
                                    minimum_measure = "min",
                                    centrality_measure = "median",
                                    maximum_measure = "max",
                                    weight_measure = "rsquared",
                                    scale_data = False,
                                    pci_heatmap = True,
                                    pci_heatmap_figsize = (1, 3)
                                   )
_images/frequentist_statistics_27_0.png
[22]:
pot_df.round(3)
[22]:
PCI min median max rsquared P>|t|
sepal_length 1.286 4.3 5.80 7.9 0.612 0.0
petal_length 2.297 1.0 4.35 6.9 0.901 0.0
petal_width 1.098 0.1 1.30 2.5 0.915 0.0
sepal_width 0.182 2.0 3.00 4.4 0.182 0.0
[23]:
pot_df = potential_for_change_index(data=df.select_dtypes("number"),
                                    features_list=features_list,
                                    target=target,
                                    minimum_measure = "min",
                                    centrality_measure = "mean",
                                    maximum_measure = "max",
                                    weight_measure = "r-value",
                                    scale_data = False,
                                    pci_heatmap = True,
                                    pci_heatmap_figsize = (1, 3)
                                   )
_images/frequentist_statistics_29_0.png
[24]:
pot_df.sort_values(by="PCI", ascending=False).round(3)
[24]:
PCI min mean max r-value p-value
petal_length 2.982 1.0 3.758 6.9 0.949 0.0
sepal_length 1.609 4.3 5.843 7.9 0.783 0.0
petal_width 1.244 0.1 1.199 2.5 0.957 0.0
sepal_width -0.451 2.0 3.057 4.4 -0.427 0.0