Source code for sktidy.sktidy

import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.cluster import KMeans

import statsmodels.api as sm
from sklearn.utils.validation import check_is_fitted


def tidy_lr(model, X, y):
    """
    Return a tidy dataframe for a fitted sklearn LinearRegression model.

    The output lists feature names, coefficients/intercept and p-values.

    Parameters
    ----------
    model : sklearn.linear_model.LinearRegression
        The fitted sklearn LinearRegression model
    X : pandas.core.frame.DataFrame
        The feature pandas dataframe to which the LinearRegression object
        was fitted with m rows and n columns
    y : pandas.core.series.Series
        The target pandas Series to which the LinearRegression object was
        fitted with m rows

    Returns
    -------
    df : pandas.core.frame.DataFrame
        A pandas dataframe with n+1 rows, where n is the number of
        columns (features) in the input dataframe `X` that was fitted to
        the model, and 3 columns named "feature", "coefficient" and
        "p-value". The first row describes the intercept.

    Raises
    ------
    TypeError
        If `model` is not a LinearRegression, `X` is not a pandas
        DataFrame or `y` is not a pandas Series.
    ValueError
        If the number of columns in `X` differs from the number of
        coefficients stored in `model`.
    sklearn.exceptions.NotFittedError
        If `model` has not been fitted yet.

    Notes
    -----
    sklearn does not expose p-values, so they are obtained by fitting a
    statsmodels OLS on `X` (with an added constant column) and `y`, and
    are rounded to 4 decimal places.

    Examples
    --------
    >>> from sklearn.linear_model import LinearRegression
    >>> from sklearn import datasets
    >>> import pandas as pd
    >>> import sktidy
    >>> # Load data and train the linear regression model
    >>> X = datasets.load_iris(return_X_y = True, as_frame = True)[0]
    >>> y = datasets.load_iris(return_X_y = True, as_frame = True)[1]
    >>> my_lr = LinearRegression()
    >>> my_lr.fit(X,y)
    >>> # Get tidy output for the trained sklearn LinearRegression model
    >>> tidy_lr(model = my_lr, X = X, y = y)
    """
    # raise error when model is not a sklearn LinearRegression object
    if not isinstance(model, LinearRegression):
        raise TypeError(
            "Input model should be of type "
            "'sklearn.linear_model.LinearRegression'."
        )

    # raise error when X is not a pandas dataframe object
    if not isinstance(X, pd.core.frame.DataFrame):
        raise TypeError(
            "Input X should be of type 'pandas.core.frame.DataFrame'."
        )

    # raise error when y is not a pandas Series object
    if not isinstance(y, pd.core.series.Series):
        raise TypeError(
            "Input y should be of type 'pandas.core.series.Series'."
        )

    # raise error when model is not fitted yet
    check_is_fitted(model)

    # The rows are assembled positionally below, so a model fitted on a
    # different number of features would yield misaligned output.
    if np.size(model.coef_) != X.shape[1]:
        raise ValueError(
            "Input X should have the same number of columns as the number "
            "of coefficients in the fitted model."
        )

    # obtain coefficients and intercept
    estimates = np.append(model.intercept_, model.coef_)

    # obtain feature names
    features = np.append(np.array(["intercept"]), X.columns.values)

    # obtain p-values; has_constant="add" forces the constant column to be
    # prepended even when X already holds a constant column, so there is
    # always exactly one p-value per row of the output
    exog = sm.add_constant(X, has_constant="add")
    results = sm.OLS(y, exog).fit()
    p_values = np.round(results.pvalues.reset_index(drop=True), 4)

    # assemble output dataframe
    df = pd.DataFrame(zip(features, estimates, p_values))
    df.columns = ["feature", "coefficient", "p-value"]
    return df


def tidy_kmeans(model, X):
    """
    Return a tidy df of cluster information for a kmeans clustering algorithm

    This function delivers diagnostic information about each cluster
    defined by an instance of scikit learn's implementation of kmeans
    clustering: the cluster center and the total number of points
    associated with each cluster.

    Parameters
    ----------
    model : sklearn.cluster.KMeans
        The model to extract the cluster specific information from.
    X : pandas dataframe
        The data to which the Kmeans object has been fitted

    Returns
    -------
    df : pandas dataframe
        A dataframe with one row per cluster label found in
        ``model.labels_`` and 3 columns: "cluster_number" (the cluster
        label), "center_values" (a one-row dataframe holding the cluster
        center, with the column labels of `X`) and "n_points" (the number
        of data points assigned to the cluster).

    Raises
    ------
    TypeError
        If `model` is not a KMeans object or `X` is not a pandas DataFrame.
    sklearn.exceptions.NotFittedError
        If `model` has not been fitted yet.

    Examples
    --------
    >>> # Importing packages
    >>> from sklearn.cluster import DBSCAN, KMeans
    >>> from sklearn import datasets
    >>> import pandas as pd
    >>> import sktidy
    >>> # Extracting data and training the clustering algorithm
    >>> df = datasets.load_iris(return_X_y = True, as_frame = True)[0]
    >>> kmeans_clusterer = KMeans()
    >>> kmeans_clusterer.fit(df)
    >>> # Getting the tidy df of cluster information
    >>> tidy_kmeans(model = kmeans_clusterer, X = df)
    """
    # raise error when model is not a sklearn KMeans object
    if not isinstance(model, KMeans):
        raise TypeError(
            "Input model should be of type 'sklearn.cluster.KMeans'"
        )

    # raise error when X is not a pandas dataframe object
    if not isinstance(X, pd.core.frame.DataFrame):
        raise TypeError(
            "Input DataFrame should be of type 'pandas.core.frame.DataFrame'."
        )

    # raise error when model is not fitted yet
    check_is_fitted(model)

    cluster_labels, cluster_counts = np.unique(
        model.labels_, return_counts=True
    )

    # One single-row dataframe per cluster centroid, labelled with the
    # columns of X. Each center has one value per feature, so the row
    # length is inferred with -1 rather than tied to the number of
    # clusters (which only coincides with it by accident).
    centers_list = [
        pd.DataFrame(
            model.cluster_centers_[cluster].reshape(1, -1),
            columns=X.columns,
        )
        for cluster in cluster_labels
    ]

    df = pd.DataFrame(
        {
            "cluster_number": cluster_labels,
            "center_values": centers_list,
            "n_points": cluster_counts,
        }
    )
    return df


def augment_lr(model, X, y):
    """
    Add predictions and residuals to the original data of a scikit learn
    linear regression model.

    Parameters
    ----------
    model : sklearn.linear_model.LinearRegression object
        The fitted sklearn LinearRegression model
    X : pandas.core.frame.DataFrame
        A dataframe of explanatory variables to predict on. Shaped n
        observations by m features.
    y : pandas.core.series.Series
        A pandas series of response variables to predict on. Shaped n
        observations by 1. It is joined to `X` on the index; when the
        series has no name, the column is called "y".

    Returns
    -------
    df : pandas.core.frame.DataFrame
        A dataframe with the original data (`X` joined with `y`) plus two
        additional columns, "predictions" and "residuals".

    Raises
    ------
    TypeError
        If `model` is not a LinearRegression, `X` is not a pandas
        DataFrame or `y` is not a pandas Series.
    ValueError
        If `X` or `y` is empty.
    sklearn.exceptions.NotFittedError
        If `model` has not been fitted yet.

    Examples
    --------
    >>> # Importing packages
    >>> from sklearn.linear_model import LinearRegression
    >>> from sklearn import datasets
    >>> import pandas as pd
    >>> import sktidy
    >>> # Extracting data and training the linear regression model
    >>> X = datasets.load_iris(return_X_y = True, as_frame = True)[0]
    >>> y = datasets.load_iris(return_X_y = True, as_frame = True)[1]
    >>> lr_model = LinearRegression()
    >>> lr_model.fit(X,y)
    >>> # Getting the tidy df of linear regression model output
    >>> augment_lr(model = lr_model,X = X,y = y)
    """
    # raise error when model is not a sklearn LinearRegression object
    if not isinstance(model, LinearRegression):
        raise TypeError(
            "Input model should be of type "
            "'sklearn.linear_model.LinearRegression'."
        )

    # raise error when X is not a pandas dataframe object
    if not isinstance(X, pd.core.frame.DataFrame):
        raise TypeError(
            "Input X should be of type 'pandas.core.frame.DataFrame'."
        )

    # raise error when y is not a pandas Series object
    if not isinstance(y, pd.core.series.Series):
        raise TypeError(
            "Input y should be of type 'pandas.core.series.Series'."
        )

    # raise error when X is empty
    if len(X) == 0:
        raise ValueError("Input X should not be empty")

    # raise error when Y is empty
    if len(y) == 0:
        raise ValueError("Input Y should not be empty")

    # raise error when model is not fitted yet
    check_is_fitted(model)

    # calculate predictions and residuals
    predictions = model.predict(X)
    residuals = y - predictions

    # DataFrame.join refuses a Series without a name, so fall back to a
    # default column name instead of failing on an unnamed target.
    target = y if y.name is not None else y.rename("y")

    # create dataframe to return
    df = X.join(target)
    df["predictions"] = predictions
    df["residuals"] = residuals
    return df


def augment_kmeans(model, X):
    """
    Return the original samples together with their assigned clusters.

    The cluster assignments are the predictions made by an instance of
    scikit learn's implementation of KMeans clustering.

    Parameters
    ----------
    model : sklearn.cluster.KMeans
        The model to extract the cluster specific information from
    X : pandas dataframe
        The data to which the Kmeans object has been fitted

    Returns
    -------
    df : pandas dataframe
        A copy of `X` (one row per example) with one additional column,
        "cluster", holding the label predicted by `model` for each row.
        The input dataframe itself is not modified.

    Raises
    ------
    TypeError
        If `model` is not a KMeans object or `X` is not a pandas DataFrame.
    ValueError
        If `X` is empty.
    sklearn.exceptions.NotFittedError
        If `model` has not been fitted yet.

    Examples
    --------
    >>> # Importing packages
    >>> from sklearn.cluster import KMeans
    >>> from sklearn import datasets
    >>> import pandas as pd
    >>> import sktidy
    >>> # Extracting data and training the clustering algorithm
    >>> df = datasets.load_iris(return_X_y = True, as_frame = True)[0]
    >>> kmeans_clusterer = KMeans()
    >>> kmeans_clusterer.fit(df)
    >>> # Getting cluster assignment for each data point
    >>> augment_kmeans(model = kmeans_clusterer, X = df)
    """
    # raise error when model is not a sklearn KMeans object
    if not isinstance(model, KMeans):
        raise TypeError(
            "Input model should be of type 'sklearn.cluster._kmeans.KMeans'."
        )

    # raise error when X is not a pandas dataframe object
    if not isinstance(X, pd.core.frame.DataFrame):
        raise TypeError(
            "Input X should be of type 'pandas.core.frame.DataFrame'."
        )

    # raise error when X is empty
    if len(X) == 0:
        raise ValueError("Input X should not be empty")

    # raise error when model is not fitted yet
    check_is_fitted(model)

    # create dataframe to return; work on a copy so the caller's X is
    # left without the extra column
    df = X.copy()
    df["cluster"] = model.predict(X)
    return df