# -*- coding: utf-8 -*-
"""
Created on Thu Mar  3 17:55:28 2022

@author: TANISH
"""


""" Logistic Regression """


""" Data Importing """
import matplotlib.pyplot as plt
import pandas as pd

# Location of the raw CRM workbook. Kept in one named constant so the script
# can be pointed at another copy of the data without hunting through the code.
DATA_PATH = r"C:\Users\tanis\Desktop\CRM Dataset-Logi.xlsx"

crm = pd.read_excel(DATA_PATH)

# Keep a single row per customer ID (the first occurrence is retained).
crm = crm.drop_duplicates(subset="ID")

# Row-level sanity filter:
#   * Age within [18, 60] (both ends inclusive),
#   * gross income not below net income,
#   * net income capped at 5,500,000.
# Comparisons against NaN evaluate to False, so rows with a missing value in
# any of these three columns are removed here as well.
is_plausible = (
    crm["Age"].between(18, 60)
    & (crm["Gross_income"] >= crm["Net_income"])
    & (crm["Net_income"] <= 5500000)
)

# Build a fresh frame instead of mutating the filtered slice in place; this
# avoids SettingWithCopy warnings when columns are assigned later on.
# ID is only an identifier, so it is dropped before modelling, and every row
# that still has a missing value is discarded.
crm = crm.loc[is_plausible].drop(columns="ID").dropna(axis=0)

# Sanity check: should report 0 after dropna().
print("Remaining missing values:", crm.isna().sum().sum())


""" Possible Check for Multi-Collinearity """

import seaborn as sb

# Pairwise correlation is only defined for numeric columns. At this point
# the frame still holds the raw categorical columns (Customer_type and SEX
# are dummy-encoded further down), and pandas >= 2.0 raises on non-numeric
# data in DataFrame.corr(), so restrict the frame to numeric columns first.
numeric_corr = crm.select_dtypes(include="number").corr()

# Draw on a fresh figure so the heatmap never lands on an existing plot.
plt.figure()
sb.heatmap(numeric_corr)


""" Creating Dummy Variables 
    and converting Target variable into
    categorical type """
    

from sklearn import preprocessing

# Replace the Target labels with integer class codes.
Label_Encoder = preprocessing.LabelEncoder()
crm["Target"] = Label_Encoder.fit_transform(crm["Target"])

# One-hot encode the categorical predictors.
# dtype=int is required: from pandas 2.0 the dummy columns default to bool,
# and a frame mixing bool and numeric columns converts to an object array,
# which the statsmodels VIF / Logit steps later in this script cannot use.
# NOTE(review): every category level gets its own column, so each dummy
# group sums to 1 and is perfectly collinear with an intercept; consider
# drop_first=True if the VIF filter ends up removing all the dummies.
crm = pd.get_dummies(
    crm,
    columns=["Customer_type", "SEX"],
    dtype=int,
)

""" Problem Of Imbalance """

# Bar chart of the class frequencies. The Series must be passed as `x=`:
# the first positional parameter of countplot is `data`, not `x`, and the
# other countplot calls in this script already use the keyword form.
plt.figure()
sb.countplot(x=crm.Target)

# Share of each class as a percentage of all rows; printed so the figures
# are also visible when the file is run as a script.
print(crm.Target.value_counts() * 100 / crm.shape[0])



""" Train Test Split Using Stratified CV """

# Predictors: every column except the target.
X = crm.drop(columns="Target")
# Response: kept as a one-column DataFrame because later steps access it
# through the `.Target` attribute.
Y = crm[["Target"]]


import sklearn.model_selection as ms


# 70/30 split; stratifying on Y keeps the class proportions of the full
# data set in both partitions, and the fixed seed makes it reproducible.
X_train, X_test, Y_train, Y_test = ms.train_test_split(
    X,
    Y,
    test_size=0.30,
    random_state=11,
    stratify=Y,
)

# Class shares (in percent) of both partitions - they should be close to
# identical. Printed so they also show up when the file is run as a script.
print(Y_train.Target.value_counts() * 100 / Y_train.shape[0])
print(Y_test.Target.value_counts() * 100 / Y_test.shape[0])



""" Oversampling """

# Class balance of the training partition BEFORE oversampling.
print(Y_train.Target.value_counts() * 100 / Y_train.shape[0])

# Separate figure, so the "after" plot below does not draw over this one.
plt.figure()
sb.countplot(x=Y_train.Target)
plt.title("Training set before oversampling")

# We need to install a package named: "imblearn"

# Steps to install:
# Go to Start and search for Anaconda Prompt
# Type: conda install -c conda-forge imbalanced-learn
# Type "y" when asked the option to proceed
# and wait until the installation gets over

from imblearn.over_sampling import RandomOverSampler

# Randomly duplicate minority-class rows. With a float sampling_strategy the
# target is the minority/majority ratio after resampling (0.75 here).
# Only the training partition is resampled; the test partition is left
# untouched so the evaluation reflects the original class mix.
OvSm = RandomOverSampler(sampling_strategy=0.75,
                         random_state=11)

X_OTrain, Y_OTrain = OvSm.fit_resample(X_train, Y_train)


# Class balance of the training partition AFTER oversampling.
print(Y_OTrain.Target.value_counts() * 100 / Y_OTrain.shape[0])

plt.figure()
sb.countplot(x=Y_OTrain.Target)
plt.title("Training set after oversampling")


""" Identifying Highly Correlated Regressors """


from statsmodels.stats.outliers_influence import variance_inflation_factor as VIF
import statsmodels.tools.tools as stt

# VIFs are computed on a design matrix that includes an intercept, but only
# the real features are scored. add_constant() prepends a "const" column, so
# positional indices in the design matrix are shifted relative to
# X_OTrain.columns; each feature is therefore located by NAME. (Iterating
# over range(n_columns - 1) would score "const" and attach every VIF to the
# wrong feature, never scoring the last one.)
design = stt.add_constant(X_OTrain)

# Force a float matrix once, outside the loop: a frame that mixes bool/int
# dummy columns with numeric columns would otherwise become an object array.
design_values = design.to_numpy(dtype=float)

vif = pd.DataFrame()
vif["Feature"] = X_OTrain.columns
vif["VIF"] = [
    VIF(design_values, design.columns.get_loc(feature))
    for feature in X_OTrain.columns
]


# Keep only the regressors whose VIF is below 5.
vif = vif[vif["VIF"] < 5]


# Reduce the training matrix to the retained features. The intercept column
# is not part of X_OTrain; the test matrix is reduced with the same `vif`
# table later in the script.
X_OTrain = X_OTrain.loc[:, vif.iloc[:, 0]]



""" Fitting The Model """

from statsmodels.api import Logit

# Statsmodels fit, used for its inference table (coefficients, p-values).
# It is stored under its own name so the scikit-learn estimator assigned to
# `model` below does not overwrite it, and the summary is printed so it is
# visible when the file is run as a script.
# NOTE(review): X_OTrain holds only the VIF-selected feature columns, and
# statsmodels does not add an intercept on its own, so this model is fitted
# without a constant term - confirm that this is intended.
logit_result = Logit(endog = Y_OTrain,
                     exog = X_OTrain).fit()

print(logit_result.summary())


""" Alternative """

from sklearn.linear_model import LogisticRegression

# scikit-learn fit; this is the estimator the validation sections use via
# `model.predict` / `model.predict_proba`.
lgr = LogisticRegression(max_iter = 500)

model = lgr.fit(X_OTrain,Y_OTrain.Target)


""" Model Validation """


# Restrict the test matrix to the features that survived the VIF filter so
# its columns match the ones the model was trained on.
X_test = X_test.loc[:, vif.iloc[:,0]]
Y_Pred = model.predict(X_test)


from sklearn.metrics import confusion_matrix

# Rows are the true classes, columns the predicted classes.
cnfm = confusion_matrix(y_true = Y_test,
                 y_pred = Y_Pred)

# fmt="d" prints the cell counts as plain integers; the default annotation
# format (".2g") would render larger counts in scientific notation.
# A fresh figure keeps the heatmap from being drawn over an earlier plot.
plt.figure()
sb.heatmap(cnfm, annot = True, fmt = "d")

# Actual class counts in the test partition, for comparison with the
# row totals of the confusion matrix.
print(Y_test.value_counts())



""" Other Metric """

from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import  f1_score, accuracy_score

# Test-set scores for the fitted classifier. Each value is printed: as bare
# expressions they would be computed and silently discarded when the file is
# run as a script (the AUC further down is already printed).
print("Precision:", precision_score(Y_test, Y_Pred))
print("Recall:", recall_score(Y_test, Y_Pred))
print("F1 score:", f1_score(Y_test, Y_Pred))
print("Accuracy:", accuracy_score(Y_test, Y_Pred))



""" ROC Curve & AUC """

from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt

# Column 1 of predict_proba is used as the score for the positive class.
pred_prob = model.predict_proba(X_test)


fpr, tpr, thresh = roc_curve(Y_test, pred_prob[:,1])


# Baseline: a constant score for every observation, which yields the
# diagonal "no skill" line.
random_probs = [0] * len(Y_test)
p_fpr, p_tpr, _ = roc_curve(Y_test, random_probs)


# Fresh figure so the curves are not drawn on top of an earlier plot, with
# distinct colours and a legend so the model curve can be told apart from
# the baseline.
plt.figure()
plt.plot(fpr, tpr, linestyle = "--", color = "orange",
         label = "Logistic Regression")

plt.plot(p_fpr, p_tpr, linestyle = "--", color = "blue",
         label = "No skill")
plt.title("ROC curve")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive rate")
plt.legend(loc = "best")


from sklearn.metrics import roc_auc_score

# Area under the ROC curve for the positive-class scores.
auc_score = roc_auc_score(Y_test, pred_prob[:,1])
print(auc_score)









""" Gain Chart """

import scikitplot as skplt
# NOTE(review): scikit-plot appears to be unmaintained and its import may
# fail on recent SciPy releases - verify in the target environment.

# Per-class probabilities for the test set, recomputed here so this section
# can be run on its own.
pred_prob = model.predict_proba(X_test)

# Cumulative gains curve built from the true labels and the predicted
# class probabilities.
skplt.metrics.plot_cumulative_gain(y_true = Y_test,
                                   y_probas = pred_prob)

