Search This Blog

7 Exploratory Data Analysis (EDA)

Exploratory Data Analysis


# importing the necessary libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the Social Network Ads dataset (expects the CSV in the working directory).
data_socialnetwork_ads=pd.read_csv('Social_Network_Ads.csv')
# Preview the first five rows to sanity-check the load.
data_socialnetwork_ads.head()
Out:
AgeEstimatedSalaryPurchased
019190000
135200000
226430000
327570000
419760000

data_socialnetwork_ads.info(verbose =True)
O/p:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 414 entries, 0 to 413
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   Age              414 non-null    int64
 1   EstimatedSalary  414 non-null    int64
 2   Purchased        414 non-null    int64
dtypes: int64(3)
memory usage: 9.8 KB


:
data_socialnetwork_ads.describe( include="all")  # include='all' , includes categorical values also
O/p:

AgeEstimatedSalaryPurchased
count414.00000414.000000414.000000
mean37.1062868760.8695650.364734
std11.4970634876.2365970.481938
min0.000000.0000000.000000
25%29.0000043000.0000000.000000
50%37.0000069500.0000000.000000
75%46.0000087000.0000001.000000
max60.00000150000.0000001.000000


data_socialnetwork_ads.describe( include="all").T  # .T   --to transpose dataset
:
countmeanstdmin25%50%75%max
Age414.037.10628011.4970600.029.037.046.060.0
EstimatedSalary414.068760.86956534876.2365970.043000.069500.087000.0150000.0
Purchased414.00.3647340.4819380.00.00.01.01.0

Age and EstimatedSalary can't realistically be zero, so let's check whether any such values exist.

:
data_socialnetwork_ads.isnull().sum()
:
Age                0
EstimatedSalary    0
Purchased          0
dtype: int64

:
# Replace 0 with NaN so the missing values are easy to work with.

# Work on a deep copy so the raw load stays untouched.
data_socialnetwork_ads_copy = data_socialnetwork_ads.copy(deep=True)

# Age/EstimatedSalary of 0 are physically impossible, so treat 0 as missing.
# Use np.nan — the np.NaN alias was removed in NumPy 2.0.
data_socialnetwork_ads_copy[['Age', 'EstimatedSalary']] = (
    data_socialnetwork_ads_copy[['Age', 'EstimatedSalary']].replace(0, np.nan)
)

# Count the NaNs we just introduced.
data_socialnetwork_ads_copy.isnull().sum()
:
Age                7
EstimatedSalary    7
Purchased          0
dtype: int64

:
## null count analysis
# missingno is a third-party missing-data visualisation library.
import missingno as msno
# Bar chart of non-null counts per column, to see the gaps at a glance.
p=msno.bar(data_socialnetwork_ads_copy,figsize=(3,3))


:
p=data_socialnetwork_ads_copy.hist(figsize = (10,10))


:
# Impute the missing values with each column's mean.
# Assign back instead of chained `df[col].fillna(..., inplace=True)`, which is
# deprecated and fails under pandas copy-on-write (pandas 3.0 behaviour).
data_socialnetwork_ads_copy['Age'] = data_socialnetwork_ads_copy['Age'].fillna(
    data_socialnetwork_ads_copy['Age'].mean())
data_socialnetwork_ads_copy['EstimatedSalary'] = data_socialnetwork_ads_copy['EstimatedSalary'].fillna(
    data_socialnetwork_ads_copy['EstimatedSalary'].mean())

:
data_socialnetwork_ads_copy.isnull().sum()
O/p:
:
Age                0
EstimatedSalary    0
Purchased          0
dtype: int64

Looks fine now — no missing values remain.

:
# Check the Pearson correlation between columns.
# The coefficient lies in [-1, +1]: values near +/-1 mean a strong linear
# relationship, 0 means no linear correlation.

plt.figure(figsize=(6, 5))
correlations = data_socialnetwork_ads_copy.corr()
p = sns.heatmap(correlations, annot=True, cmap='YlGnBu')


Scaling the data

:
# Separate predictors from the target: everything except 'Purchased' is a feature.
X = data_socialnetwork_ads_copy.drop(columns="Purchased")
y = data_socialnetwork_ads_copy["Purchased"]
X.head()
:
AgeEstimatedSalary
019.019000.0
135.020000.0
226.043000.0
327.057000.0
419.076000.0
The two features are on very different scales, so let's standardise them.

# Standardise the features: Age and EstimatedSalary sit on very different
# scales, which would otherwise let salary dominate KNN's Euclidean distances.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# Wrap the result back into a DataFrame, keeping the original column names and
# index — a bare pd.DataFrame(...) would relabel the columns as 0 and 1.
X = pd.DataFrame(sc.fit_transform(X), columns=X.columns, index=X.index)

X.head()  # inspect the scaled data
01
0-1.802168-1.514074
1-0.263864-1.484353
2-1.129160-0.800778
3-1.033016-0.384689
4-1.8021680.180003
Scaled data looks fine

Let's split the data into train and test sets
#importing train_test_split
from sklearn.model_selection import train_test_split
# Hold out one third of the rows for testing; stratify=y keeps the class
# balance in both splits and random_state makes the split reproducible.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=1/3,random_state=42, stratify=y)


:
from sklearn.neighbors import KNeighborsClassifier

# Sweep k = 1..14: fit a KNN classifier for each k and record the accuracy on
# both the training split and the held-out test split.
test_scores = []
train_scores = []

for k in range(1, 15):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    train_scores.append(knn.score(X_train, y_train))
    test_scores.append(knn.score(X_test, y_test))

Result Visualisation

:
plt.figure(figsize=(12,5))
# seaborn >= 0.12 removed positional data arguments to lineplot:
# pass x= and y= explicitly or this raises a TypeError.
p = sns.lineplot(x=range(1, 15), y=train_scores, marker='*', label='Train Score')
p = sns.lineplot(x=range(1, 15), y=test_scores, marker='o', label='Test Score')

:
## Best accuracy when scoring on the same points the model was fitted on.
max_train_score = max(train_scores)
# Collect every k position that ties for the maximum (positions are 0-based,
# k values are 1-based, hence the +1 in the print).
train_scores_ind = [idx for idx, score in enumerate(train_scores) if score == max_train_score]
print(f'Max train score {max_train_score * 100} % and k = {[idx + 1 for idx in train_scores_ind]}')
Max train score 99.27536231884058 % and k = [1]
:
## Best accuracy on the held-out test split — the score that actually matters.
max_test_score = max(test_scores)
# Every k position tying for the maximum; +1 converts 0-based index to k.
test_scores_ind = [idx for idx, score in enumerate(test_scores) if score == max_test_score]
print(f'Max test score {max_test_score * 100} % and k = {[idx + 1 for idx in test_scores_ind]}')
Max test score 89.13043478260869 % and k = [11]

# k=11 gave the best test score above, so refit the final model with it.
knn = KNeighborsClassifier(n_neighbors=11)
knn.fit(X_train, y_train)
# Mean accuracy on the held-out test split.
knn.score(X_test, y_test)
0.8913043478260869

confusion matrix:

# Predicted class labels (0 = not purchased, 1 = purchased) for the test rows.
y_pred = knn.predict(X_test)
y_pred

:
array([0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 1, 0, 0, 0, 0], dtype=int64)

:
from sklearn.metrics import confusion_matrix
# NOTE(review): this confusion_matrix result is discarded — only the crosstab
# below is displayed (margins=True adds the row/column totals).
confusion_matrix(y_test,y_pred)
pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=True)
:
Predicted01All
Actual
082688
194150
All9147138

:
# Render the confusion matrix as an annotated heatmap.
cnf_matrix = confusion_matrix(y_test, y_pred)
frame = pd.DataFrame(cnf_matrix)
p = sns.heatmap(frame, annot=True, fmt='g', cmap="RdYlGn")
plt.title('Confusion matrix', y=1.1)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
:
Text(0.5, 15.0, 'Predicted label')

Analyze the Classification Report

from sklearn.metrics import classification_report
# Per-class precision/recall/F1 plus support and overall accuracy on the test split.
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.90      0.93      0.92        88
           1       0.87      0.82      0.85        50

    accuracy                           0.89       138
   macro avg       0.89      0.88      0.88       138
weighted avg       0.89      0.89      0.89       138
:
#ROC
# Probability of the positive class (column 1) for each test row; needed
# because the ROC curve sweeps over probability thresholds, not hard labels.
y_pred_proba = knn.predict_proba(X_test)[:,1]
:
from sklearn.metrics import roc_curve

# False/true positive rates at every probability threshold the model produces.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
for label, values in (('FPR', fpr), ('TPR', tpr), ('Thresholds', thresholds)):
    print(label)
    print(values)
FPR
[0.         0.01136364 0.02272727 0.04545455 0.04545455 0.05681818
 0.06818182 0.07954545 0.07954545 0.09090909 0.13636364 0.30681818
 1.        ]
TPR
[0.   0.1  0.52 0.62 0.74 0.76 0.82 0.84 0.88 0.88 0.9  0.96 1.  ]
Thresholds
[2.         1.         0.90909091 0.81818182 0.72727273 0.63636364
 0.54545455 0.45454545 0.36363636 0.27272727 0.18181818 0.09090909
 0.        ]

:
# Plotting the ROC Curve
# Dashed diagonal = the no-skill baseline (random guessing).
plt.plot([0,1],[0,1],'k--')
plt.plot(fpr,tpr, label='Knn')
plt.xlabel('fpr')
plt.ylabel('tpr')
plt.title('Knn(n_neighbors=11) ROC curve')
plt.show()

:
#Area under ROC curve
from sklearn.metrics import roc_auc_score
# AUC in [0.5, 1.0]; closer to 1 means better ranking of positives over negatives.
roc_auc_score(y_test,y_pred_proba)
0.9322727272727271


Hyper Parameter optimization

:
#import GridSearchCV
from sklearn.model_selection import GridSearchCV
#In case of classifier like knn the parameter to be tuned is n_neighbors

# Try every k from 1 to 49 with 5-fold cross-validation.
# NOTE(review): the search is fitted on the full X,y — including the rows held
# out as the test split above — so best_score_ is not directly comparable to
# the earlier held-out test score.
param_grid = {'n_neighbors':np.arange(1,50)}
knn = KNeighborsClassifier()
knn_cv= GridSearchCV(knn,param_grid,cv=5)
knn_cv.fit(X,y)

print("Best Score:" + str(knn_cv.best_score_))
print("Best Parameters: " + str(knn_cv.best_params_))

Best Score:0.8985013223626211
Best Parameters: {'n_neighbors': 12}


Use the k=12 found above and fit the model again.
:
# Refit with the grid-search winner (k=12) and score it on the held-out split.
knn = KNeighborsClassifier(n_neighbors=12)
knn.fit(X_train, y_train)
knn.score(X_test, y_test)
0.8768115942028986










No comments:

Post a Comment