Exploratory Data Analysis
# importing the necessary libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

data_socialnetwork_ads = pd.read_csv('Social_Network_Ads.csv')
data_socialnetwork_ads.head()
data_socialnetwork_ads.info(verbose=True)

O/p:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 414 entries, 0 to 413
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   Age              414 non-null    int64
 1   EstimatedSalary  414 non-null    int64
 2   Purchased        414 non-null    int64
dtypes: int64(3)
memory usage: 9.8 KB
data_socialnetwork_ads.describe(include="all")  # include='all' includes categorical columns as well
data_socialnetwork_ads.describe(include="all").T  # .T transposes the summary for easier reading
Age and EstimatedSalary cannot realistically be zero, so let's check whether there are any missing or zero values.
data_socialnetwork_ads.isnull().sum()
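The post uses data_socialnetwork_ads_copy below without showing how it is created. A minimal sketch of the likely missing step, assuming zeros in Age and EstimatedSalary are treated as missing values and replaced with NaN:

# count zero values in the two feature columns (reconstruction, not shown in the original post)
print((data_socialnetwork_ads[['Age', 'EstimatedSalary']] == 0).sum())

# work on a copy and mark zeros as missing so they can be imputed later
data_socialnetwork_ads_copy = data_socialnetwork_ads.copy(deep=True)
data_socialnetwork_ads_copy[['Age', 'EstimatedSalary']] = data_socialnetwork_ads_copy[['Age', 'EstimatedSalary']].replace(0, np.nan)
data_socialnetwork_ads_copy.isnull().sum()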
p = data_socialnetwork_ads_copy.hist(figsize=(10, 10))  # histograms of each column
# impute missing values with the column mean
data_socialnetwork_ads_copy['Age'] = data_socialnetwork_ads_copy['Age'].fillna(data_socialnetwork_ads_copy['Age'].mean())
data_socialnetwork_ads_copy['EstimatedSalary'] = data_socialnetwork_ads_copy['EstimatedSalary'].fillna(data_socialnetwork_ads_copy['EstimatedSalary'].mean())
data_socialnetwork_ads_copy.isnull().sum()
Scaling the data
# separate features (X) and target (y)
X = data_socialnetwork_ads_copy.drop("Purchased", axis=1)
y = data_socialnetwork_ads_copy.Purchased
X.head()
The features are on very different scales, so let's standardise them.
# scaling the data
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X = pd.DataFrame(sc.fit_transform(X))
X.head()  # inspect the scaled data

The scaled data looks fine.
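As a quick sanity check (not part of the original post), each scaled column should now have a mean close to 0 and a standard deviation close to 1:

# sanity check on the standardised features
X.describe().T[['mean', 'std']]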
Let's split the data into train and test sets
#importing train_test_split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/3, random_state=42, stratify=y)
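stratify=y keeps the class proportions the same in the train and test splits. A quick check of the class balance (a small sketch, not shown in the original post):

# the Purchased proportions in y, y_train and y_test should be nearly identical because of stratify=y
print(y.value_counts(normalize=True))
print(y_train.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))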
from sklearn.neighbors import KNeighborsClassifier
test_scores = []
train_scores = []
# evaluate KNN for k = 1 to 14 and record train and test accuracy
for i in range(1, 15):
    knn = KNeighborsClassifier(i)
    knn.fit(X_train, y_train)
    train_scores.append(knn.score(X_train, y_train))
    test_scores.append(knn.score(X_test, y_test))

Result Visualisation
plt.figure(figsize=(12, 5))
p = sns.lineplot(x=range(1, 15), y=train_scores, marker='*', label='Train Score')
p = sns.lineplot(x=range(1, 15), y=test_scores, marker='o', label='Test Score')
## score that comes from testing on the same datapoints that were used for training
max_train_score = max(train_scores)
train_scores_ind = [i for i, v in enumerate(train_scores) if v == max_train_score]
print('Max train score {} % and k = {}'.format(max_train_score*100, list(map(lambda x: x+1, train_scores_ind))))

O/p:
Max train score 99.27536231884058 % and k = [1]
## score that comes from testing on the datapoints that were split in the beginning to be used for testing solely
max_test_score = max(test_scores)
test_scores_ind = [i for i, v in enumerate(test_scores) if v == max_test_score]
print('Max test score {} % and k = {}'.format(max_test_score*100, list(map(lambda x: x+1, test_scores_ind))))

O/p:
Max test score 89.13043478260869 % and k = [11]
# use k=11
# set up a knn classifier with k=11 neighbours
knn = KNeighborsClassifier(11)
knn.fit(X_train, y_train)
knn.score(X_test, y_test)

O/p:
0.8913043478260869
Confusion Matrix
y_pred = knn.predict(X_test)
y_pred
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)
pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=True)
cnf_matrix = confusion_matrix(y_test, y_pred)
p = sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap="RdYlGn", fmt='g')
plt.title('Confusion matrix', y=1.1)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
Analyse the Classification Report
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))
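The report summarises precision, recall and F1-score per class along with the overall accuracy. The same numbers can also be computed individually (a small sketch, not part of the original post):

# individual metrics for the positive class (Purchased = 1)
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print('Accuracy :', accuracy_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred))
print('Recall   :', recall_score(y_test, y_pred))
print('F1 score :', f1_score(y_test, y_pred))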
# Plotting the ROC Curve
from sklearn.metrics import roc_curve
y_pred_proba = knn.predict_proba(X_test)[:, 1]  # predicted probability of the positive class
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr, tpr, label='Knn')
plt.xlabel('fpr')
plt.ylabel('tpr')
plt.title('Knn(n_neighbors=11) ROC curve')
plt.show()
# Area under the ROC curve
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, y_pred_proba)

O/p:
0.9322727272727271
Hyperparameter Optimization
# import GridSearchCV
from sklearn.model_selection import GridSearchCV
# for a classifier like knn, the parameter to be tuned is n_neighbors
param_grid = {'n_neighbors': np.arange(1, 50)}
knn = KNeighborsClassifier()
knn_cv = GridSearchCV(knn, param_grid, cv=5)
knn_cv.fit(X, y)
print("Best Score:" + str(knn_cv.best_score_))
print("Best Parameters: " + str(knn_cv.best_params_))

O/p:
Best Score:0.8985013223626211
Best Parameters: {'n_neighbors': 12}

Let's try k=12 and fit the model again:

# set up a knn classifier with k=12 neighbours
knn = KNeighborsClassifier(12)
knn.fit(X_train, y_train)
knn.score(X_test, y_test)

O/p:
0.8768115942028986
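Note that the grid search was fitted on the full dataset with 5-fold cross-validation, so its best score is an average over folds; the k it picks does not have to give the highest accuracy on this particular test split, which is why k=12 scores slightly lower here than k=11. The refitted best model can also be taken straight from the search object (a small sketch, assuming the knn_cv object fitted above):

# GridSearchCV refits the best model on the data it was given;
# best_estimator_ exposes that refitted classifier directly
best_knn = knn_cv.best_estimator_
print(best_knn.get_params()['n_neighbors'])  # 12, per the grid-search result above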