[classification] STAY or LEAVE

BOTTLE6 2021. 3. 21. 11:07

1. Data preprocessing

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv("./mobile_cust_churn/mobile_cust_churn.csv")

# drop the exported index column and the id column
df.drop(columns=['Unnamed: 0', 'id'], inplace=True)

# target and feature lists
target = 'CHURN'
features = df.columns.tolist()[:-1]   # all columns except the target (last column)

# split into numeric and categorical features
numeric_features = df.select_dtypes(include=['int64']).columns.tolist()

category_features = []
for col in features:
    if col not in numeric_features:
        category_features.append(col)
    
df.isnull().sum().sum() # 0

# check the target distribution
df[target].value_counts()
data = pd.DataFrame(df[target].value_counts())
sns.barplot(data = data, x=data.index, y="CHURN")

for col in category_features:
    print('-'*30)
    print(col)
    print(pd.DataFrame(df[col].value_counts()))
    
for col in category_features:
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10,4))
    sns.countplot(x=col, data=df, ax=ax1)
    sns.countplot(x=col, data=df, hue=target, ax=ax2)

for col in numeric_features:
    fig, ax = plt.subplots(figsize=(6,4))
    sns.kdeplot(data=df[col], color='k', label='all_data')
    sns.kdeplot(data=df, x=col, hue=target)

corrmat = df.corr()
fig, ax = plt.subplots(figsize=(10,6))
sns.heatmap(corrmat, vmax=0.8, annot=True)

# check the pairs of variables whose correlation exceeds 0.7
a = ['HANDSET_PRICE', 'INCOME']
b = ['OVER_15MINS_CALLS_PER_MONTH', 'OVERAGE']

for col in a + b:
    fig, ax = plt.subplots(figsize=(6,4))
    sns.kdeplot(data=df[col], color='k', label='all_data')
    sns.kdeplot(data=df, x=col, hue=target)
    ax.set_title(f"{col}")
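
The pairs in a and b above were read off the heatmap by eye; a small sketch (using the corrmat computed above) that pulls out every pair over the 0.7 cutoff programmatically:

# list feature pairs whose absolute correlation exceeds 0.7 (upper triangle only, no self-pairs)
upper = corrmat.where(np.triu(np.ones(corrmat.shape, dtype=bool), k=1))
high_corr_pairs = upper.stack()
print(high_corr_pairs[high_corr_pairs.abs() > 0.7])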

# category_features ▶ one-hot encoding
df_onehot = pd.get_dummies(df[category_features])

#merging
df2 = pd.concat([df[numeric_features], df_onehot],axis=1)
#df2

# features ▶ interaction terms (degree-2 polynomial features)
from sklearn.preprocessing import PolynomialFeatures

interaction = PolynomialFeatures(degree=2, include_bias=False)
feature_interaction = interaction.fit_transform(df2)
feature_interaction = pd.DataFrame(feature_interaction, columns=interaction.get_feature_names())
print(feature_interaction.shape)
#feature_interaction.head(3)
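
get_feature_names() without arguments labels the interaction columns x0, x1, x0 x1, ...; if the original column names are preferred they can be passed in. A sketch (on newer scikit-learn versions the method is get_feature_names_out instead):

# relabel the interaction features with the original df2 column names
feature_interaction.columns = interaction.get_feature_names(df2.columns.tolist())
feature_interaction.head(3)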

# target ▶ binary variable (STAY = 1, LEAVE = 0)
y = np.where(df[target]=='STAY',1,0)

df2.shape, df[target].shape

2. Modeling with several models

from sklearn.model_selection import train_test_split

X_train, X_valtest, y_train, y_valtest = train_test_split(
    df2, y, test_size=0.5, random_state=24)
X_train_poly, X_valtest_poly, y_train_poly, y_valtest_poly = train_test_split(
    feature_interaction, df[target], test_size=0.5, random_state=24)
    
X_val, X_test, y_val, y_test = train_test_split(
	X_valtest, y_valtest, test_size=0.4, random_state=24)
X_val_poly, X_test_poly, y_val_poly, y_test_poly = train_test_split(
    X_valtest_poly, y_valtest_poly, test_size=0.4, random_state=24)

# cast every column (including the one-hot columns) to int64
for X in [X_train, X_val, X_test]:
    for col in X.columns:
        X[col] = X[col].astype(np.int64)
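
Note that none of the splits above are stratified; a hedged variant of the first split that keeps the STAY/LEAVE ratio equal across the pieces (not what was actually run here) would be:

# same split as above, but preserving the class ratio with stratify
X_train, X_valtest, y_train, y_valtest = train_test_split(
    df2, y, test_size=0.5, random_state=24, stratify=y)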

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

results = pd.DataFrame(columns=['accuracy', 'accuracy_poly'])

#KNN
knn = KNeighborsClassifier(n_neighbors=3)
knn_poly= KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
knn_poly.fit(X_train_poly, y_train_poly)
pred_knn = knn.predict(X_val)
pred_knn_poly = knn_poly.predict(X_val_poly)
results.loc['knn'] = [accuracy_score(y_val, pred_knn), accuracy_score(y_val_poly, pred_knn_poly)]

# Logistic Regression
lr = LogisticRegression()
lr_poly = LogisticRegression()
lr.fit(X_train, y_train)
lr_poly.fit(X_train_poly, y_train_poly)
pred_lr = lr.predict(X_val)
pred_lr_poly = lr_poly.predict(X_val_poly)
results.loc['lr'] = [accuracy_score(y_val, pred_lr), accuracy_score(y_val_poly, pred_lr_poly)]

# Decision Tree
dt = DecisionTreeClassifier()
dt_poly = DecisionTreeClassifier()
dt.fit(X_train, y_train)
dt_poly.fit(X_train_poly, y_train_poly)
pred_dt = dt.predict(X_val)
pred_dt_poly = dt_poly.predict(X_val_poly)
results.loc['dt'] = [accuracy_score(y_val, pred_dt), accuracy_score(y_val_poly, pred_dt_poly)]

# RandomForest Classifier
rf = RandomForestClassifier()
rf_poly = RandomForestClassifier()
rf.fit(X_train, y_train)
rf_poly.fit(X_train_poly, y_train_poly)
pred_rf = rf.predict(X_val)
pred_rf_poly = rf_poly.predict(X_val_poly)
results.loc['rf'] = [accuracy_score(y_val, pred_rf), accuracy_score(y_val_poly, pred_rf_poly)]

#XGBClassifier
xgb = XGBClassifier()
xgb_poly = XGBClassifier()
xgb.fit(X_train, y_train)
xgb_poly.fit(X_train_poly, y_train_poly)
pred_xgb = xgb.predict(X_val)
pred_xgb_poly = xgb_poly.predict(X_val_poly)
results.loc['xgb'] = [accuracy_score(y_val, pred_xgb), accuracy_score(y_val_poly, pred_xgb_poly)]
results

3. Cross-validation

# cross-validation handles the train/validation split internally, so merge them back together

X = pd.concat([X_train, X_val],axis=0)
y = list(y_train) + list(y_val)
from sklearn.model_selection import GridSearchCV

# RandomForest
param_grid = dict(n_estimators=[50,100,300],
                 max_depth=[3,5,10,None])

gridsearch = GridSearchCV(RandomForestClassifier(), param_grid, cv=5, verbose=1, n_jobs=-1)
best_model = gridsearch.fit(X,y)
print("n_estimators :",best_model.best_estimator_.get_params()['n_estimators'])
print("max_depth :", best_model.best_estimator_.get_params()['max_depth'])

# evaluate the model: passing the GridSearchCV object to cross_val_score gives a nested cross-validation score
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler

cross_val_score(gridsearch, StandardScaler().fit_transform(X), y, cv=3).mean()

# regularizing the SVM

from sklearn.svm import LinearSVC

svc = LinearSVC(C=1)
param_grid = dict(C=[0.01, 0.1, 1, 10, 100])
gridsearch_svc = GridSearchCV(svc, param_grid, cv=5, verbose=1, n_jobs=-1)
cross_val_score(gridsearch_svc, StandardScaler().fit_transform(X), y, cv=3)

# fit the grid search once on the scaled data to see which C it picks
gridsearch_svc.fit(StandardScaler().fit_transform(X), y)
gridsearch_svc.best_estimator_

# trying different scoring metrics

for scoring in ['accuracy', 'precision', 'recall', 'f1']:
    svc = LinearSVC(C=100)
    scores = cross_val_score(svc, StandardScaler().fit_transform(X), y,
                             cv=3, n_jobs=-1, scoring=scoring)
    print(scoring, ":", scores)

# confusion_matrix

from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, classification_report

rf = RandomForestClassifier(n_estimators=100, max_depth=10)
ss = StandardScaler()
ss.fit(X_train)

rf.fit(ss.transform(X_train), y_train)
pred_rf = rf.predict(ss.transform(X_val))
cmat = confusion_matrix(y_val, pred_rf)
print(classification_report(y_val, pred_rf))
print(cmat)
print("accuracy : ", (cmat[0][0]+cmat[1][1])/cmat.sum())
print("precision : ", (cmat[1][1])/(cmat[0][1]+cmat[1][1]))
print("precision2 : ", precision_score(y_val, pred_rf))

                                    

# feature importance

high_importances = pd.Series(rf.feature_importances_,
                             index=X_train.columns).sort_values(ascending=False)
high_importances[:8]

high = high_importances[:8].index
rf = RandomForestClassifier(n_estimators=100, max_depth=10)
cross_val_score(rf, StandardScaler().fit_transform(X[high]), y, cv=3, n_jobs=-1)

▶ Even with only the top 8 features, the results are not much different.

# ROC_AUC

from sklearn.metrics import roc_curve, roc_auc_score

ss = StandardScaler()
ss.fit(X_train)

logit = LogisticRegression()
logit.fit(ss.transform(X_train), y_train)
target_probabilities = logit.predict_proba(ss.transform(X_val))[:,1]
target_probabilities

false_positive_rate, true_positive_rate, threshold = roc_curve(y_val, target_probabilities)

plt.title("Receiver Operationg Charasteristic")
plt.plot(false_positive_rate, true_positive_rate)
plt.plot([0,1], ls='--')
plt.plot([0,0], [1,0], c='.7'), plt.plot([1,1], c='.7')
plt.ylabel("True Positive Rate")
plt.xlabel("False Positive Rate")

idx = 2661
print("threshold :", threshold[idx])
print("True Positive Rate :", true_positive_rate[idx])
print("False Positive Rate :", false_positive_rate[idx])

roc_auc_score(y_val, target_probabilities)

from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score

ss = StandardScaler()
ss.fit(X_train)

logit = LogisticRegression()
logit.fit(ss.transform(X_train), y_train)
target_probabilities = logit.predict_proba(ss.transform(X_val))[:,1]
target_probabilities
precision, recall, threshold = precision_recall_curve(y_val, target_probabilities)

plt.title("Precision-Recall Curve")
plt.plot(precision, recall)
plt.xlabel("Precision")
plt.ylabel("Recall")
plt.plot([0,1],ls='--')
plt.plot([1,1],c=".7"), plt.plot([1,1],[1,0],c=".7")
plt.show()

print(average_precision_score(y_val, target_probabilities))

Cs=[0.01, 0.1, 1, 10,100]
ss = StandardScaler()
ss.fit(X_train)

for C in Cs:
    logit = LogisticRegression(C=C)
    logit.fit(ss.transform(X_train), y_train)
    target_probabilities = logit.predict_proba(ss.transform(X_val))[:,1]
    precision, recall, threshold = precision_recall_curve(y_val, target_probabilities)
    plt.title(f"C={C} Precision-Recall Curve")
    plt.plot(precision, recall)
    plt.xlabel("Precision")
    plt.ylabel("Recall")
    plt.plot([0,1],ls='--')
    plt.plot([1,1],c=".7"), plt.plot([1,1],[1,0],c=".7")
    plt.show()
    print(f"------ C = {C} ------")
    print(average_precision_score(y_val, target_probabilities))
