[classification] STAY or LEAVE
1. Data preprocessing
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
df = pd.read_csv("./mobile_cust_churn/mobile_cust_churn.csv")
df.drop(columns=['Unnamed: 0','id'], axis=1, inplace=True)
target = 'CHURN'
features = df.columns.tolist()[:-1]
numeric_features = df.select_dtypes(include=['int64']).columns.tolist()
category_features= []
for col in features:
    if col not in numeric_features:
        category_features.append(col)
df.isnull().sum().sum() # 0
# check the target distribution
df[target].value_counts()
data = pd.DataFrame(df[target].value_counts())
sns.barplot(data = data, x=data.index, y="CHURN")
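Since plain accuracy is used as the main metric later on, it helps to know how balanced the two classes actually are. A small addition, not in the original notebook, using the same df and target:
# class balance as proportions (STAY vs. LEAVE)
df[target].value_counts(normalize=True)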
for col in category_features:
    print('-'*30)
    print(col)
    print(pd.DataFrame(df[col].value_counts()))

for col in category_features:
    data = pd.DataFrame(df[col].value_counts())
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10,4))
    sns.countplot(x=col, data=df, ax=ax1)
    sns.countplot(x=col, data=df, hue=target, ax=ax2)

for col in numeric_features:
    fig, ax = plt.subplots(figsize=(6,4))
    sns.kdeplot(data=df[col], color='k', label='all_data')
    sns.kdeplot(data=df, x=col, hue=target)
corrmat = df.corr()
fig, ax = plt.subplots(figsize=(10,6))
sns.heatmap(corrmat, vmax=0.8, annot=True)
# inspect the variable pairs with correlation above 0.7
a = ['HANDSET_PRICE','INCOME']
b = ['OVER_15MINS_CALLS_PER_MONTH','OVERAGE']
for col in a:
    fig, ax = plt.subplots(figsize=(6,4))
    sns.kdeplot(data=df[col], color='k', label='all_data')
    sns.kdeplot(data=df, x=col, hue=target)
    ax.set_title(f"{col}")

for col in b:
    fig, ax = plt.subplots(figsize=(6,4))
    sns.kdeplot(data=df[col], color='k', label='all_data')
    sns.kdeplot(data=df, x=col, hue=target)
    ax.set_title(f"{col}")
# category_features ▶ one-hot encoding
df_onehot = pd.get_dummies(df[category_features])
# merge numeric and one-hot encoded features
df2 = pd.concat([df[numeric_features], df_onehot], axis=1)
#df2
# features ▶ interaction terms
from sklearn.preprocessing import PolynomialFeatures
interaction = PolynomialFeatures(degree=2, include_bias=False)
feature_interaction = interaction.fit_transform(df2)
feature_interaction = pd.DataFrame(feature_interaction, columns=interaction.get_feature_names_out())  # on older scikit-learn versions use get_feature_names()
print(feature_interaction.shape)
#feature_interaction.head(3)
# target ▶ binary variable (STAY = 1, LEAVE = 0)
y = np.where(df[target]=='STAY',1,0)
df2.shape, df[target].shape
2. Modeling with several algorithms
from sklearn.model_selection import train_test_split
X_train, X_valtest, y_train, y_valtest = train_test_split(
    df2, y, test_size=0.5, random_state=24)
X_train_poly, X_valtest_poly, y_train_poly, y_valtest_poly = train_test_split(
    feature_interaction, df[target], test_size=0.5, random_state=24)
X_val, X_test, y_val, y_test = train_test_split(
    X_valtest, y_valtest, test_size=0.4, random_state=24)
X_val_poly, X_test_poly, y_val_poly, y_test_poly = train_test_split(
    X_valtest_poly, y_valtest_poly, test_size=0.4, random_state=24)
for X in [X_train, X_val, X_test]:
    for col in X.columns:
        X[col] = X[col].astype(np.int64)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
results = pd.DataFrame(columns=['accuracy', 'accuracy_poly'])
#KNN
knn = KNeighborsClassifier(n_neighbors=3)
knn_poly= KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
knn_poly.fit(X_train_poly, y_train_poly)
pred_knn = knn.predict(X_val)
pred_knn_poly = knn_poly.predict(X_val_poly)
results.loc['knn'] = [accuracy_score(y_val, pred_knn), accuracy_score(y_val_poly, pred_knn_poly)]
# Logistic Regression
lr = LogisticRegression()
lr_poly = LogisticRegression()
lr.fit(X_train, y_train)
lr_poly.fit(X_train_poly, y_train_poly)
pred_lr = lr.predict(X_val)
pred_lr_poly = lr_poly.predict(X_val_poly)
results.loc['lr'] = [accuracy_score(y_val, pred_lr), accuracy_score(y_val_poly, pred_lr_poly)]
# Decision Tree
dt = DecisionTreeClassifier()
dt_poly = DecisionTreeClassifier()
dt.fit(X_train, y_train)
dt_poly.fit(X_train_poly, y_train_poly)
pred_dt = dt.predict(X_val)
pred_dt_poly = dt_poly.predict(X_val_poly)
results.loc['dt'] = [accuracy_score(y_val, pred_dt), accuracy_score(y_val_poly, pred_dt_poly)]
# RandomForest Classifier
rf = RandomForestClassifier()
rf_poly = RandomForestClassifier()
rf.fit(X_train, y_train)
rf_poly.fit(X_train_poly, y_train_poly)
pred_rf = rf.predict(X_val)
pred_rf_poly = rf_poly.predict(X_val_poly)
results.loc['rf'] = [accuracy_score(y_val, pred_rf), accuracy_score(y_val_poly, pred_rf_poly)]
#XGBClassifier
xgb = XGBClassifier()
xgb_poly = XGBClassifier()
xgb.fit(X_train, y_train)
xgb_poly.fit(X_train_poly, y_train_poly)
pred_xgb = xgb.predict(X_val)
pred_xgb_poly = xgb_poly.predict(X_val_poly)
results.loc['xgb'] = [accuracy_score(y_val, pred_xgb), accuracy_score(y_val_poly, pred_xgb_poly)]
results
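To make the comparison easier to read, the validation results can be sorted by accuracy; a small convenience step on the results frame built above:
# sort models by validation accuracy, best first
results.sort_values(by='accuracy', ascending=False)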
3. Cross-validation
# cross-validation splits train/validation internally, so merge them back together
X = pd.concat([X_train, X_val],axis=0)
y = list(y_train) + list(y_val)
from sklearn.model_selection import GridSearchCV
# RandomForest
param_grid = dict(n_estimators=[50, 100, 300],
                  max_depth=[3, 5, 10, None])
gridsearch = GridSearchCV(RandomForestClassifier(), param_grid, cv=5, verbose=1, n_jobs=-1)
best_model = gridsearch.fit(X,y)
print("n_estimators :",best_model.best_estimator_.get_params()['n_estimators'])
print("max_depth :", best_model.best_estimator_.get_params()['max_depth'])
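For reference, the fitted GridSearchCV object also exposes the winning parameter combination and its mean cross-validated score directly; a quick check using the same best_model:
# best parameter combination and its mean CV accuracy from the grid search
print(best_model.best_params_)
print("best CV accuracy :", best_model.best_score_)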
# evaluate the model
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
cross_val_score(gridsearch, StandardScaler().fit_transform(X), y, cv=3).mean()
# regularizing the SVM (LinearSVC)
from sklearn.svm import LinearSVC
svc = LinearSVC(C=1)
param_grid = dict(C=[0.01,0.1,1,10,100])
gridsearch_svc = GridSearchCV(svc, param_grid, cv=5, verbose=1, n_jobs=-1)
cross_val_score(gridsearch_svc, StandardScaler().fit_transform(X), y, cv=3)
best_model.best_estimator_
# trying different scoring metrics
svc = LinearSVC(C=100)
cross_val_score(svc, StandardScaler().fit_transform(X), y,
cv=3, n_jobs=-1, scoring='accuracy')
svc = LinearSVC(C=100)
cross_val_score(svc, StandardScaler().fit_transform(X), y,
cv=3, n_jobs=-1, scoring='precision')
svc = LinearSVC(C=100)
cross_val_score(svc, StandardScaler().fit_transform(X), y,
cv=3, n_jobs=-1, scoring='recall')
svc = LinearSVC(C=100)
cross_val_score(svc, StandardScaler().fit_transform(X), y,
cv=3, n_jobs=-1, scoring='f1')
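The four calls above differ only in the scoring argument, so they can be collapsed into a single loop; a minimal sketch with the same estimator and folds:
# same LinearSVC(C=100) evaluated under four scoring metrics
for scoring in ['accuracy', 'precision', 'recall', 'f1']:
    svc = LinearSVC(C=100)
    scores = cross_val_score(svc, StandardScaler().fit_transform(X), y,
                             cv=3, n_jobs=-1, scoring=scoring)
    print(f"{scoring:9s}: {scores.mean():.4f}")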
#confusion_matrix
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, classification_report
rf = RandomForestClassifier(n_estimators=100, max_depth=10)
ss = StandardScaler()
ss.fit(X_train)
rf.fit(ss.transform(X_train), y_train)
pred_rf = rf.predict(ss.transform(X_val))
cmat = confusion_matrix(y_val, pred_rf)
print(classification_report(y_val, pred_rf))
print(cmat)
print("accuracy : ", (cmat[0][0]+cmat[1][1])/cmat.sum())
print("precision : ", (cmat[1][1])/(cmat[0][1]+cmat[1][1]))
print("precision2 : ", precision_score(y_val, pred_rf))
# feature importance
high_importances = pd.Series(rf.feature_importances_,
index=X_train.columns).sort_values(ascending=False)
high_importances[:8]
high = high_importances[:8].index
rf = RandomForestClassifier(n_estimators=100, max_depth=10)
cross_val_score(rf, StandardScaler().fit_transform(X[high]), y, cv=3, n_jobs=-1)
▶ Even with only the top 8 features, the results are not much different.
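To back that observation up with numbers, the mean cross-validated accuracy with all features and with only the top-8 features can be printed side by side; a minimal sketch reusing X, y and high from above:
# compare mean CV accuracy: all features vs. top-8 by importance
rf_all = RandomForestClassifier(n_estimators=100, max_depth=10)
rf_top = RandomForestClassifier(n_estimators=100, max_depth=10)
acc_all = cross_val_score(rf_all, StandardScaler().fit_transform(X), y, cv=3, n_jobs=-1).mean()
acc_top = cross_val_score(rf_top, StandardScaler().fit_transform(X[high]), y, cv=3, n_jobs=-1).mean()
print("all features  :", acc_all)
print("top-8 features:", acc_top)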
# ROC_AUC
from sklearn.metrics import roc_curve, roc_auc_score
ss = StandardScaler()
ss.fit(X_train)
logit = LogisticRegression()
logit.fit(ss.transform(X_train), y_train)
target_probabilities = logit.predict_proba(ss.transform(X_val))[:,1]
target_probabilities
false_positive_rate, true_positive_rate, threshold = roc_curve(y_val, target_probabilities)
plt.title("Receiver Operating Characteristic")
plt.plot(false_positive_rate, true_positive_rate)
plt.plot([0,1], ls='--')
plt.plot([0,0], [1,0], c='.7'), plt.plot([1,1], c='.7')
plt.ylabel("True Positive Rate")
plt.xlabel("False Positive Rate")
idx = 2661
print("Threshold :", threshold[idx])
print("True Positive Rate :", true_positive_rate[idx])
print("False Positive Rate :", false_positive_rate[idx])
roc_auc_score(y_val, target_probabilities)
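Instead of hard-coding an index like 2661, a single operating point can be chosen from the ROC arrays, for example by Youden's J statistic (TPR - FPR); this is an illustrative sketch, not part of the original analysis:
# pick the threshold that maximizes TPR - FPR (Youden's J)
j_scores = true_positive_rate - false_positive_rate
best_idx = np.argmax(j_scores)
print("Best threshold :", threshold[best_idx])
print("TPR at best    :", true_positive_rate[best_idx])
print("FPR at best    :", false_positive_rate[best_idx])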
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
ss = StandardScaler()
ss.fit(X_train)
logit = LogisticRegression()
logit.fit(ss.transform(X_train), y_train)
target_probabilities = logit.predict_proba(ss.transform(X_val))[:,1]
target_probabilities
precision, recall, threshold = precision_recall_curve(y_val, target_probabilities)
plt.title("Precision-Recall Curve")
plt.plot(precision, recall)
plt.xlabel("Precision")
plt.ylabel("Recall")
plt.plot([0,1],ls='--')
plt.plot([1,1],c=".7"), plt.plot([1,1],[1,0],c=".7")
plt.show()
print(average_precision_score(y_val, target_probabilities))
Cs=[0.01, 0.1, 1, 10,100]
ss = StandardScaler()
ss.fit(X_train)
for C in Cs:
    logit = LogisticRegression(C=C)
    logit.fit(ss.transform(X_train), y_train)
    target_probabilities = logit.predict_proba(ss.transform(X_val))[:,1]
    precision, recall, threshold = precision_recall_curve(y_val, target_probabilities)
    plt.title(f"C={C} Precision-Recall Curve")
    plt.plot(precision, recall)
    plt.xlabel("Precision")
    plt.ylabel("Recall")
    plt.plot([0,1], ls='--')
    plt.plot([1,1], c=".7"), plt.plot([1,1], [1,0], c=".7")
    plt.show()
    print(f"------ C = {C} ------")
    print(average_precision_score(y_val, target_probabilities))
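The test split created in section 2 has not been used yet; as a final sanity check, the tuned RandomForest could be refit on the combined train+validation data and scored once on the held-out test set. A minimal sketch, assuming the variables from the earlier sections (X, y, X_test, y_test, best_model) are still in scope; this step is not in the original write-up:
# refit the tuned RandomForest on train+validation and score on the untouched test split
final_rf = RandomForestClassifier(
    n_estimators=best_model.best_estimator_.get_params()['n_estimators'],
    max_depth=best_model.best_estimator_.get_params()['max_depth'])
ss_final = StandardScaler().fit(X)
final_rf.fit(ss_final.transform(X), y)
pred_test = final_rf.predict(ss_final.transform(X_test))
print("test accuracy :", accuracy_score(y_test, pred_test))
print(classification_report(y_test, pred_test))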