[classification] titanic

BOTTLE6 2021. 3. 20. 10:51
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

train = pd.read_csv('./titanic/train.csv')
test = pd.read_csv('./titanic/test.csv')

1. Data Preprocessing

# check null data
train.isnull().sum()
test.isnull().sum()
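For a side-by-side view of the missing counts in both sets, a small convenience sketch (plain pandas, using nothing beyond the two frames loaded above):

# missing counts per column, train vs test
pd.concat([train.isnull().sum(), test.isnull().sum()], axis=1, keys=['train', 'test'])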

# categorical / numeric feature separation
target = 'Survived'
train[target].value_counts()
features = train.columns.tolist()[2:]  # excluding "PassengerId" and "Survived"
categorical_f = list(train[features].select_dtypes(include='object').columns)  # object
numeric_f = list(train[features].select_dtypes(exclude='object').columns)  # int64, float64

train['Cabin'].fillna("Unknown", inplace=True)   # replace with "Unknown"
train['Embarked'].fillna("S", inplace=True)      # replace with the mode ("S")
test['Cabin'].fillna("Unknown", inplace=True)    # replace with "Unknown"

train['Age'].fillna(train['Age'].mean(), inplace=True)  # replace with the train mean
test['Age'].fillna(train['Age'].mean(), inplace=True)   # replace with the train mean

2. Data Visualization

fig, axes = plt.subplots(figsize=(12, 8))
sns.kdeplot(data=train['Age'], color='k', label='all')
sns.kdeplot(data=train.loc[train.Survived==1, 'Age'], color='r', label='Survived==1')
sns.kdeplot(data=train.loc[train.Survived==0, 'Age'], color='b', label='Survived==0')
axes.legend(loc='best')
axes.set_title("Age Distribution (KDE)")

# target = classification
# categorical features ▶ countplot with hue=target

# keep only the first letter of Cabin, leaving "Unknown" rows untouched
# (assigning a filtered slice back would set those rows to NaN via index alignment)
train['Cabin'] = np.where(train['Cabin'] == "Unknown", "Unknown", train['Cabin'].str[0])
test['Cabin'] = np.where(test['Cabin'] == "Unknown", "Unknown", test['Cabin'].str[0])

for col in categorical_f:
    if col != "Name":
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
        sns.countplot(x=col, data=train, ax=ax1)
        sns.countplot(x=col, data=train, hue="Survived", ax=ax2)
        
# drop the high-cardinality text columns from the categorical list
categorical_f.remove('Ticket')
categorical_f.remove('Name')

for col in numeric_f:
    fig, axes = plt.subplots(figsize=(6, 4))
    sns.kdeplot(data=train[col], color='k', label='all')
    sns.kdeplot(data=train.loc[train.Survived==1, col], color='r', label='Survived==1')
    sns.kdeplot(data=train.loc[train.Survived==0, col], color='b', label='Survived==0')
    axes.legend(loc='best')
    axes.set_title("{} Distribution (KDE)".format(col))

train[features].describe().T


train[categorical_f].describe().T

test['Fare'].fillna(0, inplace=True)  # the test set has one missing Fare; replace it with 0
test['Fare'] = test['Fare'].astype(np.float64)

3. Train/Test Split

Using a ColumnTransformer with a OneHotEncoder

### transform with a ColumnTransformer

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

ct = ColumnTransformer([
    ('scaling', StandardScaler(), ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']),
    ('onehot', OneHotEncoder(sparse=False), ['Sex', 'Cabin', 'Embarked'])
])
data_features = train.drop(['PassengerId','Name','Ticket',"Survived"],axis=1)
y = train['Survived']

X_train, X_test, y_train, y_test = train_test_split(data_features, y, random_state=0)
ct.fit(X_train)
X_train_trans = ct.transform(X_train)
X_train_trans
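The transformed array loses its column labels. A small sketch to rebuild them, assuming the same column lists passed to ct above (get_feature_names is the older-sklearn spelling; newer versions use get_feature_names_out):

# ColumnTransformer concatenates outputs in transformer order:
# first the five scaled numeric columns, then the one-hot columns
num_cols = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
ohe = ct.named_transformers_['onehot']
ohe_cols = list(ohe.get_feature_names(['Sex', 'Cabin', 'Embarked']))
X_train_df = pd.DataFrame(X_train_trans, columns=num_cols + ohe_cols)
X_train_df.head()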

4. Model Cross-Validation


from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train_trans, y_train)

X_test_trans = ct.transform(X_test)
print("테스트 점수 : {}".format(lr.score(X_test_trans, y_test)))

▶ 테스트 점수 : 0.8026905829596412


Cross-validation (cross_val_score): returns only the scores

from sklearn.model_selection import cross_val_score

scores = cross_val_score(lr, X_train_trans, y_train, cv=5)
print("교차 검증 점수 : {}".format(scores))
print("교차 검증 점수 평균 : {}".format(scores.mean()))


Cross-validation (cross_validate): returns the scores plus fit_time and score_time

from sklearn.model_selection import cross_validate

res = cross_validate(lr, X_train_trans, y_train, cv=5)
print(pd.DataFrame(res))
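cross_validate also accepts several metrics at once; a sketch using accuracy and ROC AUC (the result keys become test_accuracy and test_roc_auc):

res_multi = cross_validate(lr, X_train_trans, y_train, cv=5,
                           scoring=['accuracy', 'roc_auc'])
print(pd.DataFrame(res_multi))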

Parameter tuning

(Manual) without cross-validation: note that this loop selects parameters directly on the test set, so the reported best score is optimistic.

from sklearn.svm import SVC 

best_score = 0

for gamma in [0.001, 0.01, 0.1, 1, 10, 100]:
    for C in [0.001, 0.01, 0.1, 1, 10, 100]:
        svm = SVC(gamma = gamma, C=C)
        svm.fit(X_train_trans, y_train)
        score = svm.score(X_test_trans, y_test)
        
        if score > best_score:
            best_score = score
            best_parameters = {"C": C, "gamma": gamma}

print("Best score : {}".format(best_score))
print("Best parameters : {}".format(best_parameters))

(GridSearchCV) with cross-validation

from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
param_grid = {"C":[0.001, 0.01, 0.1, 1, 10, 100], "gamma":[0.001, 0.01, 0.1, 1, 10, 100]}
grid_search = GridSearchCV(SVC(), param_grid, cv=5, return_train_score=True)
grid_search.fit(X_train_trans, y_train)
print("테스트 세트 점수 : {:.2f}".format(grid_search.score(X_test_trans, y_test)))
print("최적 매개변수 : ", grid_search.best_params_)
print("최고 교차 검증 점수 : {:.2f}".format(grid_search.best_score_))

5. Training Individual Machine Learning Models

① KNN

n_neighbors = 3

from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(X_train_trans, y_train)
print("테스트 세트 예측:", knn.predict(X_train_trans))
print("훈련세트 점수 : {:.2f}".format(knn.score(X_train_trans,y_train)))
print("테스트세트 점수 : {:.2f}".format(knn.score(X_test_trans,y_test)))

- Finding the optimal n_neighbors

train_accuracy = []
test_accuracy = []
for n_neighbors in range(1,20):
    knn = KNeighborsClassifier(n_neighbors = n_neighbors)
    knn.fit(X_train_trans, y_train)
    train_accuracy.append(knn.score(X_train_trans, y_train)) 
    test_accuracy.append(knn.score(X_test_trans, y_test)) 
    
plt.plot(range(1,20), train_accuracy, label="train accuracy")
plt.plot(range(1,20), test_accuracy, label="test accuracy")
plt.ylabel('accuracy')
plt.xlabel('n_neighbors')
plt.legend()
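Rather than reading the best k off the plot, it can be picked programmatically (a sketch; argmax may land on a different k than the visually chosen 12 if several values tie):

# k with the highest test accuracy from the loop above
best_k = list(range(1, 20))[int(np.argmax(test_accuracy))]
print("best n_neighbors:", best_k)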


- n_neighbors = 12

from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors = 12)
knn.fit(X_train_trans, y_train)
print("테스트 세트 예측:", knn.predict(X_train_trans))

② Ridge Regression

from sklearn.linear_model import Ridge

ridge = Ridge().fit(X_train_trans, y_train)
print("train accuracy : {:.2f}".format(ridge.score(X_train_trans, y_train)))
print("test accuracy : {:.2f}".format(ridge.score(X_test_trans, y_test)))

alphas = [0.01, 0.1, 1, 10, 100]
train_scores = []  # R^2 values, since Ridge is a regressor
test_scores = []
for alpha in alphas:
    ridge = Ridge(alpha=alpha)
    ridge.fit(X_train_trans, y_train)
    train_scores.append(ridge.score(X_train_trans, y_train))
    test_scores.append(ridge.score(X_test_trans, y_test))

print(train_scores)
print(test_scores)

ridge = Ridge()
ridge10 = Ridge(alpha=10)
ridge01 = Ridge(alpha=0.1)

ridge.fit(X_train_trans, y_train)
ridge10.fit(X_train_trans, y_train)
ridge01.fit(X_train_trans, y_train)

plt.plot(ridge10.coef_, "^", label='Ridge alpha=10')  # stronger penalty, smaller coefficients
plt.plot(ridge.coef_, "s", label='Ridge alpha=1')
plt.plot(ridge01.coef_, "v", label='Ridge alpha=0.1')

plt.xlabel("coefficient index")
plt.ylabel("coefficient value")
plt.legend(loc='upper right')
xlims = plt.xlim()
plt.hlines(0, xlims[0], xlims[1])

▶ The larger the alpha (i.e. the stronger the penalty, e.g. alpha=10), the smaller the coefficients.

from sklearn.metrics import accuracy_score
# threshold the continuous Ridge outputs at 0.5 to turn them into class labels
print(accuracy_score(y_test, np.where(ridge10.predict(X_test_trans) > 0.5, 1, 0)))
print(accuracy_score(y_test, np.where(ridge.predict(X_test_trans) > 0.5, 1, 0)))
print(accuracy_score(y_test, np.where(ridge01.predict(X_test_trans) > 0.5, 1, 0)))

③ Logistic Regression

from sklearn.linear_model import LogisticRegression

lr100 = LogisticRegression(C=100).fit(X_train_trans, y_train)
print("훈련 세트 점수 : {:.3f}".format(lr100.score(X_train_trans, y_train)))
print("테스트 세트 점수 : {:.3f}".format(lr100.score(X_test_trans, y_test)))

from sklearn.linear_model import LogisticRegression

lr001 = LogisticRegression(C=0.01).fit(X_train_trans, y_train)
print("훈련 세트 점수 : {:.3f}".format(lr001.score(X_train_trans, y_train)))
print("테스트 세트 점수 : {:.3f}".format(lr001.score(X_test_trans, y_test)))

for C,marker in zip([0.001, 1, 100], ['^','o','v']):
    lr_l2 = LogisticRegression(C=C, penalty = 'l2')
    lr_l2 = lr_l2.fit(X_train_trans, y_train)
    print("C={:.3f} l2 로지스틱회귀 훈련 정확도 : {:.2f}".format(C, lr_l2.score(X_train_trans, y_train)))
    print("C={:.3f} l2 로지스틱회귀 테스트 정확도 : {:.2f}".format(C, lr_l2.score(X_test_trans, y_test)))

④ Decision Tree

from sklearn.tree import DecisionTreeClassifier

tree=DecisionTreeClassifier()
tree.fit(X_train_trans, y_train)
print("train score : {:.3f}".format(tree.score(X_train_trans,y_train)))
print("test score : {:.3f}".format(tree.score(X_test_trans,y_test)))

tree.feature_importances_
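The raw importance array is hard to read without feature names. Reusing the num_cols and ohe_cols lists from the labeling sketch in section 3 (an assumption: that sketch must have run), the importances can be displayed as a sorted Series:

# top features by importance (names assume num_cols + ohe_cols from section 3)
pd.Series(tree.feature_importances_, index=num_cols + ohe_cols).sort_values(ascending=False).head(10)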

⑤ RandomForest

from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier().fit(X_train_trans, y_train)
print("train accuracy : {:.3f}".format(rf.score(X_train_trans, y_train)))
print("test accuracy : {:.3f}".format(rf.score(X_test_trans, y_test)))

- Varying the parameters

from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100, max_depth = 10, random_state=0,n_jobs=-1)
rf.fit(X_train_trans, y_train)
print("train accuracy : {:.3f}".format(rf.score(X_train_trans, y_train)))
print("test accuracy : {:.3f}".format(rf.score(X_test_trans, y_test)))


⑥ Gradient Boosting

from sklearn.ensemble import GradientBoostingClassifier

gbrt = GradientBoostingClassifier(random_state=0)
gbrt.fit(X_train_trans, y_train)

print("훈련 세트 정확도 : {:.3f}".format(gbrt.score(X_train_trans, y_train)))
print("테스트 세트 정확도 : {:.3f}".format(gbrt.score(X_test_trans, y_test)))

from sklearn.ensemble import GradientBoostingClassifier

gbrt = GradientBoostingClassifier(random_state=0, max_depth=5, learning_rate=0.05)
gbrt.fit(X_train_trans, y_train)

print("훈련 세트 정확도 : {:.3f}".format(gbrt.score(X_train_trans, y_train)))
print("테스트 세트 정확도 : {:.3f}".format(gbrt.score(X_test_trans, y_test)))

⑦ Bagging

from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import LogisticRegression

bagging = BaggingClassifier(LogisticRegression(), n_estimators=100, random_state=0)
bagging.fit(X_train_trans, y_train)

print("훈련 세트 정확도 : {:.3f}".format(bagging.score(X_train_trans, y_train)))
print("테스트 세트 정확도 : {:.3f}".format(bagging.score(X_test_trans, y_test)))

6. Extras: Pipeline

from sklearn.pipeline import Pipeline
# note: X_train_trans is already scaled and one-hot encoded, so re-scaling here
# is redundant; this block just illustrates the Pipeline syntax
pipe = Pipeline([('scaler', StandardScaler()), ('svm', SVC())])

pipe.fit(X_train_trans, y_train)
pipe.score(X_train_trans, y_train)

from sklearn.pipeline import make_pipeline
pipe_short = make_pipeline(StandardScaler(), SVC(C=100))
pipe_short.steps


pred_y = pipe_short.fit(X_train_trans, y_train).predict(X_test_trans)
accuracy_score(y_test, pred_y)
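A pipeline can also be tuned end to end with GridSearchCV; make_pipeline names each step after its lowercased class, so the SVC parameters are addressed as svc__C and svc__gamma. A minimal sketch:

from sklearn.model_selection import GridSearchCV

param_grid = {'svc__C': [0.1, 1, 10, 100], 'svc__gamma': [0.01, 0.1, 1]}
grid = GridSearchCV(make_pipeline(StandardScaler(), SVC()), param_grid, cv=5)
grid.fit(X_train_trans, y_train)
print("Best parameters : ", grid.best_params_)
print("Test set score : {:.3f}".format(grid.score(X_test_trans, y_test)))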

7. Extras: Interaction Features

from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=2).fit(X_train_trans)
X_train_poly = poly.transform(X_train_trans)
X_test_poly = poly.transform(X_test_trans)
print("X_train_trans.shape: ", X_train_trans.shape)
print("X_train_poly.shape: ", X_train_poly.shape)

# from sklearn.linear_model import LogisticRegression  # already imported above

lr = LogisticRegression().fit(X_train_poly, y_train)
print("훈련 점수 : {:.3f}".format(lr.score(X_train_poly, y_train)))
print("테스트 점수 : {:.3f}".format(lr.score(X_test_poly, y_test)))

from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100, max_depth = 10, random_state=0,n_jobs=-1)
rf.fit(X_train_poly, y_train)
print("train accuracy : {:.3f}".format(rf.score(X_train_poly, y_train)))
print("test accuracy : {:.3f}".format(rf.score(X_test_poly, y_test)))

pd.Series(rf.feature_importances_, index=poly.get_feature_names()).sort_values(ascending=False)
