[classification] titanic
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
train = pd.read_csv('./titanic/train.csv')
test = pd.read_csv('./titanic/test.csv')
1. Data Preprocessing
# check for missing values
train.isnull().sum()
test.isnull().sum()
# separate categorical and numeric features
target = 'Survived'
train[target].value_counts()
features = train.columns.tolist()[2:]  # drop PassengerId and Survived
categorical_f = list(train[features].select_dtypes(include='object').columns) # object
numeric_f = list(train[features].select_dtypes(exclude='object').columns) # int64,float64
train['Cabin'].fillna("Unknown", inplace=True)           # replace with "Unknown"
train['Embarked'].fillna("S", inplace=True)              # replace with the mode, 'S'
test['Cabin'].fillna("Unknown", inplace=True)            # replace with "Unknown"
train['Age'].fillna(train['Age'].mean(), inplace=True)   # replace with the train mean
test['Age'].fillna(train['Age'].mean(), inplace=True)    # replace with the train mean
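To justify filling Embarked with 'S', a quick look at the value counts confirms it is the mode (a minimal sanity-check sketch):
# 'S' is by far the most frequent Embarked value, so filling with 'S' is mode imputation
train['Embarked'].value_counts()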
2. Data Visualization
fig, axes = plt.subplots(figsize=(12,8))
sns.kdeplot(data=train['Age'], color='k',label='all')
sns.kdeplot(data=train.loc[train.Survived==1,'Age'], color='r',label='Survived==1')
sns.kdeplot(data=train.loc[train.Survived==0,'Age'], color='b',label='Survived==0')
axes.legend(loc='best')
axes.set_title("Age Histogram")
# classification target ▶ plot each categorical feature with countplot, hue=target
# take the first letter of Cabin (keep "Unknown" as-is so no NaN is reintroduced)
mask = train['Cabin'] != "Unknown"
train.loc[mask, 'Cabin'] = train.loc[mask, 'Cabin'].str[0]
mask_t = test['Cabin'] != "Unknown"
test.loc[mask_t, 'Cabin'] = test.loc[mask_t, 'Cabin'].str[0]
for col in categorical_f:
    if col != "Name":
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
        sns.countplot(x=col, data=train, ax=ax1)
        sns.countplot(x=col, data=train, hue="Survived", ax=ax2)
# drop high-cardinality text columns from the categorical feature list
categorical_f.remove('Ticket')
categorical_f.remove('Name')
for col in numeric_f:
    fig, axes = plt.subplots(figsize=(6, 4))
    sns.kdeplot(data=train[col], color='k', label='all')
    sns.kdeplot(data=train.loc[train.Survived==1, col], color='r', label='Survived==1')
    sns.kdeplot(data=train.loc[train.Survived==0, col], color='b', label='Survived==0')
    axes.legend(loc='best')
    axes.set_title("{} Distribution (KDE)".format(col))
train[features].describe().T
train[categorical_f].describe().T
test['Fare'].fillna(0, inplace=True)  # replace the missing Fare with 0 (the original string comparison never matched NaN)
test['Fare'] = test['Fare'].astype(np.float64)
3. Train/Test Split
Using a ColumnTransformer with a OneHotEncoder
### build the preprocessing with a ColumnTransformer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
ct = ColumnTransformer([
    ('scaling', StandardScaler(), ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']),
    # note: newer scikit-learn versions use sparse_output=False instead of sparse=False
    ('onehot', OneHotEncoder(sparse=False), ['Sex', 'Cabin', 'Embarked'])
])
data_features = train.drop(['PassengerId','Name','Ticket',"Survived"],axis=1)
y = train['Survived']
X_train, X_test, y_train, y_test = train_test_split(data_features, y, random_state=0)
ct.fit(X_train)
X_train_trans = ct.transform(X_train)
X_train_trans
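To see which columns the one-hot step produced, the fitted encoder can report its feature names (a sketch; newer scikit-learn versions rename this method to get_feature_names_out):
# List the expanded one-hot column names from the fitted encoder
ct.named_transformers_['onehot'].get_feature_names(['Sex', 'Cabin', 'Embarked'])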
4. Model Cross-Validation
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train_trans, y_train)
X_test_trans = ct.transform(X_test)
print("테스트 점수 : {}".format(lr.score(X_test_trans, y_test)))
▶ 테스트 점수 : 0.8026905829596412
교차검증 (cross_val_score) : score만 반환
from sklearn.model_selection import cross_val_score
scores = cross_val_score(lr, X_train_trans, y_train, cv=5)
print("교차 검증 점수 : {}".format(scores))
print("교차 검증 점수 평균 : {}".format(scores.mean()))
교차검증 (cross_validate) : score 반환 + fit_time, score time 반환
from sklearn.model_selection import cross_validate
res = cross_validate(lr, X_train_trans, y_train, cv=5)
print(pd.DataFrame(res))
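cross_validate can also compute several metrics in one pass via the scoring parameter (a minimal sketch using the standard 'accuracy' and 'roc_auc' scorer names):
# Evaluate multiple metrics in a single cross-validation run
res = cross_validate(lr, X_train_trans, y_train, cv=5,
                     scoring=['accuracy', 'roc_auc'])
print(pd.DataFrame(res))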
Hyperparameter Tuning
(Manual) without cross-validation
from sklearn.svm import SVC
best_score = 0
for gamma in [0.001, 0.01, 0.1, 1, 10, 100]:
    for C in [0.001, 0.01, 0.1, 1, 10, 100]:
        svm = SVC(gamma=gamma, C=C)
        svm.fit(X_train_trans, y_train)
        # scoring on the test set here; GridSearchCV below avoids this leakage
        score = svm.score(X_test_trans, y_test)
        if score > best_score:
            best_score = score
            best_parameters = {"C": C, "gamma": gamma}
print("Best score: {}".format(best_score))
print("Best parameters: {}".format(best_parameters))
(GridSearchCV) with cross-validation
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
param_grid = {"C":[0.001, 0.01, 0.1, 1, 10, 100], "gamma":[0.001, 0.01, 0.1, 1, 10, 100]}
grid_search = GridSearchCV(SVC(), param_grid, cv=5, return_train_score=True)
grid_search.fit(X_train_trans, y_train)
print("테스트 세트 점수 : {:.2f}".format(grid_search.score(X_test_trans, y_test)))
print("최적 매개변수 : ", grid_search.best_params_)
print("최고 교차 검증 점수 : {:.2f}".format(grid_search.best_score_))
5. Training Each Machine Learning Model
① KNN
n_neighbors = 3
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(X_train_trans, y_train)
print("테스트 세트 예측:", knn.predict(X_train_trans))
print("훈련세트 점수 : {:.2f}".format(knn.score(X_train_trans,y_train)))
print("테스트세트 점수 : {:.2f}".format(knn.score(X_test_trans,y_test)))
- Finding the optimal n_neighbors
train_accuracy = []
test_accuracy = []
for n_neighbors in range(1, 20):
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train_trans, y_train)
    train_accuracy.append(knn.score(X_train_trans, y_train))
    test_accuracy.append(knn.score(X_test_trans, y_test))
plt.plot(range(1,20), train_accuracy, label="train accuracy")
plt.plot(range(1,20), test_accuracy, label="test accuracy")
plt.ylabel('accuracy')
plt.xlabel('n_neighbors')
plt.legend()
- n_neighbors = 12
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors = 12)
knn.fit(X_train_trans, y_train)
print("테스트 세트 예측:", knn.predict(X_train_trans))
② Ridge Regression
from sklearn.linear_model import Ridge
ridge = Ridge().fit(X_train_trans, y_train)
print("train accuracy : {:.2f}".format(ridge.score(X_train_trans, y_train)))
print("test accuracy : {:.2f}".format(ridge.score(X_test_trans, y_test)))
alphas = [0.01, 0.1, 1, 10, 100]
train_r2 = []
test_r2 = []
for alpha in alphas:
    ridge = Ridge(alpha=alpha)
    ridge.fit(X_train_trans, y_train)
    train_r2.append(ridge.score(X_train_trans, y_train))  # Ridge.score is R^2
    test_r2.append(ridge.score(X_test_trans, y_test))
print(train_r2)
print(test_r2)
ridge = Ridge()
ridge10 = Ridge(alpha=10)
ridge01 = Ridge(alpha=0.1)
ridge.fit(X_train_trans, y_train)
ridge10.fit(X_train_trans, y_train)
ridge01.fit(X_train_trans, y_train)
plt.plot(ridge10.coef_, "^", label='Ridge alpha=10')  # larger penalty → smaller coefficients
plt.plot(ridge.coef_,"s", label='Ridge alpha=1')
plt.plot(ridge01.coef_,"v", label='Ridge alpha=0.1')
plt.xlabel("coef list")
plt.ylabel("coef ")
plt.legend(loc='upper right')
xlims = plt.xlim()
plt.hlines(0, xlims[0], xlims[1])
▶ The larger the alpha (i.e., the stronger the regularization, as with alpha=10), the smaller the coefficients.
from sklearn.metrics import accuracy_score
# Ridge is a regressor, so threshold its continuous predictions at 0.5 to get class labels
print(accuracy_score(y_test, np.where(ridge10.predict(X_test_trans) > 0.5, 1, 0)))
print(accuracy_score(y_test, np.where(ridge.predict(X_test_trans) > 0.5, 1, 0)))
print(accuracy_score(y_test, np.where(ridge01.predict(X_test_trans) > 0.5, 1, 0)))
③ Logistic Regression
from sklearn.linear_model import LogisticRegression
lr100 = LogisticRegression(C=100).fit(X_train_trans, y_train)
print("훈련 세트 점수 : {:.3f}".format(lr100.score(X_train_trans, y_train)))
print("테스트 세트 점수 : {:.3f}".format(lr100.score(X_test_trans, y_test)))
from sklearn.linear_model import LogisticRegression
lr001 = LogisticRegression(C=0.01).fit(X_train_trans, y_train)
print("훈련 세트 점수 : {:.3f}".format(lr001.score(X_train_trans, y_train)))
print("테스트 세트 점수 : {:.3f}".format(lr001.score(X_test_trans, y_test)))
for C, marker in zip([0.001, 1, 100], ['^', 'o', 'v']):
    lr_l2 = LogisticRegression(C=C, penalty='l2')
    lr_l2 = lr_l2.fit(X_train_trans, y_train)
    print("C={:.3f} L2 logistic regression train accuracy: {:.2f}".format(C, lr_l2.score(X_train_trans, y_train)))
    print("C={:.3f} L2 logistic regression test accuracy: {:.2f}".format(C, lr_l2.score(X_test_trans, y_test)))
    plt.plot(lr_l2.coef_.T, marker, label="C={:.3f}".format(C))  # the zipped markers are for this coefficient plot
plt.legend()
④ Decision Tree
from sklearn.tree import DecisionTreeClassifier
tree=DecisionTreeClassifier()
tree.fit(X_train_trans, y_train)
print("train score : {:.3f}".format(tree.score(X_train_trans,y_train)))
print("test score : {:.3f}".format(tree.score(X_test_trans,y_test)))
tree.feature_importances_
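The raw importance array is hard to read on its own; a minimal sketch ranking it (the indices refer to column positions in X_train_trans, since the one-hot expansion renames the original features):
# Rank transformed-feature importances by value
pd.Series(tree.feature_importances_).sort_values(ascending=False).head(10)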
⑤ RandomForest
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier().fit(X_train_trans, y_train)
print("train accuracy : {:.3f}".format(rf.score(X_train_trans, y_train)))
print("test accuracy : {:.3f}".format(rf.score(X_test_trans, y_test)))
- Varying the parameters
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, max_depth = 10, random_state=0,n_jobs=-1)
rf.fit(X_train_trans, y_train)
print("train accuracy : {:.3f}".format(rf.score(X_train_trans, y_train)))
print("test accuracy : {:.3f}".format(rf.score(X_test_trans, y_test)))
⑥ Gradient Boosting
from sklearn.ensemble import GradientBoostingClassifier
gbrt = GradientBoostingClassifier(random_state=0)
gbrt.fit(X_train_trans, y_train)
print("훈련 세트 정확도 : {:.3f}".format(gbrt.score(X_train_trans, y_train)))
print("테스트 세트 정확도 : {:.3f}".format(gbrt.score(X_test_trans, y_test)))
from sklearn.ensemble import GradientBoostingClassifier
gbrt = GradientBoostingClassifier(random_state=0, max_depth=5, learning_rate=0.05)
gbrt.fit(X_train_trans, y_train)
print("훈련 세트 정확도 : {:.3f}".format(gbrt.score(X_train_trans, y_train)))
print("테스트 세트 정확도 : {:.3f}".format(gbrt.score(X_test_trans, y_test)))
⑦ Bagging
from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import LogisticRegression
bagging = BaggingClassifier(LogisticRegression(), n_estimators=100, random_state=0)
bagging.fit(X_train_trans, y_train)
print("훈련 세트 정확도 : {:.3f}".format(bagging.score(X_train_trans, y_train)))
print("테스트 세트 정확도 : {:.3f}".format(bagging.score(X_test_trans, y_test)))
6. Extras: Pipelines
from sklearn.pipeline import Pipeline
pipe = Pipeline([('scaler', StandardScaler()), ('svm', SVC())])
pipe.fit(X_train_trans, y_train)  # note: X_train_trans is already scaled; this cell just demonstrates the Pipeline API
pipe.score(X_train_trans, y_train)
from sklearn.pipeline import make_pipeline
pipe_short = make_pipeline(StandardScaler(), SVC(C=100))
pipe_short.steps
pred_y = pipe_short.fit(X_train_trans, y_train).predict(X_test_trans)
accuracy_score(y_test, pred_y)  # signature is (y_true, y_pred)
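Pipelines combine naturally with GridSearchCV: step parameters are addressed as stepname__parameter (a minimal sketch reusing the 'svm' step name from the pipe above; the grid values are illustrative):
from sklearn.model_selection import GridSearchCV
# 'svm__C' targets the C parameter of the 'svm' step inside the pipeline
param_grid = {'svm__C': [0.1, 1, 10, 100], 'svm__gamma': [0.01, 0.1, 1]}
grid = GridSearchCV(pipe, param_grid, cv=5)
grid.fit(X_train_trans, y_train)
print("Best parameters:", grid.best_params_)
print("Test set score: {:.2f}".format(grid.score(X_test_trans, y_test)))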
7. Extras: Interaction Features
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=2).fit(X_train_trans)
X_train_poly = poly.transform(X_train_trans)
X_test_poly = poly.transform(X_test_trans)
print("X_train_trans.shape: ", X_train_trans.shape)
print("X_train_poly.shape: ", X_train_poly.shape)
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression().fit(X_train_poly, y_train)
print("훈련 점수 : {:.3f}".format(lr.score(X_train_poly, y_train)))
print("테스트 점수 : {:.3f}".format(lr.score(X_test_poly, y_test)))
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, max_depth = 10, random_state=0,n_jobs=-1)
rf.fit(X_train_poly, y_train)
print("train accuracy : {:.3f}".format(rf.score(X_train_poly, y_train)))
print("test accuracy : {:.3f}".format(rf.score(X_test_poly, y_test)))
# note: newer scikit-learn versions rename get_feature_names() to get_feature_names_out()
pd.Series(rf.feature_importances_, index=poly.get_feature_names()).sort_values(ascending=False)
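From here, a Kaggle submission can be produced by running the fitted transformers over the preprocessed test DataFrame (a hedged sketch: the output filename and the choice of the polynomial random forest are assumptions, and ct.transform may raise if the test set contained a Cabin letter unseen during fitting):
# Apply the same preprocessing chain to the Kaggle test set
test_features = test.drop(['PassengerId', 'Name', 'Ticket'], axis=1)
test_trans = ct.transform(test_features)   # same columns the transformer was fit on
test_poly = poly.transform(test_trans)     # rf above was fit on the polynomial features
submission = pd.DataFrame({'PassengerId': test['PassengerId'],
                           'Survived': rf.predict(test_poly)})
submission.to_csv('submission.csv', index=False)  # hypothetical output filename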