Notice
Recent Posts
Recent Comments
Link
일 | 월 | 화 | 수 | 목 | 금 | 토 |
---|---|---|---|---|---|---|
1 | ||||||
2 | 3 | 4 | 5 | 6 | 7 | 8 |
9 | 10 | 11 | 12 | 13 | 14 | 15 |
16 | 17 | 18 | 19 | 20 | 21 | 22 |
23 | 24 | 25 | 26 | 27 | 28 |
Tags
- PolynomialFeatures
- ADP
- 토익스피킹
- Quant
- 빅데이터분석기사
- 데이터분석
- sarima
- 코딩테스트
- 변동성돌파전략
- randomforest
- Python
- SQL
- GridSearchCV
- Programmers
- Crawling
- 파트5
- 비트코인
- 볼린저밴드
- lstm
- backtest
- docker
- 프로그래머스
- TimeSeries
- 실기
- 파이썬
- 주식
- 백테스트
- 파이썬 주식
- 데이터분석전문가
- hackerrank
Archives
- Today
- Total
데이터 공부를 기록하는 공간
빅데이터분석기사 실기 - 분류 본문
1. XGBOOST로 해보기 (현재 코드는 RandomForest 사용, XGBoost는 주석 처리)
# Experiment 1: label-encode the categorical columns, min-max scale the
# numeric ones, then cross-validate a RandomForest classifier on ROC-AUC.
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Normalizer  # Normalizer seemed to work best
from sklearn.preprocessing import StandardScaler

# Load data
DATA_DIR = "C:/Users/###/Downloads/빅데이터분석기사 실기/[Dataset] 작업형 제2유형"
X_train = pd.read_csv(DATA_DIR + "/X_train.csv", encoding='cp949')
X_test = pd.read_csv(DATA_DIR + "/X_test.csv", encoding='cp949')
y_train = pd.read_csv(DATA_DIR + "/y_train.csv", encoding='cp949')
print(X_train.shape, X_test.shape, y_train.shape)

# cust_id is a row identifier, not a feature: move it into the index.
X_train.set_index("cust_id", inplace=True)
X_test.set_index("cust_id", inplace=True)
y_train.set_index("cust_id", inplace=True)

# Preprocessing
print(X_train.shape, X_test.shape, y_train.shape)

## Check for null data
#print(X_train.isnull().sum().sort_values(ascending=False), X_test.isnull().sum().sort_values(ascending=False), y_train.isnull().sum())
X_train.fillna(0, inplace=True)
X_test.fillna(0, inplace=True)
print("NULL Data : ", X_train.isnull().sum().sum(), X_test.isnull().sum().sum(), y_train.isnull().sum().sum())

obj_features = X_train.select_dtypes(include='object').columns.tolist()
num_features = X_train.select_dtypes(exclude='object').columns.tolist()
print(obj_features, num_features)

# Categories that occur in test but not in train. The encoder below is fitted
# on train only, so both lists should be empty. The train categories are put
# in a set once instead of rebuilding a list for every test value.
seen_products = set(X_train['주구매상품'].unique())
seen_stores = set(X_train['주구매지점'].unique())
print("주구매상품 test o , train x : ", [i for i in X_test['주구매상품'].unique().tolist() if i not in seen_products])
print("주구매지점 test o , train x : ", [i for i in X_test['주구매지점'].unique().tolist() if i not in seen_stores])

# Label encoding: fit on train, apply the same mapping to test.
for col in obj_features:
    le = LabelEncoder()
    le.fit(X_train[col])
    X_train[col] = le.transform(X_train[col])
    X_test[col] = le.transform(X_test[col])

# Min-max scaling of the numeric columns (fit on train only). To scale the
# encoded categorical columns as well, use X_train.columns.tolist() instead.
mm = MinMaxScaler()
X_train[num_features] = mm.fit_transform(X_train[num_features])
X_test[num_features] = mm.transform(X_test[num_features])

# Disabled alternative: one-hot encoding instead of label encoding.
# Train and test are stacked row-wise (axis=0) so both get the same dummy
# columns, then split back by row count.
# temp = pd.concat([X_train[obj_features], X_test[obj_features]], axis=0)
# temp = pd.get_dummies(temp, dtype=np.int64)
# train_temp = temp.iloc[:X_train.shape[0],:]
# test_temp = temp.iloc[X_train.shape[0]:,:]
# X_train = pd.concat([X_train[num_features], train_temp], axis=1)
# X_test = pd.concat([X_test[num_features], test_temp], axis=1)

print(X_train.shape, X_test.shape, y_train.shape)

# Modeling
#model = xgb.XGBClassifier(n_estimators=100, max_depth=3, learning_rate=0.05)
model = RandomForestClassifier(n_estimators=100, max_depth=15, min_samples_leaf=2)
crossval = cross_val_score(model, X_train, y_train, cv=3, scoring='roc_auc', n_jobs=-1)
print("cross val score : ", np.round(crossval.mean()*100,0),"%")

# Disabled grid search (the learning_rate entry applies to the XGBoost model).
#params = {"n_estimators":[100,300,500], "max_depth":[3,5,10,None], "learning_rate":[0.01, 0.05, 0.1, 0.3]}
#gridsearch = GridSearchCV(model, param_grid=params, cv=5, n_jobs=-1, verbose=1)
#gridsearch.fit(X_train, y_train)
#print(gridsearch.best_estimator_.get_params())
2. label encoding ▶ one-hot encoding으로 변경, Linear regression 등 여러 모델 비교
# Experiment 2: one-hot encode the categorical columns, min-max scale the
# numeric ones, compare several models on a hold-out split by ROC-AUC, then
# cross-validate and grid-search.
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Normalizer  # Normalizer seemed to work best
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
#from xgboost import XGBClassifier

# Load data
DATA_DIR = "C:/Users/###/Downloads/빅데이터분석기사 실기/[Dataset] 작업형 제2유형"
X_train = pd.read_csv(DATA_DIR + "/X_train.csv", encoding='cp949')
X_test = pd.read_csv(DATA_DIR + "/X_test.csv", encoding='cp949')
y_train = pd.read_csv(DATA_DIR + "/y_train.csv", encoding='cp949')
print(X_train.shape, X_test.shape, y_train.shape)

# cust_id is a row identifier, not a feature: move it into the index.
X_train.set_index("cust_id", inplace=True)
X_test.set_index("cust_id", inplace=True)
y_train.set_index("cust_id", inplace=True)

# Preprocessing
print(X_train.shape, X_test.shape, y_train.shape)

## Check for null data
#print(X_train.isnull().sum().sort_values(ascending=False), X_test.isnull().sum().sort_values(ascending=False), y_train.isnull().sum())
X_train.fillna(0, inplace=True)
X_test.fillna(0, inplace=True)
print("NULL Data : ", X_train.isnull().sum().sum(), X_test.isnull().sum().sum(), y_train.isnull().sum().sum())

obj_features = X_train.select_dtypes(include='object').columns.tolist()
num_features = X_train.select_dtypes(exclude='object').columns.tolist()
print(obj_features, num_features)

# Categories that occur in test but not in train. The train categories are
# put in a set once instead of rebuilding a list for every test value.
seen_products = set(X_train['주구매상품'].unique())
seen_stores = set(X_train['주구매지점'].unique())
print("주구매상품 test o , train x : ", [i for i in X_test['주구매상품'].unique().tolist() if i not in seen_products])
print("주구매지점 test o , train x : ", [i for i in X_test['주구매지점'].unique().tolist() if i not in seen_stores])

# Disabled alternative: label encoding (replaced by one-hot encoding below).
# from sklearn.preprocessing import LabelEncoder
# for col in obj_features:
#     le = LabelEncoder()
#     le.fit(X_train[col])
#     X_train[col] = le.transform(X_train[col])
#     X_test[col] = le.transform(X_test[col])

# Min-max scaling of the numeric columns (fit on train only).
mm = MinMaxScaler()
X_train[num_features] = mm.fit_transform(X_train[num_features])
X_test[num_features] = mm.transform(X_test[num_features])

# One-hot encoding. Train and test are stacked row-wise (axis=0) so both get
# exactly the same dummy columns, then split back by row count. The previous
# axis=1 concat joined them side by side, which does not match the row split
# that follows.
temp = pd.concat([X_train[obj_features], X_test[obj_features]], axis=0)
temp = pd.get_dummies(temp, dtype=np.int64)
n_train = X_train.shape[0]
train_temp = temp.iloc[:n_train, :]
test_temp = temp.iloc[n_train:, :]
X_train = pd.concat([X_train[num_features], train_temp], axis=1)
X_test = pd.concat([X_test[num_features], test_temp], axis=1)
print(X_train.shape, X_test.shape, y_train.shape)

# Hold-out split and model comparison
X1, X2, y1, y2 = train_test_split(X_train, y_train, test_size=0.3, shuffle=True, random_state=36)

results = pd.DataFrame(columns=['roc'])
models = {
    'knn': KNeighborsClassifier(),
    'linear': LinearRegression(),
    'logstic': LogisticRegression(C=0.5),
    'lasso': Lasso(alpha=0.1),
    'svc': SVC(C=0.5),
    "dt": DecisionTreeClassifier(),
    "rf": RandomForestClassifier(),
}  #, "xgb":XGBClassifier()}
for key, model in models.items():
    model.fit(X1, y1)
    # roc_auc_score needs one score per sample. Not every model here offers
    # predict_proba (the regressors, and SVC without probability=True), so
    # fall back to decision_function and then to predict.
    if hasattr(model, "predict_proba"):
        # Column 1 is the probability of the class with the greater label.
        pred = model.predict_proba(X2)[:, 1]
    elif hasattr(model, "decision_function"):
        pred = model.decision_function(X2)
    else:
        pred = model.predict(X2)
    roc = roc_auc_score(y2, np.ravel(pred))
    #print(key, roc)
    results.loc[key] = roc
results  # shown only when this is the last expression of a notebook cell

# Cross-validation
# NOTE(review): scoring='roc_auc' on a regressor may not be supported by every
# scikit-learn version - confirm, or switch to the RandomForest line below.
model = LinearRegression()
#model = RandomForestClassifier()
crossvalscore = cross_val_score(model, X_train, y_train, cv=5, n_jobs=-1, scoring='roc_auc')
print("cross val score : ", np.round(crossvalscore.mean()*100,0),"%")

# Grid search, scored with ROC-AUC like the rest of this script (the default
# scoring for a classifier would be accuracy).
model = LogisticRegression()
param_grids = {"C":[0.1, 0.5, 1, 10]}
gridsearch = GridSearchCV(model, param_grids, cv=5, n_jobs=-1, scoring='roc_auc')
gridsearch.fit(X_train, y_train)
gridsearch.best_estimator_  # shown only when this is the last expression of a notebook cell
3. clustering을 추가
# Experiment 3: same pipeline as the one-hot experiment, plus a KMeans
# cluster id as an extra feature, then a hold-out model comparison by ROC-AUC.
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Normalizer  # Normalizer seemed to work best
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
#from xgboost import XGBClassifier

# Load data
DATA_DIR = "C:/Users/###/Downloads/빅데이터분석기사 실기/[Dataset] 작업형 제2유형"
X_train = pd.read_csv(DATA_DIR + "/X_train.csv", encoding='cp949')
X_test = pd.read_csv(DATA_DIR + "/X_test.csv", encoding='cp949')
y_train = pd.read_csv(DATA_DIR + "/y_train.csv", encoding='cp949')
print(X_train.shape, X_test.shape, y_train.shape)

# cust_id is a row identifier, not a feature: move it into the index.
X_train.set_index("cust_id", inplace=True)
X_test.set_index("cust_id", inplace=True)
y_train.set_index("cust_id", inplace=True)

# Preprocessing
print(X_train.shape, X_test.shape, y_train.shape)

## Check for null data
#print(X_train.isnull().sum().sort_values(ascending=False), X_test.isnull().sum().sort_values(ascending=False), y_train.isnull().sum())
X_train.fillna(0, inplace=True)
X_test.fillna(0, inplace=True)
print("NULL Data : ", X_train.isnull().sum().sum(), X_test.isnull().sum().sum(), y_train.isnull().sum().sum())

obj_features = X_train.select_dtypes(include='object').columns.tolist()
num_features = X_train.select_dtypes(exclude='object').columns.tolist()
print(obj_features, num_features)

# Categories that occur in test but not in train. The train categories are
# put in a set once instead of rebuilding a list for every test value.
seen_products = set(X_train['주구매상품'].unique())
seen_stores = set(X_train['주구매지점'].unique())
print("주구매상품 test o , train x : ", [i for i in X_test['주구매상품'].unique().tolist() if i not in seen_products])
print("주구매지점 test o , train x : ", [i for i in X_test['주구매지점'].unique().tolist() if i not in seen_stores])

# Disabled alternative: label encoding (replaced by one-hot encoding below).
# from sklearn.preprocessing import LabelEncoder
# for col in obj_features:
#     le = LabelEncoder()
#     le.fit(X_train[col])
#     X_train[col] = le.transform(X_train[col])
#     X_test[col] = le.transform(X_test[col])

# Min-max scaling of the numeric columns (fit on train only).
mm = MinMaxScaler()
X_train[num_features] = mm.fit_transform(X_train[num_features])
X_test[num_features] = mm.transform(X_test[num_features])

# One-hot encoding. Train and test are stacked row-wise (axis=0) so both get
# exactly the same dummy columns, then split back by row count. The previous
# axis=1 concat joined them side by side, which does not match the row split
# that follows.
temp = pd.concat([X_train[obj_features], X_test[obj_features]], axis=0)
temp = pd.get_dummies(temp, dtype=np.int64)
n_train = X_train.shape[0]
train_temp = temp.iloc[:n_train, :]
test_temp = temp.iloc[n_train:, :]
X_train = pd.concat([X_train[num_features], train_temp], axis=1)
X_test = pd.concat([X_test[num_features], test_temp], axis=1)

# Cluster id as an extra feature. Both assignments are computed before either
# frame gets the new column, so the test frame is clustered on the same
# features the model was fitted on. The seed keeps the labels reproducible.
km = KMeans(n_clusters=5, random_state=10)
train_clusters = km.fit_predict(X_train)
test_clusters = km.predict(X_test)
X_train['cluster'] = train_clusters
X_test['cluster'] = test_clusters
print(X_train.shape, X_test.shape, y_train.shape)

# Hold-out split and model comparison
X1, X2, y1, y2 = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, random_state=10)

results = pd.DataFrame(columns=['roc'])
models = {
    'knn': KNeighborsClassifier(),
    'linear': LinearRegression(),
    'logstic': LogisticRegression(C=0.5),
    'lasso': Lasso(alpha=0.1),
    'svc': SVC(C=0.5),
    "dt": DecisionTreeClassifier(),
    "rf": RandomForestClassifier(),
}  #, "xgb":XGBClassifier()}
for key, model in models.items():
    model.fit(X1, y1)
    # roc_auc_score needs one score per sample. Not every model here offers
    # predict_proba (the regressors, and SVC without probability=True), so
    # fall back to decision_function and then to predict.
    if hasattr(model, "predict_proba"):
        # Column 1 is the probability of the class with the greater label.
        pred = model.predict_proba(X2)[:, 1]
    elif hasattr(model, "decision_function"):
        pred = model.decision_function(X2)
    else:
        pred = model.predict(X2)
    roc = roc_auc_score(y2, np.ravel(pred))
    #print(key, roc)
    results.loc[key] = roc
results  # shown only when this is the last expression of a notebook cell
'STUDY > ADP, 빅데이터분석기사' 카테고리의 다른 글
adp, 빅데이터분석 기사 합격 (0) | 2021.08.08 |
---|---|
빅데이터분석기사 실기 - XGBoostRegressor (0) | 2021.06.06 |
빅데이터분석기사실기-XGBOOST 분류 (0) | 2021.06.06 |
빅데이터분석기사 실기 예제 - 작업형#1 (0) | 2021.06.05 |
[arima] smp2 (0) | 2021.03.21 |
Comments