[regression] restaurant revenue prediction
Dataset: Kaggle Restaurant Revenue Prediction
1. EDA
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
pd.options.display.max_columns=None
train_df = pd.read_csv("./restaurant-revenue-prediction/train.csv")
test_df = pd.read_csv("./restaurant-revenue-prediction/test.csv")
train_df['part'] = 'train'
test_df['part'] = 'test'
# target variable
from scipy import stats
from scipy.stats import norm, skew
(mu, sigma) = norm.fit(train_df['revenue'])
f, (ax1,ax2) = plt.subplots(1,2, figsize=(15,5))
ax1 = sns.distplot(train_df['revenue'], fit=norm, ax=ax1)
ax1.set_ylabel('Frequency')
ax2 = stats.probplot(train_df['revenue'], plot=plt)
▶ The target variable is right-skewed; a log transformation could be applied.
▶ There are outliers with revenue above 14,000,000.
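The log transformation is mentioned but not used in the rest of this walkthrough; a minimal sketch of what it would look like, using np.log1p on the same train_df (predictions made on this scale would be inverted with np.expm1):

# Hedged sketch (not part of the pipeline below): log-transform the target
log_revenue = np.log1p(train_df['revenue'])   # log(1 + x) handles the right skew
sns.distplot(log_revenue, fit=norm)           # distribution should look closer to normal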
cond = train_df['revenue'] < 14000000  # drop the outliers identified above
train_df = train_df[cond]
df = pd.concat([train_df, test_df], axis=0, ignore_index=True)
print(train_df.shape, test_df.shape, df.shape)
# handling "Open Date" feature
df['Open Date'] = pd.to_datetime(df['Open Date'])
df['open_month'] = [x.month for x in df['Open Date']]
df['open_year'] = [x.year for x in df['Open Date']]
df.drop('Open Date', axis=1, inplace=True)
df.drop('Id', axis=1, inplace=True)
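The list comprehensions above can also be written with pandas' .dt accessor; an equivalent vectorized form (shown commented out, since 'Open Date' has already been dropped at this point):

# Equivalent vectorized form of the month/year extraction above
# df['open_month'] = df['Open Date'].dt.month
# df['open_year'] = df['Open Date'].dt.year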
#feature separation
part = 'part'
target = 'revenue'
category_features = ['City','City Group','Type','open_year','open_month']
columns = df.columns.tolist()
numeric_features = []
for col in columns:
    if col not in category_features:
        if (col != target) & (col != part):
            numeric_features.append(col)
# category features visualization
for cat in category_features:
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))
    sns.countplot(df.loc[df.part=='train', cat], ax=ax1)
    sns.countplot(df.loc[df.part=='test', cat], ax=ax2)
    plt.show()
(remaining plots omitted)
# The City feature has too many categories to be useful as a predictor, so drop it
category_features.remove('City')
# Type 'MB' appears only in the test data, so replace it with the most frequent type, 'FC'
df.loc[df['Type']=='MB','Type']='FC'
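A quick sanity check (a sketch, not in the original) confirming that after the replacement the Type levels match between the two parts:

# Sanity check: 'MB' should no longer appear in the test rows
print(df.loc[df.part=='train', 'Type'].value_counts())
print(df.loc[df.part=='test', 'Type'].value_counts())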
# Numeric Features handling
plt.figure(figsize=(15,10))
corrmat = df.loc[df.part=='train',numeric_features].corr()
sns.heatmap(corrmat, vmax=0.8,annot=True)
plt.title("train data correlation")
a = [14,15,16,17,18,24,25,26,27,30,31,32,33,34,35,36,37]
b = ["P"+str(x) for x in a]
b.append('revenue')
df.loc[df.part=='train',b].corr()['revenue'].sort_values(ascending=False)
# ▶ P17 has the highest correlation with revenue, so keep only P17 and drop the rest of the group
b.remove('revenue')
b.remove('P17')
df.drop(columns=b, axis=1, inplace=True)
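The manual keep-one-drop-the-rest step above can also be written generically; a hedged sketch that would replace the three lines above (it must run before the columns are dropped, so the final drop is left commented out):

# Hedged sketch: keep the group member with the highest |correlation| to revenue
group = ["P" + str(x) for x in a]   # same P-columns as above
corrs = df.loc[df.part=='train', group + ['revenue']].corr()['revenue'].drop('revenue')
keep = corrs.abs().idxmax()         # 'P17' for this group
# df.drop(columns=[c for c in group if c != keep], inplace=True)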
# 2nd elimination
part = 'part'
target = 'revenue'
category_features = ['City','City Group','Type','open_year','open_month']
columns = df.columns.tolist()
numeric_features = []
for col in columns:
    if col not in category_features:
        if (col != target) & (col != part):
            numeric_features.append(col)
plt.figure(figsize=(12,8))
sns.heatmap(df[df.part=='train'].corr(),vmax=0.8,annot=True)
a = [1,2,3,4,7,8,9,10,11,12,13,17,19,20,23,28]
b = ["P"+str(x) for x in a]
b.append('revenue')
df.loc[df.part=='train',b].corr()['revenue'].sort_values(ascending=False)
b.remove('P2')
b.remove('revenue')
df.drop(columns=b, axis=1, inplace=True)
plt.figure(figsize=(12,8))
sns.heatmap(df[df.part=='train'].corr(),vmax=0.8,annot=True)
part = 'part'
target = 'revenue'
category_features = ['City','City Group','Type','open_year','open_month']
columns = df.columns.tolist()
numeric_features = []
for col in columns:
    if col not in category_features:
        if (col != target) & (col != part):
            numeric_features.append(col)
for num in numeric_features:
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.regplot(x=df.loc[df.part=='train', num], y=df.loc[df.part=='train', 'revenue'], ax=ax)
    plt.show()
for num in numeric_features:
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
    sns.distplot(df.loc[df.part=='train', num], ax=ax1)
    sns.distplot(df.loc[df.part=='test', num], ax=ax2)
    plt.show()
# one hot encoding
ohe_columns = ['City Group','Type']
ohe_df = pd.get_dummies(df[ohe_columns])
df = pd.concat([df, ohe_df],axis=1)
df.drop(['City','City Group','Type'],axis=1,inplace=True)
2. Modeling
from sklearn.preprocessing import PolynomialFeatures
poly_columns = ['P2','P5','P6','P21','P22','P29']
poly = PolynomialFeatures(degree=2, include_bias=False)
poly.fit(df[poly_columns])
df_columns_poly = poly.transform(df[poly_columns])
df_columns_poly = pd.DataFrame(df_columns_poly , columns = poly.get_feature_names())
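For intuition, with include_bias=False a degree-2 expansion of n columns yields the original columns, their squares, and all pairwise products; a self-contained toy example (names 'a' and 'b' are illustrative only):

# Toy illustration (independent of df): degree-2 expansion of two columns
demo = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
demo_poly = PolynomialFeatures(degree=2, include_bias=False).fit(demo)
print(demo_poly.get_feature_names(['a', 'b']))
# ['a', 'b', 'a^2', 'a b', 'b^2']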
# merge
df = pd.concat([df, df_columns_poly],axis=1)
df.drop(columns=['P2','P5','P6','P22','P29'],axis=1,inplace=True)
df.drop(columns=['P21'],axis=1, inplace=True)
num_columns = df.columns.tolist()[14:]
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
for col in num_columns:
    df[col] = ss.fit_transform(np.array(df[col]).reshape(-1, 1))
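Note that fitting the scaler on the combined train+test frame leaks test statistics into the training features; a hedged alternative to the loop above fits on the training rows only and applies the same parameters everywhere:

# Hedged alternative (avoids train/test leakage): fit on train rows, transform all rows
for col in num_columns:
    ss = StandardScaler().fit(df.loc[df.part=='train', [col]])
    df[col] = ss.transform(df[[col]]).ravel()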
df['Type_DT'] = df['Type_DT'].astype(np.int64)
aa = df.columns.tolist()[4:9]
for col in aa:
    df[col] = df[col].astype(np.int64)
train = df[df.part=='train'].copy()  # .copy() avoids SettingWithCopyWarning on the drops below
test = df[df.part=='test'].copy()
target = df.loc[df.part=='train','revenue']
train.drop(['part','revenue'],axis=1,inplace=True)
test.drop(['part','revenue'],axis=1,inplace=True)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train, target, test_size=0.3, random_state=0)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score
# LogisticRegression is a classifier and fails on a continuous target, so it is excluded here
regressors = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest": RandomForestRegressor(),
    "SVM": SVR(),
    "KNN": KNeighborsRegressor(),
    "XGBoost": XGBRegressor()
}
results = pd.DataFrame(columns=['MAE','MSE','R2-score'])
for method, func in regressors.items():
    func.fit(X_train, y_train)
    pred = func.predict(X_test)
    results.loc[method] = [mean_absolute_error(y_test, pred),
                           mean_squared_error(y_test, pred),
                           r2_score(y_test, pred)]
results
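cross_val_score is imported above but never used; a hedged sketch of a 5-fold comparison on the full training set (reporting RMSE, lower is better), assuming the same regressors dict:

# Hedged sketch: 5-fold cross-validation instead of a single hold-out split
for method, func in regressors.items():
    scores = cross_val_score(func, train, target, cv=5, scoring='neg_mean_squared_error')
    print(method, np.sqrt(-scores.mean()))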
from sklearn.model_selection import GridSearchCV
knn = KNeighborsRegressor()
n_neighbors = [1,2,3,5,7,9,15]
hyperparameters = dict(n_neighbors=n_neighbors)
gridsearch = GridSearchCV(knn, hyperparameters, cv=5, verbose=0)
best_model = gridsearch.fit(train, target)
best_model
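Before predicting, it helps to confirm what the search actually selected:

# Inspect the winning configuration and its mean cross-validated score
print(best_model.best_params_)  # e.g. {'n_neighbors': 9}, matching the filename below
print(best_model.best_score_)   # GridSearchCV's default scoring for a regressor is R^2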
pred_knn_best = best_model.predict(test)
submission = pd.DataFrame(columns=['Id','Prediction'])
submission['Id'] = test_df['Id']
submission['Prediction'] = pred_knn_best
score = best_model.best_score_  # assumption: `score` (undefined in the original) was the CV score
submission.to_csv('{}_restaurant_knn_n9.csv'.format(np.round(score, 3)), index=False)