[regression] restaurant revenue prediction

BOTTLE6 2021. 3. 20. 00:12

kaggle > restaurant revenue

 

1. EDA

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

pd.options.display.max_columns=None

train_df = pd.read_csv("./restaurant-revenue-prediction/train.csv")
test_df = pd.read_csv("./restaurant-revenue-prediction/test.csv")

train_df['part'] = 'train'
test_df['part'] = 'test'
# target variable

from scipy import stats
from scipy.stats import norm, skew

(mu, sigma) = norm.fit(train_df['revenue'])
f, (ax1,ax2) = plt.subplots(1,2, figsize=(15,5))
ax1 = sns.distplot(train_df['revenue'], fit=norm, ax=ax1)
ax1.set_ylabel('Frequency')
stats.probplot(train_df['revenue'], plot=ax2)   # Q-Q plot on the second axes

▶ The target variable is right-skewed; a log transformation could help.

▶ There are outliers with revenue above 14,000,000.
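
The log transform is only noted above and is not applied later in this post. A minimal sketch of what it would look like, using the usual np.log1p / np.expm1 pair so predictions could be mapped back to the revenue scale:

# sketch only: model log(1 + revenue); invert with np.expm1 at prediction time
y_log = np.log1p(train_df['revenue'])
sns.distplot(y_log, fit=norm)   # noticeably closer to normal than the raw target
plt.show()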

 

cond = train_df['revenue'] < 14000000   # keep rows below the outlier threshold
train_df = train_df[cond]

df = pd.concat([train_df, test_df], axis=0, ignore_index=True)
print(train_df.shape, test_df.shape, df.shape)

# handling "Open Date" feature
df['Open Date'] = pd.to_datetime(df['Open Date'])
df['open_month'] = df['Open Date'].dt.month
df['open_year'] = df['Open Date'].dt.year
df.drop('Open Date', axis=1, inplace=True)

df.drop('Id', axis=1, inplace=True)

#feature separation
part = 'part'
target = 'revenue'
category_features = ['City','City Group','Type','open_year','open_month']

columns = df.columns.tolist()
numeric_features = []
for col in columns:
    if col not in category_features :
        if (col != target) & (col != part):
            numeric_features.append(col)
# category features visualization

for cat in category_features:
    fig, (ax1, ax2) = plt.subplots(1,2,figsize=(10,5))
    sns.countplot(df.loc[df.part=='train',cat], ax=ax1)
    sns.countplot(df.loc[df.part=='test',cat], ax=ax2)
    plt.show()

(remaining countplots omitted)

 

# City has too many categories to be useful as a feature, so drop it
category_features.remove('City')
# Type 'MB' appears only in the test data, so replace it with 'FC', the most frequent Type
df.loc[df['Type']=='MB','Type']='FC'
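
Two quick checks behind these decisions (read-only; nothing here changes the data):

print(df['City'].nunique())                             # far more city levels than the ~130 train rows can support
print(df.loc[df.part=='train','Type'].value_counts())   # no MB in train
print(df.loc[df.part=='test','Type'].value_counts())    # MB appears only in test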

 

# Numeric Features handling
plt.figure(figsize=(15,10))
corrmat = df.loc[df.part=='train',numeric_features].corr()
sns.heatmap(corrmat, vmax=0.8,annot=True)
plt.title("train data correlation")

a = [14,15,16,17,18,24,25,26,27,30,31,32,33,34,35,36,37]
b = ["P"+str(x) for x in a]
b.append('revenue')
df.loc[df.part=='train',b].corr()['revenue'].sort_values(ascending=False)
# ▶ P17 has the highest correlation with revenue, so keep it and drop the rest of this group
b.remove('revenue')
b.remove('P17')
df.drop(columns=b, axis=1, inplace=True)

# 2nd elimination
part = 'part'
target = 'revenue'
category_features = ['City','City Group','Type','open_year','open_month']

columns = df.columns.tolist()
numeric_features = []
for col in columns:
    if col not in category_features :
        if (col != target) & (col != part):
            numeric_features.append(col)
            
plt.figure(figsize=(12,8))
sns.heatmap(df[df.part=='train'].corr(),vmax=0.8,annot=True)

a = [1,2,3,4,7,8,9,10,11,12,13,17,19,20,23,28]
b = ["P"+str(x) for x in a]
b.append('revenue')
df.loc[df.part=='train',b].corr()['revenue'].sort_values(ascending=False)
# ▶ keep P2 (the strongest of this group, mirroring the P17 step) and drop the rest
b.remove('P2')
b.remove('revenue')
df.drop(columns=b, axis=1, inplace=True)

plt.figure(figsize=(12,8))
sns.heatmap(df[df.part=='train'].corr(),vmax=0.8,annot=True)

part = 'part'
target = 'revenue'
category_features = ['City','City Group','Type','open_year','open_month']

columns = df.columns.tolist()
numeric_features = []
for col in columns:
    if col not in category_features :
        if (col != target) & (col != part):
            numeric_features.append(col)

for num in numeric_features:
    fig, ax = plt.subplots(figsize=(10,5))
    sns.regplot(x=df.loc[df.part=='train',num],y=df.loc[df.part=='train','revenue'], ax=ax)
    plt.show()
    
for num in numeric_features:
    fig, (ax1,ax2) = plt.subplots(1,2, figsize=(15,5))
    sns.distplot(df.loc[df.part=='train',num], ax=ax1)
    sns.distplot(df.loc[df.part=='test',num], ax=ax2)
    plt.show()

 

# one hot encoding

ohe_columns = ['City Group','Type']
ohe_df = pd.get_dummies(df[ohe_columns])

df = pd.concat([df, ohe_df],axis=1)
df.drop(['City','City Group','Type'],axis=1,inplace=True)

2. Modeling

from sklearn.preprocessing import PolynomialFeatures
poly_columns = ['P2','P5','P6','P21','P22','P29']
poly = PolynomialFeatures(degree=2, include_bias=False)
poly.fit(df[poly_columns])
df_columns_poly = poly.transform(df[poly_columns])
df_columns_poly = pd.DataFrame(df_columns_poly , columns = poly.get_feature_names())

# merge
df = pd.concat([df, df_columns_poly],axis=1)
df.drop(columns=['P2','P5','P6','P22','P29'],axis=1,inplace=True)
df.drop(columns=['P21'],axis=1, inplace=True)
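
A small caveat: poly.get_feature_names() above returns generic names (x0, x1, x0^2, ...). Passing the input column names makes the merged columns readable; on sklearn >= 1.0 the method is get_feature_names_out instead:

# readable names for the polynomial columns (exact method depends on the sklearn version)
print(poly.get_feature_names(poly_columns))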

# the polynomial feature columns appended above (positional slice, tied to this notebook's column order)
num_columns = df.columns.tolist()[14:]

from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
# note: the scaler is fit on train+test together here; fitting on train only would avoid leakage
for col in num_columns:
    df[col] = ss.fit_transform(np.array(df[col]).reshape(-1,1))
    
df['Type_DT'] = df['Type_DT'].astype(np.int64)
aa = df.columns.tolist()[4:9]   # the remaining one-hot columns, cast to int64
for col in aa:
    df[col] = df[col].astype(np.int64)

train = df[df.part=='train'].copy()   # .copy() so the in-place drops below don't warn
test = df[df.part=='test'].copy()
target = df.loc[df.part=='train','revenue']

train.drop(['part','revenue'],axis=1,inplace=True)
test.drop(['part','revenue'],axis=1,inplace=True)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train, target, test_size=0.3, random_state=0)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)


from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score

regressors = {
    "Linear Regression" : LinearRegression(),
    "Logistic Regression" : LogisticRegression(),
    "Decision Tree" : DecisionTreeRegressor(),
    "Random Forest" : RandomForestRegressor(), 
    "SVM" : SVR(),
    "KNN" : KNeighborsRegressor(),
    "XGBoost" : XGBRegressor()
}

results = pd.DataFrame(columns=['MAE','MSE','R2-score'])
for method, func in regressors.items():
    func.fit(X_train, y_train)
    pred = func.predict(X_test)
    results.loc[method] = [mean_absolute_error(y_test,pred),
                         mean_squared_error(y_test, pred),
                         r2_score(y_test, pred)]
results
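
cross_val_score is imported above but never used. As a sketch, a 5-fold cross-validated comparison is less sensitive to this particular 70/30 split than the table above (scoring='neg_mean_squared_error' is a built-in sklearn scorer; everything else reuses the dict from before):

# sketch: mean 5-fold CV RMSE per model, on the full training data
for method, func in regressors.items():
    scores = cross_val_score(func, train, target, cv=5, scoring='neg_mean_squared_error')
    print(method, np.sqrt(-scores.mean()))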

from sklearn.model_selection import GridSearchCV

knn = KNeighborsRegressor()
n_neighbors = [1,2,3,5,7,9,15]
hyperparameters = dict(n_neighbors=n_neighbors)
gridsearch = GridSearchCV(knn, hyperparameters, cv=5, verbose=0)
best_model = gridsearch.fit(train, target)
best_model
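
The file name below ('n9') suggests n_neighbors=9 won; the fitted GridSearchCV object can confirm this directly:

# the chosen n_neighbors and its mean CV score (R^2 by default for regressors)
print(best_model.best_params_)
print(best_model.best_score_)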

pred_knn_best = best_model.predict(test)
score = best_model.best_score_   # used only to tag the submission file name
submission = pd.DataFrame(columns=['Id','Prediction'])
submission['Id'] = test_df['Id']
submission['Prediction'] = pred_knn_best
submission.to_csv('{}_restaurant_knn_n9.csv'.format(np.round(score,3)), index=False)

 
