
[regression] house_price_prediction

BOTTLE6 2021. 3. 19. 22:14

1. EDA and data preprocessing

2. Handling multicollinearity

3. Splitting into Train/Valid/Test sets

4. Modeling with interaction terms and evaluation

 

# library
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# data 
train = pd.read_csv("./house-prices-advanced-regression-techniques/train.csv")
test = pd.read_csv("./house-prices-advanced-regression-techniques/test.csv")

# data shape
print(train.shape, test.shape)

# data separation
target = 'SalePrice'
features = test.columns.tolist()
features.remove('Id')

# target visualization
train[target].plot.hist(bins=100)

1. EDA and data preprocessing

# check null data
columns_withnull = train.columns[train.isnull().sum()!=0].tolist()
print(train.isnull().sum().sort_values(ascending=False).head(20))
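
The same check reads more naturally as a ratio of missing values, which makes the cutoff used next easier to judge. A small sketch, not part of the original flow:

# fraction of nulls per column; the columns dropped below are the ones near the top
null_ratio = train.isnull().mean().sort_values(ascending=False)
print(null_ratio[null_ratio > 0].head(10))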

# remove features that have over 1,000 null values
features.remove('PoolQC')
features.remove('MiscFeature')
features.remove('Alley')
features.remove('Fence')

# feature separation
category_f = train[features].select_dtypes(include='object').columns.tolist()
num_f = train[features].select_dtypes(exclude='object').columns.tolist()
print("category_f 개수 : ", len(category_f))
print("num_f 개수 : ", len(num_f))

import warnings
warnings.filterwarnings('ignore')

# show numeric_features
for col in num_f:
    train[col].fillna(train[col].mean(), inplace=True) # handling null data
    fig, ax = plt.subplots(figsize=(6,2))
    sns.distplot(train[col], ax=ax) # distplot is deprecated in newer seaborn; histplot replaces it
    ax.set_title("{}".format(col))
    
# another way to show numeric_features
'''
import time
tic = time.time()
f = pd.melt(train, value_vars = num_f)
g = sns.FacetGrid(f, col='variable', col_wrap=2, sharex=False, sharey=False)
g = g.map(sns.distplot, "value")
print(time.time()-tic, "seconds")
'''

# show category_features
for col in category_f:
    train[col].fillna("Unknown",inplace=True) # handling null data 
    print('-'*50)
    print("<< " + str(col) + " >>")
    print(train[col].value_counts())
    
# show category_features by boxplot
for c in category_f:
    train[c] = train[c].astype("category")
    if train[c].isnull().any(): # defensive only: nulls were already filled with "Unknown" above
        train[c] = train[c].cat.add_categories(['MISSING'])
        train[c] = train[c].fillna("MISSING")
def boxplot(x, y, **kwargs):
    sns.boxplot(x=x, y=y)
    plt.xticks(rotation=90)
f = pd.melt(train, id_vars=['SalePrice'], value_vars=category_f)
g = sns.FacetGrid(f, col='variable', col_wrap=2, sharex=False, sharey=False)
g = g.map(boxplot, "value", "SalePrice")

# handling null values in the test data (numeric nulls use the train means, to avoid leakage)
for col in category_f:
    test[col].fillna("Unknown", inplace=True)
for col in num_f:
    test[col].fillna(train[col].mean(), inplace=True)

print("Skewnews :",train[target].skew())
print("Kurtosis :",train[target].kurt())

2. Multiple linear regression on numeric features and removing multicollinearity

import statsmodels.api as sm
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    train[num_f], train['SalePrice'], test_size=0.3, random_state=24)

x_data = X_train
target = y_train # note: this overwrites the earlier target = 'SalePrice' string
x_data1 = sm.add_constant(x_data, has_constant="add")
x_data1

from sklearn.metrics import mean_squared_error

#modeling
multi_model = sm.OLS(target, x_data1)
fitted_multi_model= multi_model.fit()
x_test1 = sm.add_constant(X_test, has_constant="add")
print(mean_squared_error(y_test, fitted_multi_model.predict(x_test1)))
fitted_multi_model.summary()

# Check multicollinearity

from statsmodels.stats.outliers_influence import variance_inflation_factor
vif = pd.DataFrame()
vif['VIF Factor'] = [variance_inflation_factor(x_data1.values, i)
                     for i in range(x_data1.shape[1])]
vif['features'] = x_data1.columns
# inspect the largest VIFs; values over 10 signal severe multicollinearity
vif.sort_values(by='VIF Factor', ascending=False)

ex_columns = vif.sort_values(ascending=False, by="VIF Factor")[:8]['features'].values.tolist()
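
Dropping the eight worst columns in one pass is quick, but VIF values shift after every removal, so a stepwise variant is often preferred. A minimal sketch, not used in the rest of the post; `drop_high_vif` and its threshold are illustrative, and the intercept column is kept out of the search:

# recompute VIF after each drop and stop once every column is under the threshold
def drop_high_vif(X, thresh=10.0):
    X = X.copy()
    while True:
        vifs = pd.Series(
            [variance_inflation_factor(X.values, i) for i in range(X.shape[1])],
            index=X.columns)
        worst = vifs.drop('const', errors='ignore').idxmax()
        if vifs[worst] < thresh:
            return X
        X = X.drop(columns=worst)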

# data2 is a copy of data1 that excludes the features whose VIF is over 10
x_data2 = x_data1.copy().drop(columns=ex_columns,axis=1)
x_test2 = x_test1.copy().drop(columns=ex_columns,axis=1)
x_data2.columns

multi_model2 = sm.OLS(target, x_data2)
fitted_multi_model2 = multi_model2.fit()
print(mean_squared_error(y_test, fitted_multi_model2.predict(x_test2)))
fitted_multi_model2.summary()

▶ The AIC moved from 24180 to 24230. A lower AIC is better, so the fit itself is marginally worse, but the severe multicollinearity is gone.
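
The AIC values can be read straight off the fitted statsmodels results, so the comparison is one line:

# statsmodels exposes AIC on the fitted results object
print(fitted_multi_model.aic, fitted_multi_model2.aic)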

# remove features whose coefficient p-value is > 0.05
ex_columns2 = ['LotFrontage','BsmtHalfBath','GarageYrBlt',
               'OpenPorchSF','EnclosedPorch','3SsnPorch','PoolArea','MiscVal','MoSold','YrSold']
x_data3 = x_data2.copy().drop(columns=ex_columns2,axis=1)
x_test3 = x_test2.copy().drop(columns=ex_columns2,axis=1)

multi_model3 = sm.OLS(target, x_data3)
fitted_multi_model3 = multi_model3.fit()
print(mean_squared_error(y_test, fitted_multi_model3.predict(x_test3)))
fitted_multi_model3.summary()

sns.heatmap(x_data3.corr(), cmap='Blues') # look for remaining highly correlated pairs

x_data4 = x_data3.copy().drop(columns=['GarageArea','YearRemodAdd','TotRmsAbvGrd'],axis=1)
x_test4 = x_test3.copy().drop(columns=['GarageArea','YearRemodAdd','TotRmsAbvGrd'],axis=1)
multi_model4 = sm.OLS(target, x_data4)
fitted_multi_model4 = multi_model4.fit()
print(mean_squared_error(y_test, fitted_multi_model4.predict(x_test4)))
fitted_multi_model4.summary()

3. Splitting into Train/Valid/Test sets

# train_valid_test_split
num_f = x_data4.columns.tolist()[1:] # drop the 'const' column added by add_constant
features = num_f + category_f

from sklearn.model_selection import train_test_split

X_train, X_valtest, y_train, y_valtest = train_test_split(train[features],train['SalePrice'], test_size=0.4, random_state=0)
X_valid, X_test, y_valid, y_test = train_test_split(X_valtest, y_valtest, test_size=0.33, random_state=0)

print(X_train.shape, X_valid.shape, X_test.shape)
print(y_train.shape, y_valid.shape, y_test.shape)

# visualization of the target across the three splits
fig, axes = plt.subplots(1, 3, figsize=(12,4))
datas = [y_train, y_valid, y_test]
names = ['train', 'valid', 'test']
for data, name, ax in zip(datas, names, axes):
    sns.kdeplot(data, label=name, ax=ax)
    ax.set_title(name)

# visualization of features across the three splits
for col in num_f:
    datas = [X_train, X_valid, X_test]
    fig, axes = plt.subplots(1, 3, figsize=(12,4))
    for data, name, ax in zip(datas, names, axes):
        sns.kdeplot(data[col], label=name, ax=ax)

# one-hot encoding: concatenate the splits first so all three share the same dummy columns
df = pd.concat([X_train, X_valid, X_test], axis=0)
df = pd.get_dummies(df)

X_train_trans = df.iloc[0:X_train.shape[0]]
X_valid_trans = df.iloc[X_train.shape[0]:X_train.shape[0]+X_valid.shape[0]]
X_test_trans = df.iloc[X_train.shape[0]+X_valid.shape[0]:]
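
An alternative that avoids concatenating is to encode each split separately and realign the columns to the training frame. A sketch, assuming unseen categories should become all-zero dummies; the `_alt` names are illustrative:

# reindex fills dummy columns missing from a split with 0
X_train_alt = pd.get_dummies(X_train)
X_valid_alt = pd.get_dummies(X_valid).reindex(columns=X_train_alt.columns, fill_value=0)
X_test_alt = pd.get_dummies(X_test).reindex(columns=X_train_alt.columns, fill_value=0)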


4. Modeling with interaction terms and evaluation

# polynomial features: degree=2 adds squared terms and pairwise interactions
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=2).fit(X_train_trans)
X_train_poly = poly.transform(X_train_trans)
X_valid_poly = poly.transform(X_valid_trans)
X_test_poly = poly.transform(X_test_trans)
print("X_train_poly.shape: ", X_train_poly.shape)
print("X_valid_poly.shape: ", X_valid_poly.shape)
print("X_test_poly.shape: ", X_test_poly.shape)

dd = pd.DataFrame(X_train_poly)
dd.columns = poly.get_feature_names() # renamed to get_feature_names_out() in scikit-learn 1.0+
dd
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
lr.fit(X_train_trans, y_train)
lr_poly = LinearRegression()
lr_poly.fit(X_train_poly, y_train)
print("train score : {:.2f}".format(lr.score(X_train_trans, y_train)))
print("valid score : {:.2f}".format(lr.score(X_valid_trans, y_valid)))
print("poly train score : {:.2f}".format(lr_poly.score(X_train_poly, y_train)))
print("poly valid score : {:.2f}".format(lr_poly.score(X_valid_poly, y_valid)))

from sklearn.svm import SVR

# SalePrice is continuous, so the regressor SVR is used (SVC is a classifier)
svm = SVR()
svm.fit(X_train_trans, y_train)
svm_poly = SVR()
svm_poly.fit(X_train_poly, y_train)
print("train score : {:.2f}".format(svm.score(X_train_trans, y_train)))
print("valid score : {:.2f}".format(svm.score(X_valid_trans, y_valid)))
print("poly train score : {:.2f}".format(svm_poly.score(X_train_poly, y_train)))
print("poly valid score : {:.2f}".format(svm_poly.score(X_valid_poly, y_valid)))


from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor()
rf.fit(X_train_trans, y_train)
rf_poly = RandomForestRegressor()
rf_poly.fit(X_train_poly, y_train)
print("train score : {:.2f}".format(rf.score(X_train_trans, y_train)))
print("valid score : {:.2f}".format(rf.score(X_valid_trans, y_valid)))
print("poly train score : {:.2f}".format(rf_poly.score(X_train_poly, y_train)))
print("poly valid score : {:.2f}".format(rf_poly.score(X_valid_poly, y_valid)))


from sklearn.preprocessing import LabelEncoder

X_train2 = X_train.copy()
X_valid2 = X_valid.copy()
# fit each column's encoder on train+valid together so the integer codes are
# consistent across splits (fitting separately would assign different codes)
for col in category_f:
    le = LabelEncoder()
    le.fit(pd.concat([X_train2[col], X_valid2[col]]))
    X_train2[col] = le.transform(X_train2[col])
    X_valid2[col] = le.transform(X_valid2[col])

from sklearn.linear_model import LinearRegression

lr = LinearRegression()
lr.fit(X_train2, y_train)
print("train score : {:.2f}".format(lr.score(X_train2, y_train)))
print("valid score : {:.2f}".format(lr.score(X_valid2, y_valid)))


from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor()
rf.fit(X_train2, y_train)
print("train score : {:.2f}".format(rf.score(X_train2, y_train)))
print("valid score : {:.2f}".format(rf.score(X_valid2, y_valid)))

from sklearn.linear_model import Lasso

lasso = Lasso(alpha=10)
lasso.fit(X_train2, y_train)
print("train score : {:.2f}".format(lasso.score(X_train2, y_train)))
print("valid score : {:.2f}".format(lasso.score(X_valid2, y_valid)))

 

from sklearn.metrics import mean_squared_error, r2_score

def MAPE(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

# evaluate the label-encoded random forest on the matching validation frame
# (rf was last fit on X_train2, so X_valid2 is the right input here)
mse = mean_squared_error(y_valid, rf.predict(X_valid2))
mape = MAPE(y_valid, rf.predict(X_valid2))
r2 = r2_score(y_valid, rf.predict(X_valid2))

print("MSE :", mse, " MAPE :", mape, " R2 :", r2)