STUDY/ADP, 빅데이터분석기사
[arima] smp
BOTTLE6
2021. 3. 21. 18:12
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('seaborn-whitegrid')
from datetime import datetime
import statsmodels.api as sm
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
import itertools
path= './smp/smp.xlsx'
df = pd.read_excel(path, header=1)
df.head(3)
df =df.rename(columns = {'Unnamed: 0':'ym','육지':'smp'})
df = df.drop(['제주','통합','Unnamed: 4'],axis=1)
df['ym'] = pd.to_datetime(df.ym, format='%Y-%m')
df = df.set_index('ym')
df = df[df.smp>0]
df = df.reset_index().sort_values(by='ym', ascending=True).set_index("ym")
# raw data fig
fig = df.plot()
plt.title("SMP")
# Seasonal Decomposition
# observed = trend + seasonal + resid
# period(=freq) : seosonal을 볼때 주기
decomposition = sm.tsa.seasonal_decompose(df['smp'], model='additive', freq=12)
fig = decomposition.plot()
fig.set_size_inches(10,8)
plt.show()
# train_test_split
from sklearn.model_selection import train_test_split
#shuffle = False ▶ 시간순서대로..
train, test = train_test_split(df, test_size=0.2, shuffle=False, random_state=24)
train.shape, test.shape
# ACF, PACF plot
fig, ax = plt.subplots(1,2, figsize=(10,5))
fig.suptitle("RAW SMP")
sm.graphics.tsa.plot_acf(train.values.squeeze(), lags=30, ax=ax[0])
sm.graphics.tsa.plot_pacf(train.values.squeeze(), lags=30, ax=ax[1]);
#Differencing
diff_train = train.copy()
diff_train = diff_train.diff()
diff_train = diff_train.dropna()
print("## Raw Data ##")
print(train)
print("## Differenced Data")
print(diff_train)
# Differenced data plot
plt.figure(figsize=(12,8))
plt.subplot(211)
plt.plot(train)
plt.legend(["Raw SMP (Non Stationary)"])
plt.subplot(212)
plt.plot(diff_train,'orange')
plt.legend(["Differenced SMP (Stationary)"])
# acf, pacr plot
fig, ax = plt.subplots(1,2, figsize=(10,5))
fig.suptitle("Differenced SMP")
sm.graphics.tsa.plot_acf(diff_train.values.squeeze(), lags=20, ax=ax[0])
sm.graphics.tsa.plot_pacf(diff_train.values.squeeze(), lags=20, ax=ax[1]);
# MA(1)
model1 = ARIMA(train.values, order = (0,1,1))
model1_fit = model1.fit()
model1_fit.summary()
# AR(1)
model2 = ARIMA(train.values, order = (1,1,0))
model2_fit = model2.fit()
model2_fit.summary()
print("Examples of parameter combinations for seosonal ARIMA")
p=range(0,3)
d=range(1,2)
q=range(0,3)
pdq = list(itertools.product(p,d,q))
aic = []
for i in pdq:
model=ARIMA(train.values, order=(i))
model_fit = model.fit()
print(f"ARIMA : {i} >> AIC : {round(model_fit.aic,2)}")
aic.append(round(model_fit.aic,2))
optimal = [pdq[np.argmin(aic)], min(aic)]
optimal
model_opt = ARIMA(train.values, order = optimal[0])
model_opt_fit = model_opt.fit()
model_opt_fit.summary()
# use model to forecast
prediction = model_opt_fit.forecast(len(test))
predicted_value = prediction[0]
predicted_lb = prediction[2][:,0]
predicted_ub = prediction[2][:,1]
predict_index = list(test.index)
from sklearn.metrics import r2_score
r2 = r2_score(test, predicted_value)
r2 # -1.6689735655861928
fig, ax = plt.subplots(figsize=(12,6))
sns.lineplot(data=df, x=df.index, y='smp',ax=ax)
ax.vlines(datetime(2018,12,1),0,200, linestyle='--', color='r', label='Start of ForeCast')
ax.plot(predict_index, predicted_value, color='orange', label='Prediction')
plt.legend(loc='upper right')
plt.suptitle(f"ARIMA {optimal[0][0]} Prediction Results (r2 score : {round(r2,2)})")
fig, ax = plt.subplots(figsize=(12,6))
ax.plot(predict_index, predicted_value, linestyle='--', label='Prediction',color='gray')
ax.plot(df[predict_index[0]:].index, df[predict_index[0]:], label='smp', color='blue')
ax.fill_between(predict_index, predicted_lb, predicted_ub, color='k', alpha=0.1, label='0.95 prediction interval')
# SARIMA
# parameter search
print("Examples of parameter combinataions for Seasonal ARIMA")
p = range(0,3)
d = range(1,2)
q = range(0,3)
pdq = list(itertools.product(p,d,q))
seasonal_pdq = [(x[0], x[1], x[2], 12) for x in list(itertools.product(p,d,q))]
aic = []
params = []
for i in pdq:
for j in seasonal_pdq:
try:
model = SARIMAX(train.values, order = (i), seasoanl_order = (j))
model_fit = model.fit()
print(f"SARIMA : {i}{j} >> AIC {round(model_fit.aic,2)}")
aic.append(round(model_fit.aic,2))
params.append((i,j))
except:
pass
optimal = [params[np.argmin(aic)], min(aic)]
optimal
model_opt = SARIMAX(train.values, order=optimal[0][0], seasonal_order = optimal[0][1])
model_opt_fit = model_opt.fit()
model_opt_fit.summary()
# use model to forecast
prediction = model_opt_fit.get_forecast(len(test))
predicted_value = prediction.predicted_mean
predicted_lb = prediction.conf_int()[:,0]
predicted_ub = prediction.conf_int()[:,1]
predict_index = list(test.index)
r2 = r2_score(test, predicted_value)
r2 #-13.29835713570155
fig, ax = plt.subplots(figsize=(12,6))
sns.lineplot(data=df, x=df.index, y='smp', label='smp')
ax.vlines(datetime(2018,12,1), 0, 200, linestyle='--', color='r', label='Start of Forecast')
ax.plot(predict_index, predicted_value, label='Prediction')
ax.fill_between(predict_index, predicted_lb, predicted_ub, color='k', alpha=0.1, label='0.95 prediction interval')
ax.legend(loc='upper left')
plt.show()
fig, ax = plt.subplots(figsize=(12,6))
ax.plot(predict_index, predicted_value, linestyle='--', label='Prediction', color='gray')
ax.plot(df[predict_index[0]:].index, df[predict_index[0]:], label='smp', color='blue')
ax.fill_between(predict_index, predicted_lb, predicted_ub, color='k', alpha=0.1, label='0.95 prediction interval')
plt.suptitle(f"SARIMA {optimal[0]} Prediction Results (r2 score : {round(r2,2)})")
plt.legend(loc='upper left')