BOTTLE6 2021. 3. 20. 13:16

1. library import 

import pandas as pd

import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('seaborn-whitegrid')
from datetime import datetime

import statsmodels.api as sm
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX

from matplotlib.pyplot import rcParams
rcParams['figure.figsize'] = 10, 6

import itertools

2. load data

path = './AirPassengers/AirPassengers.csv'
data = pd.read_csv(path)
data = data.rename(columns={"Month":"month", "#Passengers":"passengers"})
data['month'] = pd.to_datetime(data['month'])
data = data.set_index('month')
data

3. Box-Jenkins Arima Procedure
- 3.1 Data Preprocessing
- 3.2 identify model to be tentatively entertained
- 3.3 estimate parameters
- 3.4 diagnosis check
- 3.5 use model to forecast

 

  • 3.1 Data Preprocessing
# Raw data plot 
fig = data.plot()

 

# Seasonal decomposition plot

# Observed = trend + seasonal + resid
decomposition = sm.tsa.seasonal_decompose(data['passengers'], model='additive', period=1)
fig = decomposition.plot()
fig.set_size_inches(10,10)
plt.show()

  • 3.2 identify model to be tentatively entertained 
## train:test = 8:2
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(data, test_size=0.2, shuffle=False, random_state=0)

# ACF, PACF Plot

fig, ax = plt.subplots(1,2, figsize=(10,5))
fig.suptitle("Raw Data")
sm.graphics.tsa.plot_acf(train_data.values.squeeze(), lags=30, ax=ax[0])
sm.graphics.tsa.plot_pacf(train_data.values.squeeze(), lags=30, ax=ax[1])

▶ 정상성 검토 : 평균과 분산이 일정하지 않으며, ACF 그래프가 천천히 떨어짐(정상성은 빠르게 떨어짐)

▶ 차분이 필요

# Differencing

diff_train_data = train_data.copy()
diff_train_data = diff_train_data['passengers'].diff()
diff_train_data = diff_train_data.dropna()
print("#### Raw Data ####")
print(train_data)
print("#### Differenced Data ####")
print(diff_train_data)

# differeneced data plot

plt.figure(figsize=(12,8))
plt.subplot(211)
plt.plot(train_data['passengers'])
plt.legend(['Raw Data (Non Stationary)'])
plt.subplot(212)
plt.plot(diff_train_data, 'orange')
plt.legend(['Differenced Data (Stationary)'])
plt.show()

# acf, pacf plot

fig, ax = plt.subplots(1,2, figsize=(10,5))
fig.suptitle("Differenced Data")
sm.graphics.tsa.plot_acf(diff_train_data.values.squeeze(), lags=30, ax=ax[0])
sm.graphics.tsa.plot_pacf(diff_train_data.values.squeeze(), lags=30, ax=ax[1]); #modify no to generate graph twice

# graphical method 
# acf : die out 
# pacf : AR(p) , p=1

 

  • 3.3 estimate parameters
model = ARIMA(train_data.values, order = (1,1,0))
model_fit = model.fit()
model_fit.summary()

 

  • 3.4 diagnosis check
import warnings
warnings.filterwarnings('ignore')

print("Examples of parameter combinations for Seasonal ARIMA")
p = range(0,3)
d = range(1,2)
q = range(0,3)
pdq = list(itertools.product(p,d,q))## 아주 훌륭한 도구

aic = []
for i in pdq:
    model = ARIMA(train_data.values, order=(i))
    model_fit = model.fit()
    print(f'ARIMA : {i} >> AIC : {round(model_fit.aic,2)}')
    aic.append(round(model_fit.aic, 2))

optimal = [(pdq[i], j) for i, j in enumerate(aic) if j==min(aic)]
optimal

model_opt = ARIMA(train_data.values, order = optimal[0][0])
model_opt_fit = model_opt.fit()
model_opt_fit.summary()

# use model to forecast

prediction = model_opt_fit.forecast(len(test_data))
predicted_value = prediction[0]
predicted_ub = prediction[2][:,0]
predicted_lb = prediction[2][:,1]
predict_index = list(test_data.index)

from sklearn.metrics import r2_score
r2 = r2_score(test_data, predicted_value)
r2

▶ 0.2244375434837036

fig, ax = plt.subplots(figsize=(12,6))
data.plot(ax=ax);
ax.vlines('1958-08-01', 0, 1000, linestyle='--', color='r', label='Start of Forecast');

ax.plot(predict_index, predicted_value, label='Prediction')
ax.fill_between(predict_index, predicted_lb, predicted_ub, color='k', alpha=0.1, label='0.95 prediction interval')

ax.legend(loc='upper left')
plt.suptitle(f'ARIMA {optimal[0][0]} Prediction Results (r2_score : {round(r2,2)})')
plt.show()

fig, ax = plt.subplots(figsize=(12,6))
ax.plot(predict_index, predicted_value, linestyle='--', label='Prediction', color='gray')
ax.plot(data[predict_index[0]:].index, data[predict_index[0]:], label='passengers',color='blue')
ax.fill_between(predict_index, predicted_lb, predicted_ub, color='k', alpha=0.1, label='0.95 prediction interval')
plt.legend(loc='upper left')

# SARIMA
# parameter search

print("Examples of parameter combinations for Seasonal ARIMA")
p = range(0,3)
d = range(1,2)
q = range(0,3)
pdq = list(itertools.product(p,d,q))
seasonal_pdq = [(x[0], x[1], x[2], 12) for x in list(itertools.product(p,d,q))]

aic = []
params = []
for i in pdq:
    for j in seasonal_pdq:
        try:
            model = SARIMAX(train_data.values, order = (i), seasonal_order = (j))
            model_fit = model.fit()
            print(f'SARIMA : {i}{j} >> AIC : {round(model_fit.aic,2)}')
            aic.append(round(model_fit.aic, 2))
            params.append((i,j))
        except:
            pass            
# search optimal parameters

optimal = [(params[i], j) for i, j in enumerate(aic) if j==min(aic)]
#optimal

optimal = [(((1,1,0),(1,1,2,12)),751.149)] # optimal 오류로 직접 선택

model_opt = SARIMAX(train_data.values, order = optimal[0][0][0], seasonal_order = optimal[0][0][1])
model_opt_fit = model_opt.fit()
model_opt_fit.summary()

 

  • 3.5 use model to forecast
prediction = model_opt_fit.get_forecast(len(test_data))
predicted_value = prediction.predicted_mean
predicted_ub = prediction.conf_int()[:,0]
predicted_lb = prediction.conf_int()[:,1]
predict_index = list(test_data.index)
r2 = r2_score(test_data, predicted_value)
fig, ax = plt.subplots(figsize=(12,6))
data.plot(ax=ax)
ax.vlines('1958-08-01', 0, 700, linestyle='--', color='r', label='Start of Forecast');
ax.plot(predict_index, predicted_value, label='Prediction')
ax.fill_between(predict_index, predicted_lb, predicted_ub, color='k', alpha=0.1, label='0.95 Prediction Interval')
ax.legend(loc='upper left')
plt.suptitle(f'SARIMA {optimal[0][0][0]}, {optimal[0][0][1]} Prediction Results (r2_score: {round(r2,2)})')
plt.show()

 

 

fig, ax = plt.subplots(figsize=(12,6))
ax.plot(predict_index, predicted_value, linestyle='--', label='Prediction', color='gray')
ax.plot(data[predict_index[0]:].index, data[predict_index[0]:], label='passengers',color='blue')
ax.fill_between(predict_index, predicted_lb, predicted_ub, color='k', alpha=0.1, label='0.95 prediction interval')
plt.legend(loc='upper left')

 

출처 : www.youtube.com/watch?v=rdR2fNDq6v0&list=PLpIPLT0Pf7IqSuMx237SHRdLd5ZA4AQwd&index=11