STUDY/ADP, 빅데이터분석기사
[ARIMA] airplane
BOTTLE6
2021. 3. 20. 13:16
1. library import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('seaborn-whitegrid')
from datetime import datetime
import statsmodels.api as sm
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from matplotlib.pyplot import rcParams
rcParams['figure.figsize'] = 10, 6
import itertools
2. load data
path = './AirPassengers/AirPassengers.csv'
data = pd.read_csv(path)
data = data.rename(columns={"Month":"month", "#Passengers":"passengers"})
data['month'] = pd.to_datetime(data['month'])
data = data.set_index('month')
data
3. Box-Jenkins Arima Procedure
- 3.1 Data Preprocessing
- 3.2 identify model to be tentatively entertained
- 3.3 estimate parameters
- 3.4 diagnosis check
- 3.5 use model to forecast
- 3.1 Data Preprocessing
# Raw data plot
fig = data.plot()
# Seasonal decomposition plot
# Observed = trend + seasonal + resid
decomposition = sm.tsa.seasonal_decompose(data['passengers'], model='additive', period=1)
fig = decomposition.plot()
fig.set_size_inches(10,10)
plt.show()
- 3.2 identify model to be tentatively entertained
## train:test = 8:2
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(data, test_size=0.2, shuffle=False, random_state=0)
# ACF, PACF Plot
fig, ax = plt.subplots(1,2, figsize=(10,5))
fig.suptitle("Raw Data")
sm.graphics.tsa.plot_acf(train_data.values.squeeze(), lags=30, ax=ax[0])
sm.graphics.tsa.plot_pacf(train_data.values.squeeze(), lags=30, ax=ax[1])
▶ 정상성 검토 : 평균과 분산이 일정하지 않으며, ACF 그래프가 천천히 떨어짐(정상성은 빠르게 떨어짐)
▶ 차분이 필요
# Differencing
diff_train_data = train_data.copy()
diff_train_data = diff_train_data['passengers'].diff()
diff_train_data = diff_train_data.dropna()
print("#### Raw Data ####")
print(train_data)
print("#### Differenced Data ####")
print(diff_train_data)
# differeneced data plot
plt.figure(figsize=(12,8))
plt.subplot(211)
plt.plot(train_data['passengers'])
plt.legend(['Raw Data (Non Stationary)'])
plt.subplot(212)
plt.plot(diff_train_data, 'orange')
plt.legend(['Differenced Data (Stationary)'])
plt.show()
# acf, pacf plot
fig, ax = plt.subplots(1,2, figsize=(10,5))
fig.suptitle("Differenced Data")
sm.graphics.tsa.plot_acf(diff_train_data.values.squeeze(), lags=30, ax=ax[0])
sm.graphics.tsa.plot_pacf(diff_train_data.values.squeeze(), lags=30, ax=ax[1]); #modify no to generate graph twice
# graphical method
# acf : die out
# pacf : AR(p) , p=1
- 3.3 estimate parameters
model = ARIMA(train_data.values, order = (1,1,0))
model_fit = model.fit()
model_fit.summary()
- 3.4 diagnosis check
import warnings
warnings.filterwarnings('ignore')
print("Examples of parameter combinations for Seasonal ARIMA")
p = range(0,3)
d = range(1,2)
q = range(0,3)
pdq = list(itertools.product(p,d,q))## 아주 훌륭한 도구
aic = []
for i in pdq:
model = ARIMA(train_data.values, order=(i))
model_fit = model.fit()
print(f'ARIMA : {i} >> AIC : {round(model_fit.aic,2)}')
aic.append(round(model_fit.aic, 2))
optimal = [(pdq[i], j) for i, j in enumerate(aic) if j==min(aic)]
optimal
model_opt = ARIMA(train_data.values, order = optimal[0][0])
model_opt_fit = model_opt.fit()
model_opt_fit.summary()
# use model to forecast
prediction = model_opt_fit.forecast(len(test_data))
predicted_value = prediction[0]
predicted_ub = prediction[2][:,0]
predicted_lb = prediction[2][:,1]
predict_index = list(test_data.index)
from sklearn.metrics import r2_score
r2 = r2_score(test_data, predicted_value)
r2
▶ 0.2244375434837036
fig, ax = plt.subplots(figsize=(12,6))
data.plot(ax=ax);
ax.vlines('1958-08-01', 0, 1000, linestyle='--', color='r', label='Start of Forecast');
ax.plot(predict_index, predicted_value, label='Prediction')
ax.fill_between(predict_index, predicted_lb, predicted_ub, color='k', alpha=0.1, label='0.95 prediction interval')
ax.legend(loc='upper left')
plt.suptitle(f'ARIMA {optimal[0][0]} Prediction Results (r2_score : {round(r2,2)})')
plt.show()
fig, ax = plt.subplots(figsize=(12,6))
ax.plot(predict_index, predicted_value, linestyle='--', label='Prediction', color='gray')
ax.plot(data[predict_index[0]:].index, data[predict_index[0]:], label='passengers',color='blue')
ax.fill_between(predict_index, predicted_lb, predicted_ub, color='k', alpha=0.1, label='0.95 prediction interval')
plt.legend(loc='upper left')
# SARIMA
# parameter search
print("Examples of parameter combinations for Seasonal ARIMA")
p = range(0,3)
d = range(1,2)
q = range(0,3)
pdq = list(itertools.product(p,d,q))
seasonal_pdq = [(x[0], x[1], x[2], 12) for x in list(itertools.product(p,d,q))]
aic = []
params = []
for i in pdq:
for j in seasonal_pdq:
try:
model = SARIMAX(train_data.values, order = (i), seasonal_order = (j))
model_fit = model.fit()
print(f'SARIMA : {i}{j} >> AIC : {round(model_fit.aic,2)}')
aic.append(round(model_fit.aic, 2))
params.append((i,j))
except:
pass
# search optimal parameters
optimal = [(params[i], j) for i, j in enumerate(aic) if j==min(aic)]
#optimal
optimal = [(((1,1,0),(1,1,2,12)),751.149)] # optimal 오류로 직접 선택
model_opt = SARIMAX(train_data.values, order = optimal[0][0][0], seasonal_order = optimal[0][0][1])
model_opt_fit = model_opt.fit()
model_opt_fit.summary()
- 3.5 use model to forecast
prediction = model_opt_fit.get_forecast(len(test_data))
predicted_value = prediction.predicted_mean
predicted_ub = prediction.conf_int()[:,0]
predicted_lb = prediction.conf_int()[:,1]
predict_index = list(test_data.index)
r2 = r2_score(test_data, predicted_value)
fig, ax = plt.subplots(figsize=(12,6))
data.plot(ax=ax)
ax.vlines('1958-08-01', 0, 700, linestyle='--', color='r', label='Start of Forecast');
ax.plot(predict_index, predicted_value, label='Prediction')
ax.fill_between(predict_index, predicted_lb, predicted_ub, color='k', alpha=0.1, label='0.95 Prediction Interval')
ax.legend(loc='upper left')
plt.suptitle(f'SARIMA {optimal[0][0][0]}, {optimal[0][0][1]} Prediction Results (r2_score: {round(r2,2)})')
plt.show()
fig, ax = plt.subplots(figsize=(12,6))
ax.plot(predict_index, predicted_value, linestyle='--', label='Prediction', color='gray')
ax.plot(data[predict_index[0]:].index, data[predict_index[0]:], label='passengers',color='blue')
ax.fill_between(predict_index, predicted_lb, predicted_ub, color='k', alpha=0.1, label='0.95 prediction interval')
plt.legend(loc='upper left')
출처 : www.youtube.com/watch?v=rdR2fNDq6v0&list=PLpIPLT0Pf7IqSuMx237SHRdLd5ZA4AQwd&index=11