library(tidyverse)
library(svglite)
svglite::svglite('output.svg')
── Attaching packages ─────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.2 ── ✔ ggplot2 3.4.0 ✔ purrr 0.3.4 ✔ tibble 3.1.8 ✔ dplyr 1.0.10 ✔ tidyr 1.2.1 ✔ stringr 1.4.1 ✔ readr 2.1.3 ✔ forcats 0.5.2 Warning message: "程辑包'ggplot2'是用R版本4.2.2 来建造的" ── Conflicts ────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ── ✖ dplyr::filter() masks stats::filter() ✖ dplyr::lag() masks stats::lag() Warning message: "程辑包'svglite'是用R版本4.2.2 来建造的"
df <- bind_rows(lapply(list.files( 'C:\\Users\\asheo\\Desktop', pattern = 'csv',
full.names = TRUE), read.csv, stringsAsFactors = FALSE)) %>%
mutate(日期 = as.Date(gsub('年|月|日', '-', 日期))) %>%
distinct() %>%
arrange(日期)
DT::datatable(df)
日期 | 天气状况 | 最高温度 | 最低温度 | 白天风力风向 | 夜晚风力风向 | X | X.1 | X.2 |
---|---|---|---|---|---|---|---|---|
<date> | <chr> | <chr> | <chr> | <chr> | <chr> | <lgl> | <lgl> | <chr> |
2018-01-01 | 多云/多云 | 22℃ | 12℃ | 无持续风向<3级 | 无持续风向<3级 | NA | NA | 22℃ |
2018-01-02 | 多云/多云 | 22℃ | 15℃ | 无持续风向<3级 | 无持续风向<3级 | NA | NA | 22℃ |
2018-01-03 | 多云/阴 | 23℃ | 15℃ | 无持续风向<3级 | 无持续风向<3级 | NA | NA | 23℃ |
2018-01-04 | 多云/小雨 | 21℃ | 16℃ | 无持续风向<3级 | 无持续风向<3级 | NA | NA | 21℃ |
2018-01-05 | 阴/小雨 | 19℃ | 13℃ | 无持续风向<3级 | 无持续风向<3级 | NA | NA | 19℃ |
2018-01-06 | 小雨-中雨/中雨-大雨 | 15℃ | 11℃ | 无持续风向<3级 | 无持续风向<3级 | NA | NA | 15℃ |
2018-01-07 | 大雨/中雨 | 15℃ | 7℃ | 无持续风向<3级 | 北风4~5级 | NA | NA | 15℃ |
2018-01-08 | 中雨/小雨-中雨 | 12℃ | 5℃ | 北风4~5级 | 北风3~4级 | NA | NA | 12℃ |
2018-01-09 | 小雨/阴 | 9℃ | 6℃ | 无持续风向<3级 | 无持续风向<3级 | NA | NA | 9℃ |
⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ |
2021-08-23 | 多云/多云 | 36℃ | 27℃ | 北风1-2级 | 北风1-2级 | NA | NA | 36℃ |
2021-08-24 | 多云/多云 | 36℃ | 27℃ | 北风1-2级 | 北风1-2级 | NA | NA | 36℃ |
2021-08-25 | 雷阵雨/多云 | 33℃ | 26℃ | 北风1-2级 | 北风1-2级 | NA | NA | 33℃ |
2021-08-26 | 雷阵雨/雷阵雨 | 35℃ | 27℃ | 北风1-2级 | 北风1-2级 | NA | NA | 35℃ |
2021-08-27 | 雷阵雨/雷阵雨 | 35℃ | 26℃ | 北风1-2级 | 北风1-2级 | NA | NA | 35℃ |
2021-08-28 | 雷阵雨/多云 | 33℃ | 26℃ | 北风1-2级 | 北风1-2级 | NA | NA | 33℃ |
2021-08-29 | 雷阵雨/雷阵雨 | 32℃ | 25℃ | 北风1-2级 | 北风1-2级 | NA | NA | 32℃ |
2021-08-30 | 阵雨/阵雨 | 34℃ | 26℃ | 北风1-2级 | 北风1-2级 | NA | NA | 34℃ |
2021-08-31 | 雷阵雨/阵雨 | 32℃ | 26℃ | 北风1-2级 | 北风1-2级 | NA | NA | 32℃ |
if (all(!is.na(df$日期))) {
print('序列不含缺失值')
}
if (all(!is.na(df$最低温度))) {
print('序列不含缺失值')
}
if (all(!is.na(df$最高温度))) {
print('序列不含缺失值')
}
[1] "序列不含缺失值" [1] "序列不含缺失值" [1] "序列不含缺失值"
df <- df %>%
mutate(across(c(最高温度, 最低温度), ~as.numeric(str_remove(., '℃'))))
maxTemp <- df$最高温度
minTemp <- df$最低温度
tibble(maxTemp, minTemp)
maxTemp | minTemp |
---|---|
<dbl> | <dbl> |
22 | 12 |
22 | 15 |
23 | 15 |
21 | 16 |
19 | 13 |
15 | 11 |
15 | 7 |
⋮ | ⋮ |
33 | 26 |
35 | 27 |
35 | 26 |
33 | 26 |
32 | 25 |
34 | 26 |
32 | 26 |
plotSeries <- function(date, values, plotName, yName = 'daily temp'){
Series <- tibble(date = as.Date(date), Temp = values)
ggplot(Series, aes(x = date, y = Temp)) +
geom_line(color = 'blue', linetype = 'dashed') +
geom_point(color = 'red', shape = 21, fill = 'white') +
geom_smooth(method = 'gam', color = 'green', linetype = 'dotted') +
labs(title = plotName, x = 'Date', y = yName, color = 'Line Type', shape = 'Point Type') +
scale_color_manual(values = c('blue', 'green'), labels = c('Dashed Line', 'Dotted Line')) +
scale_shape_manual(values = c(21, 22), labels = c('Solid Point', 'Hollow Point')) +
theme_bw() +
theme(
plot.title = element_text(size = rel(2)),
axis.title.x = element_text(size = rel(1.2)),
axis.title.y = element_text(size = rel(1.2)),
axis.text.x = element_text(angle = -45, hjust = 0),
legend.position = 'bottom',
legend.title.align = 0.5,
legend.text.align = -0.5
)
}
plotSeries(df$日期, maxTemp, 'Daily Maximum Temperature')
ggsave('SARIMA_1.svg', device = 'svg')
日最低温度显然具有季节性,且周期为1年,先增大后减小,表现为二次趋势。
plotSeries(df$日期, minTemp, 'Daily Minimum Temperature')
ggsave('SARIMA_2.svg', device = 'svg')
日最低温度显然具有季节性,且周期为1年;每年的日最低温度变化趋势同日最高温度,先增大后减小,表现为二次趋势。
plotSeries(df$日期, maxTemp-minTemp, 'Daily Temperature Range', 'temp range')
ggsave('SARIMA_3.svg', device = 'svg')
看起来日温差波动趋势不显著,变化较平滑、平缓。
library(forecast)
Warning message: "程辑包'forecast'是用R版本4.2.2 来建造的" Registered S3 method overwritten by 'quantmod': method from as.zoo.data.frame zoo
ACFandPACF <- function(x, n1, n2) {
acf_values <- forecast::Acf(x, plot=FALSE)
pacf_values <- forecast::Pacf(x, plot=FALSE)
acf_df <- tibble(lag = 1:length(acf_values$acf), acf = acf_values$acf)
pacf_df <- tibble(lag = 1:length(pacf_values$acf), pacf = pacf_values$acf)
gg_acf <- acf_df %>%
ggplot(aes(x = lag, y = acf)) +
geom_hline(yintercept = 0, color = 'gray50') +
geom_segment(aes(xend = lag, yend = 0), color = 'gray50') +
geom_bar(stat = 'identity', fill = '#0072B2', alpha = 0.8) +
labs(title = 'Autocorrelation Function (ACF)', x = 'Lag', y = 'ACF') +
theme_minimal() +
theme(
plot.title = element_text(size = 16, face = 'bold'),
axis.title = element_text(size = 14),
axis.text = element_text(size = 12),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank()
)
gg_pacf <- pacf_df %>%
ggplot(aes(x = lag, y = pacf)) +
geom_hline(yintercept = 0, color = 'gray50') +
geom_segment(aes(xend = lag, yend = 0), color = 'gray50') +
geom_bar(stat = 'identity', fill = '#E69F00', alpha = 0.8) +
labs(title = 'Partial Autocorrelation Function (PACF)', x = 'Lag', y = 'PACF') +
theme_minimal() +
theme(
plot.title = element_text(size = 16, face = 'bold'),
axis.title = element_text(size = 14),
axis.text = element_text(size = 12),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank()
)
print(gg_acf)
ggsave(n1, device = 'svg')
print(gg_pacf)
ggsave(n2, device = 'svg')
}
ACFandPACF(maxTemp, 'SARIMA_4.svg', 'SARIMA_5.svg')
日最高温度序列的ACF显然无截尾性,PACF有待进一步分析:不会是MA序列。
ACFandPACF(minTemp, 'SARIMA_6.svg', 'SARIMA_7.svg')
日最低温度序列的ACF显然无截尾性,PACF有待进一步分析:不会是MA序列。
ACFandPACF(maxTemp-minTemp, 'SARIMA_8.svg', 'SARIMA_9.svg')
同上,日温差序列有可能是ARMA序列。
# 单位根过程检验
library(fUnitRoots)
adfTest(minTemp); cat('=================================')
adfTest(maxTemp); cat('=================================')
adfTest(maxTemp-minTemp)
Warning message: "程辑包'fUnitRoots'是用R版本4.2.3 来建造的"
Title: Augmented Dickey-Fuller Test Test Results: PARAMETER: Lag Order: 1 STATISTIC: Dickey-Fuller: -1.417 P VALUE: 0.1647 Description: Sun May 28 03:52:33 2023 by user: asheo
=================================
Title: Augmented Dickey-Fuller Test Test Results: PARAMETER: Lag Order: 1 STATISTIC: Dickey-Fuller: -1.3691 P VALUE: 0.18 Description: Sun May 28 03:52:33 2023 by user: asheo
=================================
Warning message in adfTest(maxTemp - minTemp): "p-value smaller than printed p-value"
Title: Augmented Dickey-Fuller Test Test Results: PARAMETER: Lag Order: 1 STATISTIC: Dickey-Fuller: -3.5743 P VALUE: 0.01 Description: Sun May 28 03:52:33 2023 by user: asheo
结论:不能拒绝日最高温度与最低温度不平稳的原假设,拒绝日温差不平稳的原假设,认为日温差是平稳的。
# KPSS趋势检验
library(tseries)
kpss.test(minTemp); cat('=================================')
kpss.test(maxTemp); cat('=================================')
kpss.test(maxTemp-minTemp)
KPSS Test for Level Stationarity data: minTemp KPSS Level = 0.51984, Truncation lag parameter = 7, p-value = 0.0372
=================================
KPSS Test for Level Stationarity data: maxTemp KPSS Level = 0.70795, Truncation lag parameter = 7, p-value = 0.01282
=================================
Warning message in kpss.test(maxTemp - minTemp): "p-value smaller than printed p-value"
KPSS Test for Level Stationarity data: maxTemp - minTemp KPSS Level = 0.78308, Truncation lag parameter = 7, p-value = 0.01
结论:不能拒绝日最高温度与日最低温度序列趋势显著的原假设;认为日温差没有趋势。
# PP单位根检验
pp.test(minTemp); cat('=================================')
pp.test(maxTemp); cat('=================================')
pp.test(maxTemp-minTemp)
Warning message in pp.test(minTemp): "p-value smaller than printed p-value"
Phillips-Perron Unit Root Test data: minTemp Dickey-Fuller Z(alpha) = -39.386, Truncation lag parameter = 7, p-value = 0.01 alternative hypothesis: stationary
=================================
Warning message in pp.test(maxTemp): "p-value smaller than printed p-value"
Phillips-Perron Unit Root Test data: maxTemp Dickey-Fuller Z(alpha) = -72.077, Truncation lag parameter = 7, p-value = 0.01 alternative hypothesis: stationary
=================================
Warning message in pp.test(maxTemp - minTemp): "p-value smaller than printed p-value"
Phillips-Perron Unit Root Test data: maxTemp - minTemp Dickey-Fuller Z(alpha) = -800.69, Truncation lag parameter = 7, p-value = 0.01 alternative hypothesis: stationary
结论:三个数列均不平稳。
# Ljung-Box白噪声检验
Box.test(minTemp, type='Ljung-Box'); cat('=================================')
Box.test(maxTemp, type='Ljung-Box'); cat('=================================')
Box.test(maxTemp-minTemp, type='Ljung-Box')
Box-Ljung test data: minTemp X-squared = 1235.9, df = 1, p-value < 2.2e-16
=================================
Box-Ljung test data: maxTemp X-squared = 1151, df = 1, p-value < 2.2e-16
=================================
Box-Ljung test data: maxTemp - minTemp X-squared = 330.17, df = 1, p-value < 2.2e-16
结论:三个数据均非白噪声,可以进行时间序列分析。
数据具有明显的周期性,base-R原生的分解方法过于简单,令人不敢恭维。
针对传统的SARIMA模型,后继提出了X11分解法、SEATS分解法与以及STL分解法等多种方法,在R的包中均有实现。下介绍该包的一些可乘性方法:
X11:能够补全首位数值,季节因子也能够随着时间变化而发生缓慢变化,能够处理节假日等突发事件,既能够用可加模式也能够用可乘模式,分解过程全自动化而且非常稳健不怕离群值。该方法在seasonal
包中由seas()
函数有实现,需要指定参数x11=""
。
SEATS:只能够对季节尺度和月尺度的时间序列进行处理,与X11差别不大,对seas()
使用默认参数即可。
STL:对季节因子和趋势因子都进行了局部多项式回归(非参数统计方法)的算法,stl()
函数,一般而言这是最优解,因为:
SEATS和X11只能解决季节和月尺度的时间序列,STL没有任何限制;
季节因子可以随时间变化而变化,变化速率可以用户自定义;
趋势因子可以用户自定义;
对离群值不敏感,不过这样可能会让局部残差变大。
也有缺点,如只能进行可加性分解,无法根据交易日进行调整。不过如果要用可乘性的分解,其实可以先将序列进行对数化运算,然后再做STL,最后再反推回去即可。
stlf()
、mstl()
函数。和stl()
一样隶属forecast
包。library(seasonal)
library(lubridate)
library(scales)
maxT <- ts(maxTemp[1:round(length(maxTemp)*0.9)], start = df$日期[1], frequency = 365)
minT <- ts(minTemp[1:round(length(maxTemp)*0.9)], start = df$日期[1], frequency = 365)
Warning message: "程辑包'seasonal'是用R版本4.2.3 来建造的" 载入程辑包:'seasonal' The following object is masked from 'package:tibble': view 载入程辑包:'lubridate' The following objects are masked from 'package:base': date, intersect, setdiff, union 载入程辑包:'scales' The following object is masked from 'package:purrr': discard The following object is masked from 'package:readr': col_factor
summary(maxT)
Min. 1st Qu. Median Mean 3rd Qu. Max. 7 23 28 27 32 38
forec_1 <- stlf(maxT, method='naive')
plotARIMA <- function(data) {
autoplot(data, facets = FALSE) +
theme_minimal() +
theme(
plot.title = element_text(size = 16, face = 'bold', hjust = 0.5, margin = margin(b = 20)),
plot.subtitle = element_text(size = 14, hjust = 0.5, margin = margin(b = 10)),
axis.title.x = element_text(size = 12, margin = margin(t = 10)),
axis.title.y = element_text(size = 12, margin = margin(r = 10)),
axis.text = element_text(size = 10),
legend.title = element_blank(),
legend.text = element_text(size = 10),
legend.position = 'bottom',
legend.key.size = unit(0.8, 'cm'),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_rect(fill = 'white')
) +
ggtitle('Temperature Forecast') +
theme(plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 0.5))
}
plotARIMA(forec_1)
ggsave('ARIMA_10.svg', device = 'svg')
maxTts <- maxT %>% mstl(robust=TRUE)
autoplot(maxTts) +
theme_minimal() +
theme(
plot.title = element_text(size = 16, face = 'bold', hjust = 0.5, margin = margin(b = 20)),
plot.subtitle = element_text(size = 14, hjust = 0.5, margin = margin(b = 10)),
axis.title.x = element_text(size = 12, margin = margin(t = 10)),
axis.title.y = element_text(size = 12, margin = margin(r = 10)),
axis.text = element_text(size = 10),
legend.title = element_blank(),
legend.text = element_text(size = 10),
legend.position = 'bottom',
legend.key.size = unit(0.8, 'cm'),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_rect(fill = 'white'),
) +
ggtitle('Max Temperature Time Series')
ggsave('ARIMA_11.svg', device = 'svg')
summary(maxTts)
Data Trend Seasonal365 Remainder Min. : 7 Min. :27.12 Min. :-18.18260 Min. :-14.71321 1st Qu.:23 1st Qu.:27.44 1st Qu.: -4.54282 1st Qu.: -1.52788 Median :28 Median :27.91 Median : -0.05354 Median : -0.06665 Mean :27 Mean :27.77 Mean : -0.43365 Mean : -0.33344 3rd Qu.:32 3rd Qu.:28.07 3rd Qu.: 4.65950 3rd Qu.: 1.21704 Max. :38 Max. :28.25 Max. : 8.33335 Max. : 14.83017
forec_2 <- forecast(maxTts, h=round(length(maxT)*0.4))
plotARIMA(forec_2)
ggsave('ARIMA_12.svg', device = 'svg')
pred_ <- data.frame(
subset(tibble(rownames_to_column(data.frame(forec_2), var='Time'))[1:round(length(maxT)*0.1),], select = -Time),
Time = as.POSIXct(df$日期[round(length(maxT)*0.9+1):length(maxT)]),
temp = maxT[round(length(maxT)*0.9+1):length(maxT)]
)
ggplot(data = pred_, aes(x = Time)) +
geom_line(aes(y = Point.Forecast), color = '#0072B2', linewidth = 1.5, group = 1) +
geom_ribbon(aes(ymin = Lo.80, ymax = Hi.80, group = 1), fill = '#E69F00', alpha = 0.4) +
geom_point(aes(y = temp), color = '#D55E00', size = 3, shape = 21, fill = '#D55E00') +
labs(x = 'Time', y = 'Temperature') +
scale_x_datetime(labels = date_format('%b %Y'), breaks = date_breaks('1 month')) +
scale_y_continuous(limits = c(min(pred_$Lo.80), max(pred_$Hi.80)), expand = c(0, 0.05)) +
theme_minimal() +
theme(
plot.title = element_text(size = 16, face = 'bold', hjust = 0.5, margin = margin(b = 20)),
plot.subtitle = element_text(size = 14, hjust = 0.5, margin = margin(b = 10)),
axis.title.x = element_text(size = 12, margin = margin(t = 10)),
axis.title.y = element_text(size = 12, margin = margin(r = 10)),
axis.text = element_text(size = 10),
legend.title = element_blank(),
legend.text = element_text(size = 10),
legend.position = 'bottom',
legend.key.size = unit(0.8, 'cm'),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_rect(fill = 'white')
) +
annotate('text', x = max(pred_$Time), y = max(pred_$Hi.80),
label = 'Confidence Interval', hjust = 1, vjust = -0.5, size = 4, color = '#E69F00') +
annotate('text', x = max(pred_$Time), y = min(pred_$Lo.80),
label = 'Confidence Interval', hjust = 1, vjust = 2, size = 4, color = '#E69F00') +
ggtitle('Temperature Prediction') +
theme(plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 0.5))
ggsave('ARIMA_13.svg', device = 'svg')
以MSE作为评估准则,残差可视化
MSEseries <- data.frame(
Time = pred_$Time,
Value = (pred_$Point.Forecast - pred_$temp)^2/length(pred_$temp)
)
MSEseries$Time <- as.Date(data$Time)
ggplot(MSEseries, aes(x = Time, y = Value)) +
geom_line(color = '#00CED1', size = 1.5) +
geom_point(color = '#0000CD') +
scale_x_date(labels = date_format('%b %Y'), breaks = date_breaks('1 month')) +
labs(x = 'Date', y = 'MSE', title = 'Error Visualization') +
theme_minimal() +
theme(
plot.title = element_text(size = 22, face = 'bold'),
axis.text.x = element_text(size = 9, angle = 45, hjust = 1),
axis.text.y = element_text(size = 12),
axis.title.x = element_text(size = 17),
axis.title.y = element_text(size = 17)
)
ggsave('SARIMA_14.svg', device = 'svg')