suppressPackageStartupMessages(library(tidyverse))
library(maltese)
library(neuralnet)
suppressPackageStartupMessages(library(dummy))
In this example, we’ll forecast daily pageviews of the English Wikipedia article on R. The dataset was acquired from the Wikimedia Foundation’s Pageviews API using the pageviews R package.
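For reference, comparable data can be pulled straight from the API. The sketch below is a guess at such a call (the exact one used to build the bundled dataset isn’t shown here, and pageviews’ argument defaults vary between versions):
# Untested sketch of fetching the raw data with the pageviews package;
# the date range and output column names are assumptions:
library(pageviews)
raw_pageviews <- article_pageviews(
  project = "en.wikipedia", article = "R_(programming_language)",
  start = "2015100100", end = "2017051600", granularity = "daily"
)
head(raw_pageviews[, c("date", "views")])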
data("r_enwiki", package = "maltese")
head(r_enwiki)
## date pageviews
## 1 2015-10-01 3072
## 2 2015-10-02 2575
## 3 2015-10-03 1431
## 4 2015-10-04 1540
## 5 2015-10-05 3041
## 6 2015-10-06 3695
ggplot(r_enwiki, aes(x = date, y = pageviews)) +
  geom_line() +
  theme_minimal() +
  labs(x = "Date", y = "Pageviews",
       title = "https://en.wikipedia.org/wiki/R_(programming_language) pageviews")
You’ll want to use some sort of normalization on your outcome, because neural networks tend to run into trouble when the target is on a large scale. I certainly did when I kept the pageviews in the thousands.
# How much of the data to use for training vs. testing:
split_point <- "2017-02-01"
table(ifelse(r_enwiki$date < split_point, "training set", "testing set"))
##
## testing set training set
## 105 489
# Save the training-set statistics for later use (when converting predictions
# back to the original scale); computing them on the training set only avoids
# leaking information from the test set:
normalization_constants <- lapply(
  list(median = median, mad = mad, mean = mean, std.dev = sd),
  do.call, args = list(x = r_enwiki$pageviews[r_enwiki$date < split_point])
)
r_enwiki$normalized <- (r_enwiki$pageviews - normalization_constants$mean) / normalization_constants$std.dev
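The list also stores the training-set median and MAD, in case an outlier-robust scaling is preferred. Since we’ll be undoing the z-score transform in several places below, here is the inverse written out as a small helper (purely illustrative; the code that follows keeps the arithmetic inline):
# Hypothetical helper that inverts the z-score normalization above:
denormalize <- function(x, constants = normalization_constants) {
  (x * constants$std.dev) + constants$mean
}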
Next, we use maltese to reshape the univariate time series into an ML-friendly dataset in which each row includes the previous 21 values (p = 21) as lag features, along with calendar extras such as weekday and month:
mlts <- mlts_transform(
  r_enwiki, date, normalized, p = 21,
  extras = TRUE, extrasAsFactors = TRUE, granularity = "day"
)
str(mlts)
## 'data.frame': 573 obs. of 28 variables:
## $ dt : POSIXct, format: "2015-10-22" "2015-10-23" ...
## $ y : num 0.11557 -0.00226 -1.34961 -1.18311 0.77644 ...
## $ mlts_extras_monthday: Ord.factor w/ 31 levels "1"<"2"<"3"<"4"<..: 22 23 24 25 26 27 28 29 30 31 ...
## $ mlts_extras_weekday : Ord.factor w/ 7 levels "Monday"<"Tuesday"<..: 4 5 6 7 1 2 3 4 5 6 ...
## $ mlts_extras_week : Ord.factor w/ 53 levels "1"<"2"<"3"<"4"<..: 43 43 43 43 43 43 43 44 44 44 ...
## $ mlts_extras_month : Ord.factor w/ 12 levels "January"<"February"<..: 10 10 10 10 10 10 10 10 10 10 ...
## $ mlts_extras_year : Ord.factor w/ 3 levels "2015"<"2016"<..: 1 1 1 1 1 1 1 1 1 1 ...
## $ mlts_lag_1 : num 0.44729 0.11557 -0.00226 -1.34961 -1.18311 ...
## $ mlts_lag_2 : num 0.66246 0.44729 0.11557 -0.00226 -1.34961 ...
## $ mlts_lag_3 : num 0.73802 0.66246 0.44729 0.11557 -0.00226 ...
## $ mlts_lag_4 : num -1.09 0.738 0.662 0.447 0.116 ...
## $ mlts_lag_5 : num -1.207 -1.09 0.738 0.662 0.447 ...
## $ mlts_lag_6 : num 0.459 -1.207 -1.09 0.738 0.662 ...
## $ mlts_lag_7 : num 0.793 0.459 -1.207 -1.09 0.738 ...
## $ mlts_lag_8 : num 1.024 0.793 0.459 -1.207 -1.09 ...
## $ mlts_lag_9 : num 0.937 1.024 0.793 0.459 -1.207 ...
## $ mlts_lag_10 : num 0.661 0.937 1.024 0.793 0.459 ...
## $ mlts_lag_11 : num -0.784 0.661 0.937 1.024 0.793 ...
## $ mlts_lag_12 : num -1.052 -0.784 0.661 0.937 1.024 ...
## $ mlts_lag_13 : num 0.477 -1.052 -0.784 0.661 0.937 ...
## $ mlts_lag_14 : num 0.979 0.477 -1.052 -0.784 0.661 ...
## $ mlts_lag_15 : num 1.116 0.979 0.477 -1.052 -0.784 ...
## $ mlts_lag_16 : num 1.513 1.116 0.979 0.477 -1.052 ...
## $ mlts_lag_17 : num 0.675 1.513 1.116 0.979 0.477 ...
## $ mlts_lag_18 : num -1.247 0.675 1.513 1.116 0.979 ...
## $ mlts_lag_19 : num -1.387 -1.247 0.675 1.513 1.116 ...
## $ mlts_lag_20 : num 0.0784 -1.3867 -1.2471 0.6753 1.5129 ...
## $ mlts_lag_21 : num 0.715 0.0784 -1.3867 -1.2471 0.6753 ...
# Convert factors to dummy variables because neuralnet only supports numeric features:
mlts_categories <- categories(
  mlts[, c("mlts_extras_weekday", "mlts_extras_month", "mlts_extras_monthday", "mlts_extras_week"), drop = FALSE]
)
mlts_dummied <- cbind(mlts, dummy(
  mlts[, c("mlts_extras_weekday", "mlts_extras_month", "mlts_extras_monthday", "mlts_extras_week"), drop = FALSE],
  object = mlts_categories, int = TRUE
))
str(mlts_dummied, list.len = 30)
## 'data.frame': 573 obs. of 131 variables:
## $ dt : POSIXct, format: "2015-10-22" "2015-10-23" ...
## $ y : num 0.11557 -0.00226 -1.34961 -1.18311 0.77644 ...
## $ mlts_extras_monthday : Ord.factor w/ 31 levels "1"<"2"<"3"<"4"<..: 22 23 24 25 26 27 28 29 30 31 ...
## $ mlts_extras_weekday : Ord.factor w/ 7 levels "Monday"<"Tuesday"<..: 4 5 6 7 1 2 3 4 5 6 ...
## $ mlts_extras_week : Ord.factor w/ 53 levels "1"<"2"<"3"<"4"<..: 43 43 43 43 43 43 43 44 44 44 ...
## $ mlts_extras_month : Ord.factor w/ 12 levels "January"<"February"<..: 10 10 10 10 10 10 10 10 10 10 ...
## $ mlts_extras_year : Ord.factor w/ 3 levels "2015"<"2016"<..: 1 1 1 1 1 1 1 1 1 1 ...
## $ mlts_lag_1 : num 0.44729 0.11557 -0.00226 -1.34961 -1.18311 ...
## $ mlts_lag_2 : num 0.66246 0.44729 0.11557 -0.00226 -1.34961 ...
## $ mlts_lag_3 : num 0.73802 0.66246 0.44729 0.11557 -0.00226 ...
## $ mlts_lag_4 : num -1.09 0.738 0.662 0.447 0.116 ...
## $ mlts_lag_5 : num -1.207 -1.09 0.738 0.662 0.447 ...
## $ mlts_lag_6 : num 0.459 -1.207 -1.09 0.738 0.662 ...
## $ mlts_lag_7 : num 0.793 0.459 -1.207 -1.09 0.738 ...
## $ mlts_lag_8 : num 1.024 0.793 0.459 -1.207 -1.09 ...
## $ mlts_lag_9 : num 0.937 1.024 0.793 0.459 -1.207 ...
## $ mlts_lag_10 : num 0.661 0.937 1.024 0.793 0.459 ...
## $ mlts_lag_11 : num -0.784 0.661 0.937 1.024 0.793 ...
## $ mlts_lag_12 : num -1.052 -0.784 0.661 0.937 1.024 ...
## $ mlts_lag_13 : num 0.477 -1.052 -0.784 0.661 0.937 ...
## $ mlts_lag_14 : num 0.979 0.477 -1.052 -0.784 0.661 ...
## $ mlts_lag_15 : num 1.116 0.979 0.477 -1.052 -0.784 ...
## $ mlts_lag_16 : num 1.513 1.116 0.979 0.477 -1.052 ...
## $ mlts_lag_17 : num 0.675 1.513 1.116 0.979 0.477 ...
## $ mlts_lag_18 : num -1.247 0.675 1.513 1.116 0.979 ...
## $ mlts_lag_19 : num -1.387 -1.247 0.675 1.513 1.116 ...
## $ mlts_lag_20 : num 0.0784 -1.3867 -1.2471 0.6753 1.5129 ...
## $ mlts_lag_21 : num 0.715 0.0784 -1.3867 -1.2471 0.6753 ...
## $ mlts_extras_weekday_Monday : int 0 0 0 0 1 0 0 0 0 0 ...
## $ mlts_extras_weekday_Tuesday : int 0 0 0 0 0 1 0 0 0 0 ...
## [list output truncated]
# Split:
training_idx <- which(mlts_dummied$dt < split_point)
testing_idx <- which(mlts_dummied$dt >= split_point)
# neuralnet does not support the "y ~ ." formula syntax, so we build the formula programmatically:
nn_features <- grep("(mlts_lag_[0-9]+)|(mlts_extras_((weekday)|(month)|(monthday)|(week))_.*)", names(mlts_dummied), value = TRUE)
nn_formula <- as.formula(paste("y ~", paste(nn_features, collapse = " + ")))
# Train:
set.seed(0)
nn_model <- neuralnet(
  nn_formula, mlts_dummied[training_idx, c("y", nn_features)],
  linear.output = TRUE, hidden = c(9, 7, 5), algorithm = "sag"
)
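Training can occasionally stop short of convergence; one quick check is the final error and step count recorded in the model’s result matrix (a sketch, assuming a single repetition):
# Sanity check on training: final error and number of steps taken
# (result.matrix also stores the fitted weights):
nn_model$result.matrix[c("error", "steps"), , drop = FALSE]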
# Predict:
nn_predictions <- as.numeric(
  neuralnet::compute(nn_model, mlts_dummied[testing_idx, nn_features])$net.result
)
# Re-scale:
predictions <- data.frame(
  date = mlts_dummied$dt[testing_idx],
  normalized = nn_predictions,
  denormalized = (nn_predictions * normalization_constants$std.dev) + normalization_constants$mean
)
Let’s see how our simple neural network performed:
ggplot(dplyr::filter(r_enwiki, date >= "2016-10-01"),
       aes(x = date, y = pageviews)) +
  geom_line() +
  geom_line(aes(y = denormalized), color = "red", data = predictions) +
  theme_minimal() +
  labs(x = "Date", y = "Pageviews",
       title = "Forecast of https://en.wikipedia.org/wiki/R_(programming_language) pageviews",
       subtitle = "Red is for predictions made with a neural network with 3 hidden layers containing 9, 7, and 5 neurons, respectively")
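The plot gives a visual sense of the fit; for a numeric summary we can compare the denormalized predictions against the held-out pageviews. A quick sketch (the dates line up one-to-one here, since the lag transform only dropped the earliest rows):
# RMSE and MAE of the forecasts on the original (pageviews) scale:
actual <- r_enwiki$pageviews[r_enwiki$date >= split_point]
errors <- actual - predictions$denormalized
c(RMSE = sqrt(mean(errors^2)), MAE = mean(abs(errors)))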
Version 0.1.3 of the dummy package has a sapply-related bug when the provided data.frame has only one row. The maintainer has been notified and said they will fix it. For now, we work around it by including an extra row that we then throw away:
new_data <- rbind(
  tail(r_enwiki, 22),
  data.frame(
    date = as.Date("2017-05-17"),
    pageviews = NA,
    normalized = NA
  )
)
# After the transform, new_mlts will have two rows, only the second of which we actually care about
new_mlts <- mlts_transform(
  new_data, date, normalized, p = 21,
  extras = TRUE, extrasAsFactors = TRUE, granularity = "day"
)
new_mlts <- cbind(
  new_mlts[-1, ], # don't need to forecast the known outcome
  dummy(
    new_mlts[, c("mlts_extras_weekday", "mlts_extras_month", "mlts_extras_monthday", "mlts_extras_week"), drop = FALSE],
    object = mlts_categories, int = TRUE
  )[-1, ]
)
# Forecast on the normalized scale:
nn_forecast <- as.numeric(
  neuralnet::compute(nn_model, new_mlts[, nn_features])$net.result
)
# Re-scale forecast to original scale:
(nn_forecast * normalization_constants$std.dev) + normalization_constants$mean
## [1] 3089.06325
We could turn the code above into a loop that feeds each forecast back in as an input to produce the next one, i.e. recursive multi-step forecasting:
new_data <- rbind(
  tail(r_enwiki, 22),
  data.frame(
    date = seq(as.Date("2017-05-17"), as.Date("2017-05-17") + 90, by = "day"),
    pageviews = NA,
    normalized = NA
  )
)
rownames(new_data) <- NULL
for (d in 23:nrow(new_data)) {
  new_mlts <- mlts_transform(
    new_data[(d - 22):d, ],
    date, normalized, p = 21,
    extras = TRUE, extrasAsFactors = TRUE,
    granularity = "day"
  )
  new_mlts <- cbind(
    new_mlts[-1, ], # don't need to forecast the known outcome
    dummy(
      new_mlts[, c("mlts_extras_weekday", "mlts_extras_month", "mlts_extras_monthday", "mlts_extras_week"), drop = FALSE],
      object = mlts_categories, int = TRUE
    )[-1, ]
  )
  new_data$normalized[d] <- as.numeric(
    neuralnet::compute(nn_model, new_mlts[, nn_features])$net.result
  )
}
new_data$pageviews <- (new_data$normalized * normalization_constants$std.dev) + normalization_constants$mean
ggplot(dplyr::filter(r_enwiki, date >= "2016-10-01"),
       aes(x = date, y = pageviews)) +
  geom_line() +
  geom_line(color = "red", data = dplyr::filter(new_data, date >= "2017-05-16")) +
  theme_minimal() +
  labs(x = "Date", y = "Pageviews",
       title = "90-day forecast of https://en.wikipedia.org/wiki/R_(programming_language) pageviews",
       subtitle = "Red is for predictions made with a neural network with 3 hidden layers containing 9, 7, and 5 neurons, respectively")
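If we found ourselves re-running this kind of forecast often, the recursive loop could be wrapped up in a reusable function. Below is an illustrative sketch; forecast_recursive is a hypothetical helper (not part of maltese) that assumes the same column names, model, and dummy-coding objects as above:
# Hypothetical wrapper around the recursive forecasting loop above:
forecast_recursive <- function(history, horizon, model, categories, features,
                               constants = normalization_constants, p = 21) {
  # Seed the future data.frame with the last p + 1 known observations:
  future <- rbind(
    tail(history, p + 1),
    data.frame(
      date = seq(max(history$date) + 1, by = "day", length.out = horizon),
      pageviews = NA, normalized = NA
    )
  )
  rownames(future) <- NULL
  for (d in (p + 2):nrow(future)) {
    step <- mlts_transform(
      future[(d - p - 1):d, ], date, normalized, p = p,
      extras = TRUE, extrasAsFactors = TRUE, granularity = "day"
    )
    step <- cbind(
      step[-1, ],
      dummy(
        step[, c("mlts_extras_weekday", "mlts_extras_month", "mlts_extras_monthday", "mlts_extras_week"), drop = FALSE],
        object = categories, int = TRUE
      )[-1, ]
    )
    future$normalized[d] <- as.numeric(
      neuralnet::compute(model, step[, features])$net.result
    )
  }
  future$pageviews <- (future$normalized * constants$std.dev) + constants$mean
  future[-seq_len(p + 1), ] # return only the forecasted rows
}
# Equivalent to the loop above:
# forecast_recursive(r_enwiki, horizon = 91, model = nn_model,
#                    categories = mlts_categories, features = nn_features)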