Forecasting with neural networks via neuralnet package

Mikhail Popov

2017-05-17

Packages

suppressPackageStartupMessages(library(tidyverse))
library(maltese)
library(neuralnet)
suppressPackageStartupMessages(library(dummy))

Data

In this example, we’ll be forecasting pageviews of the English Wikipedia article about R. The dataset was acquired using the Wikimedia Foundation’s Pageviews API and the pageviews R package.
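
Fetching similar data yourself would look roughly like this (a sketch, not the original acquisition code; the start/end timestamps and the returned column names are assumptions about the pageviews package interface):

# Sketch (not the original acquisition code); start/end timestamps are assumed
r_pageviews <- pageviews::article_pageviews(
  project = "en.wikipedia", article = "R_(programming_language)",
  start = "2015100100", end = "2017051600"
)
# r_pageviews$date and r_pageviews$views correspond to the date and pageviews columns used below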

data("r_enwiki", package = "maltese")
head(r_enwiki)
##         date pageviews
## 1 2015-10-01      3072
## 2 2015-10-02      2575
## 3 2015-10-03      1431
## 4 2015-10-04      1540
## 5 2015-10-05      3041
## 6 2015-10-06      3695
ggplot(r_enwiki, aes(x = date, y = pageviews)) +
  geom_line() +
  theme_minimal() +
  labs(x = "Date", y = "Pageviews",
       title = "<https://en.wikipedia.org/wiki/R_(programming_language)> pageviews")

Normalization

You’ll want to use some sort of normalization with your outcome because there’s a good chance you’ll run into problems (such as the network failing to converge) if you don’t. I did when I kept the pageviews on their original scale, in the thousands.

# Date used to split the data into training and testing sets:
split_point <- "2017-02-01"
table(ifelse(r_enwiki$date < split_point, "training set", "testing set"))
## 
##  testing set training set 
##          105          489
# Save for later use (when converting back to original scale):
normalization_constants <- lapply(
  list(median = median, mad = mad, mean = mean, std.dev = sd),
  do.call, args = list(x = r_enwiki$pageviews[r_enwiki$date < split_point])
)
r_enwiki$normalized <- (r_enwiki$pageviews - normalization_constants$mean)/normalization_constants$std.dev
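
The constants above also include the median and MAD, so an outlier-resistant scaling is available from the same objects if the mean/standard-deviation version proves too sensitive to traffic spikes (a sketch, not used in the rest of this example; robust_normalized is just an illustrative name):

# Sketch of a robust alternative (not used downstream): center by the training-set
# median and scale by the training-set MAD
robust_normalized <- (r_enwiki$pageviews - normalization_constants$median) /
  normalization_constants$mad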

Feature Space

mlts <- mlts_transform(r_enwiki, date, normalized, p = 21, extras = TRUE, extrasAsFactors = TRUE, granularity = "day")
str(mlts)
## 'data.frame':    573 obs. of  28 variables:
##  $ dt                  : POSIXct, format: "2015-10-22" "2015-10-23" ...
##  $ y                   : num  0.11557 -0.00226 -1.34961 -1.18311 0.77644 ...
##  $ mlts_extras_monthday: Ord.factor w/ 31 levels "1"<"2"<"3"<"4"<..: 22 23 24 25 26 27 28 29 30 31 ...
##  $ mlts_extras_weekday : Ord.factor w/ 7 levels "Monday"<"Tuesday"<..: 4 5 6 7 1 2 3 4 5 6 ...
##  $ mlts_extras_week    : Ord.factor w/ 53 levels "1"<"2"<"3"<"4"<..: 43 43 43 43 43 43 43 44 44 44 ...
##  $ mlts_extras_month   : Ord.factor w/ 12 levels "January"<"February"<..: 10 10 10 10 10 10 10 10 10 10 ...
##  $ mlts_extras_year    : Ord.factor w/ 3 levels "2015"<"2016"<..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ mlts_lag_1          : num  0.44729 0.11557 -0.00226 -1.34961 -1.18311 ...
##  $ mlts_lag_2          : num  0.66246 0.44729 0.11557 -0.00226 -1.34961 ...
##  $ mlts_lag_3          : num  0.73802 0.66246 0.44729 0.11557 -0.00226 ...
##  $ mlts_lag_4          : num  -1.09 0.738 0.662 0.447 0.116 ...
##  $ mlts_lag_5          : num  -1.207 -1.09 0.738 0.662 0.447 ...
##  $ mlts_lag_6          : num  0.459 -1.207 -1.09 0.738 0.662 ...
##  $ mlts_lag_7          : num  0.793 0.459 -1.207 -1.09 0.738 ...
##  $ mlts_lag_8          : num  1.024 0.793 0.459 -1.207 -1.09 ...
##  $ mlts_lag_9          : num  0.937 1.024 0.793 0.459 -1.207 ...
##  $ mlts_lag_10         : num  0.661 0.937 1.024 0.793 0.459 ...
##  $ mlts_lag_11         : num  -0.784 0.661 0.937 1.024 0.793 ...
##  $ mlts_lag_12         : num  -1.052 -0.784 0.661 0.937 1.024 ...
##  $ mlts_lag_13         : num  0.477 -1.052 -0.784 0.661 0.937 ...
##  $ mlts_lag_14         : num  0.979 0.477 -1.052 -0.784 0.661 ...
##  $ mlts_lag_15         : num  1.116 0.979 0.477 -1.052 -0.784 ...
##  $ mlts_lag_16         : num  1.513 1.116 0.979 0.477 -1.052 ...
##  $ mlts_lag_17         : num  0.675 1.513 1.116 0.979 0.477 ...
##  $ mlts_lag_18         : num  -1.247 0.675 1.513 1.116 0.979 ...
##  $ mlts_lag_19         : num  -1.387 -1.247 0.675 1.513 1.116 ...
##  $ mlts_lag_20         : num  0.0784 -1.3867 -1.2471 0.6753 1.5129 ...
##  $ mlts_lag_21         : num  0.715 0.0784 -1.3867 -1.2471 0.6753 ...
# Convert factors to dummy variables because neuralnet only supports numeric features:
mlts_categories <- categories(mlts[, c("mlts_extras_weekday", "mlts_extras_month", "mlts_extras_monthday", "mlts_extras_week"), drop = FALSE])
mlts_dummied <- cbind(mlts, dummy(
  mlts[, c("mlts_extras_weekday", "mlts_extras_month", "mlts_extras_monthday", "mlts_extras_week"),
       drop = FALSE],
  object = mlts_categories, int = TRUE
))
str(mlts_dummied, list.len = 30)
## 'data.frame':    573 obs. of  131 variables:
##  $ dt                           : POSIXct, format: "2015-10-22" "2015-10-23" ...
##  $ y                            : num  0.11557 -0.00226 -1.34961 -1.18311 0.77644 ...
##  $ mlts_extras_monthday         : Ord.factor w/ 31 levels "1"<"2"<"3"<"4"<..: 22 23 24 25 26 27 28 29 30 31 ...
##  $ mlts_extras_weekday          : Ord.factor w/ 7 levels "Monday"<"Tuesday"<..: 4 5 6 7 1 2 3 4 5 6 ...
##  $ mlts_extras_week             : Ord.factor w/ 53 levels "1"<"2"<"3"<"4"<..: 43 43 43 43 43 43 43 44 44 44 ...
##  $ mlts_extras_month            : Ord.factor w/ 12 levels "January"<"February"<..: 10 10 10 10 10 10 10 10 10 10 ...
##  $ mlts_extras_year             : Ord.factor w/ 3 levels "2015"<"2016"<..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ mlts_lag_1                   : num  0.44729 0.11557 -0.00226 -1.34961 -1.18311 ...
##  $ mlts_lag_2                   : num  0.66246 0.44729 0.11557 -0.00226 -1.34961 ...
##  $ mlts_lag_3                   : num  0.73802 0.66246 0.44729 0.11557 -0.00226 ...
##  $ mlts_lag_4                   : num  -1.09 0.738 0.662 0.447 0.116 ...
##  $ mlts_lag_5                   : num  -1.207 -1.09 0.738 0.662 0.447 ...
##  $ mlts_lag_6                   : num  0.459 -1.207 -1.09 0.738 0.662 ...
##  $ mlts_lag_7                   : num  0.793 0.459 -1.207 -1.09 0.738 ...
##  $ mlts_lag_8                   : num  1.024 0.793 0.459 -1.207 -1.09 ...
##  $ mlts_lag_9                   : num  0.937 1.024 0.793 0.459 -1.207 ...
##  $ mlts_lag_10                  : num  0.661 0.937 1.024 0.793 0.459 ...
##  $ mlts_lag_11                  : num  -0.784 0.661 0.937 1.024 0.793 ...
##  $ mlts_lag_12                  : num  -1.052 -0.784 0.661 0.937 1.024 ...
##  $ mlts_lag_13                  : num  0.477 -1.052 -0.784 0.661 0.937 ...
##  $ mlts_lag_14                  : num  0.979 0.477 -1.052 -0.784 0.661 ...
##  $ mlts_lag_15                  : num  1.116 0.979 0.477 -1.052 -0.784 ...
##  $ mlts_lag_16                  : num  1.513 1.116 0.979 0.477 -1.052 ...
##  $ mlts_lag_17                  : num  0.675 1.513 1.116 0.979 0.477 ...
##  $ mlts_lag_18                  : num  -1.247 0.675 1.513 1.116 0.979 ...
##  $ mlts_lag_19                  : num  -1.387 -1.247 0.675 1.513 1.116 ...
##  $ mlts_lag_20                  : num  0.0784 -1.3867 -1.2471 0.6753 1.5129 ...
##  $ mlts_lag_21                  : num  0.715 0.0784 -1.3867 -1.2471 0.6753 ...
##  $ mlts_extras_weekday_Monday   : int  0 0 0 0 1 0 0 0 0 0 ...
##  $ mlts_extras_weekday_Tuesday  : int  0 0 0 0 0 1 0 0 0 0 ...
##   [list output truncated]

Training

# Split:
training_idx <- which(mlts_dummied$dt < split_point)
testing_idx <- which(mlts_dummied$dt >= split_point)
# neuralnet does not support the "y ~ ." formula syntax, so we cheat:
nn_features <- grep("(mlts_lag_[0-9]+)|(mlts_extras_((weekday)|(month)|(monthday)|(week))_.*)", names(mlts_dummied), value = TRUE)
nn_formula <- as.formula(paste("y ~", paste(nn_features, collapse = " + ")))

# Train:
set.seed(0)
nn_model <- neuralnet(
  nn_formula, mlts_dummied[training_idx, c("y", nn_features)],
  linear.output = TRUE, hidden = c(9, 7, 5), algorithm = "sag"
)
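
Before predicting, it’s worth confirming that training actually converged. One quick check (a sketch, not part of the original post) is the top of the result matrix, which records the final error, the threshold reached, and the number of training steps; plot(nn_model) would draw the whole network, though it is hard to read with this many inputs.

# Sketch: convergence diagnostics stored by neuralnet
nn_model$result.matrix[c("error", "reached.threshold", "steps"), , drop = FALSE]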

Assessment

# Predict:
nn_predictions <- as.numeric(neuralnet::compute(
  nn_model, mlts_dummied[testing_idx, nn_features])$net.result
)
# Re-scale:
predictions <- data.frame(
  date = mlts_dummied$dt[testing_idx],
  normalized = nn_predictions,
  denormalized = (nn_predictions * normalization_constants$std.dev) + normalization_constants$mean
)

Let’s see how our simple neural network performed:

ggplot(dplyr::filter(r_enwiki, date >= "2016-10-01"),
       aes(x = date, y = pageviews)) +
  geom_line() +
  geom_line(aes(y = denormalized), color = "red",
            data = predictions) +
  theme_minimal() +
  labs(x = "Date", y = "Pageviews",
       title = "Forecast of <https://en.wikipedia.org/wiki/R_(programming_language)> pageviews",
       subtitle = "Red is for predictions made with a neural network with 3 layers containing 9, 7, and 5 hidden neurons, respectively")

Forecasting

One Forecast

Version 0.1.3 of the dummy package has a sapply-related bug when the provided data.frame has only 1 row. The maintainer has been notified and has said they will fix it. For now, we have to use the following “hack”: include an extra row that we then throw away:

new_data <- rbind(
  tail(r_enwiki, 22),
  data.frame(
    date = as.Date("2017-05-17"),
    pageviews = NA,
    normalized = NA
  )
)
# new_mlts will have two rows, only the second of which we actually care about
new_mlts <- mlts_transform(new_data, date, normalized, p = 21, extras = TRUE, extrasAsFactors = TRUE, granularity = "day")
new_mlts <- cbind(
  new_mlts[-1, ], # don't need to forecast known outcome
  dummy(
    new_mlts[, c("mlts_extras_weekday", "mlts_extras_month", "mlts_extras_monthday", "mlts_extras_week"),
             drop = FALSE],
    object = mlts_categories, int = TRUE
  )[-1, ]
)

# Forecast on normalized scale:
nn_forecast <- as.numeric(neuralnet::compute(
  nn_model, new_mlts[, nn_features])$net.result
)

# Re-scale forecast to original scale:
(nn_forecast * normalization_constants$std.dev) + normalization_constants$mean
## [1] 3089.06325

Multiple Forecasts

We can turn the code above into a loop that uses the model to make a forecast and then feeds that forecast back in as an input for making the next one:

new_data <- rbind(
  tail(r_enwiki, 22),
  data.frame(
    date = seq(
      as.Date("2017-05-17"),
      as.Date("2017-05-17") + 90,
      "day"
    ),
    pageviews = NA,
    normalized = NA
  )
); rownames(new_data) <- NULL

for (d in 23:nrow(new_data)) {
  new_mlts <- mlts_transform(
    new_data[(d - 22):d, ],
    date, normalized, p = 21,
    extras = TRUE, extrasAsFactors = TRUE,
    granularity = "day")
  new_mlts <- cbind(
    new_mlts[-1, ], # don't need to forecast known outcome
    dummy(
      new_mlts[, c("mlts_extras_weekday", "mlts_extras_month", "mlts_extras_monthday", "mlts_extras_week"),
               drop = FALSE],
      object = mlts_categories, int = TRUE
    )[-1, ]
  )
  new_data$normalized[d] <- as.numeric(neuralnet::compute(
    nn_model, new_mlts[, nn_features])$net.result
  )
}

new_data$pageviews <- (new_data$normalized * normalization_constants$std.dev) + normalization_constants$mean
ggplot(dplyr::filter(r_enwiki, date >= "2016-10-01"),
       aes(x = date, y = pageviews)) +
  geom_line() +
  geom_line(aes(y = pageviews), color = "red",
            data = dplyr::filter(new_data, date >= "2017-05-16")) +
  theme_minimal() +
  labs(x = "Date", y = "Pageviews",
       title = "90 day forecast of <https://en.wikipedia.org/wiki/R_(programming_language)> pageviews",
       subtitle = "Red is for predictions made with a neural network with 3 layers containing 9, 7, and 5 hidden neurons, respectively")