Merge branch 'enhancement/time-varying-error'

JGCRI · Feb 13, 2024 · 4766d9e · 4766d9e
2 parents 048e4bd + 4238827
commit 4766d9e
Show file tree

Hide file tree

Showing 7 changed files with 127 additions and 65 deletions.
diff --git a/R/RMSE_calc.R b/R/RMSE_calc.R
@@ -5,7 +5,12 @@
 #'
 #' @param x A vector of modeled data values
 #' @param y A vector of observed data values
-#'
+#' @param sigma A single value or vector of error terms the same length as y.
+#' A single value will apply a constant error term. User can provide a vector
+#' of error terms to incorporate time-varying error into the RMSE calculation.
+#' `sigma` default assume homoscedasticity of residuals and applies a constant
+#' error term equal to the standard deviation of observed data (scoring criterion).
+
 #' @return Returns a vector of RMSE values
 #' @export
 #'
@@ -14,14 +19,25 @@
 #' y <- c(5:9)
 #'
 #' RMSE_calc(x, y)
-RMSE_calc <- function(x, y) {
+RMSE_calc <- function(x, y, sigma = sd(y)) {
   # Check if all values are NA
   if (all(is.na(x)) || all(is.na(y))) {
     return(NA)
   }
 
-  # compute RMSE
-  rmse_vals <- sqrt(mean((x - y)^2))
+  # if user provides sigma values compute RMSE with error.
+  # sigma should be length = 1 or same length as the obs data.
+  if (length(sigma) != 1 && length(sigma) != length(y))
+    stop(
+      paste(
+        "Length of sigma must be a single value or a vector matching the length of observed data = ",
+        length(y),
+        ".",
+        sep = ""
+      )
+    )
+
+  rmse_vals <- sqrt(mean(((x - y) / sigma) ^ 2))
 
   # return a vector of RMSE values
   return(rmse_vals)

diff --git a/R/iterate_model.R b/R/iterate_model.R
@@ -147,16 +147,16 @@ iterate_model <- function(core, params, save_years = NULL, save_vars = NULL) {
       },
       # if Hector crashes because of parameter combinations, send error message
       error = function(e) {
-        message("An error occurred")
+        message("An error occurred", i)
       }
     )
 
     # Create a placeholder dataframe for the run if there's no data collected
     if (length(result_list) < i) {
       dat <- data.frame(
-        scenario = core$name,
+        scenario = rep(core$name, each = length(save_years)),
         year = save_years,
-        variable = save_vars,
+        variable = rep(save_vars, each = length(save_years)),
         value = NA,
         units = NA,
         run_number = i

diff --git a/R/score_bayesian.R b/R/score_bayesian.R
@@ -10,11 +10,17 @@
 #' @param m A Matrix of values. The first column of the matrix should be
 #' a vector of observed data for a give variable. Subsequent vectors should be
 #' representative of modeled values for a given variable.
-#' @param sigma Numeric value (optional). The standard deviation parameter for
-#' the normal distribution used in the Bayesian analysis. If not provided, the
-#' function will automatically compute it as the standard deviation of the
-#' Root Mean Square Error (RMSE). A smaller value of `sigma` will make the
-#' Bayesian analysis give more weight to models with lower RMSE values.
+#' @param sigma Numeric value (optional). A single value or vector of error terms
+#' the same length as y. A single value will apply a constant error term. User
+#' can provide a vector of error terms to incorporate time-varying error
+#' to the RMSE calculation. `sigma` default assume homoscedasticity of residuals
+#' and applies a constant error term equal to the standard deviation of observed
+#' data (scoring criterion).
+#' @param sensitivity A multiplier that adjusts the sensitivity of the likelihood
+#' values to increasing RMSE. If not provided, the function will automatically
+#' calculate the sensitivity as one unit of standard deviation of the RMSE results.
+#' A smaller sensitivity value will make the Bayesian analysis give more weight
+#' to models with lower RMSE values.
 #'
 #' @note Note: In Bayesian statistics, the choice of `sigma` can significantly
 #' impact the results and conclusions of the analysis. Users are encouraged to
@@ -28,25 +34,33 @@
 #'
 #' @examples
 #' # creating sample matrix
-#' mat <- matrix(data = 1:15, nrow = 5, ncol = 3)
+#' mat <- matrix(data = 1:20, nrow = 5, ncol = 4)
 #'
 #' # scoring with a decay rate of 2
-#' score_bayesian(mat, sigma = 2)
-score_bayesian <- function(m, sigma = NULL) {
+#' score_bayesian(mat, sensitivity = 2)
+score_bayesian <- function(m,
+                           sigma = NULL,
+                           sensitivity = NULL) {
   # initialize vector to store RMSE values from loop
-  rmse_vector <- numeric()
+  rmse_vector <- rep(NA_real_, ncol(m))
 
   # Stop execution if number of columns in the matrix is less than 2
   # indicates that there is only one model result stored in matrix
-  stopifnot("More than 2 columns must be included in input matrix" = ncol(m) > 2)
+  stopifnot("More than 3 columns must be included in input matrix" = ncol(m) > 3)
 
   # indicate that observed data are in the first column of the matrix
   obs_data <- m[, 1]
 
   # error if the observed data has no non-NA values
-  if (all(is.na(obs_data))) stop("No non-NA values present in observed data.")
+  if (all(is.na(obs_data)))
+    stop("No non-NA values present in observed data.")
 
-  # loop across columns of the matrix. For each column (i) after col 2
+  # If sigma is not provided, use sd(y) as default
+  if (is.null(sigma)) {
+    sigma <- sd(obs_data)
+  }
+
+  # loop across columns of the matrix. For each column (i) starting with col 2
   for (i in 2:ncol(m)) {
     # indicate modeled data are in subsequent columns
     model_data <- m[, i]
@@ -55,35 +69,48 @@ score_bayesian <- function(m, sigma = NULL) {
     if (all(is.na(model_data))) {
       rmse_vals <- NA # Set RMSE to NA for this column
     } else {
-      rmse_vals <- RMSE_calc(obs_data, model_data)
+      rmse_vals <- RMSE_calc(model_data, obs_data, sigma = sigma)
     }
 
     # vector of RMSE value for each model iteration
     rmse_vector[i] <- rmse_vals
   }
 
-  # Compute sigma if not provided by the user
-  if (is.null(sigma)) {
-    sigma <- sd(obs_data) # Calculate sigma as the standard deviation of RMSE values
-  }
+  # Check if sensitivity is negative, if so throw error
+  if (any(!is.na(sensitivity) &
+          (sensitivity < 0)))
+    stop("Sensitivity cannot be negative.")
 
-  # Check if sigma is negative, if so throw error
-  if (!is.na(sigma) && sigma < 0) warning("sigma value cannot be negative.")
+  # Compute likelihood sensitivity using multiplier provided by the user
+  if (is.null(sensitivity)) {
+    sensitivity_value <-
+      sd(rmse_vector, na.rm = TRUE) # Calculate sensitivity as the standard deviation of the RMSE results
+  } else {
+    if (length(sensitivity) != 1)
+      stop("Sensitivity must be a single value.")
+    sensitivity_value <- sensitivity * sd(rmse_vector, na.rm = TRUE)
+  }
+  if (!is.na(sensitivity_value) && sensitivity_value == 0) {
+    likelihood <- rep(1, length(rmse_vector[-1]))
+    return(likelihood)
+  }
 
   # Compute likelihood using normal distribution likelihood function.
   # This is the probability of observing the modeled data given the
   # observed data.
   # Remove first value when calling rmse_vector (first values should be NA because
   # it represented obs_data)
-  likelihood <- exp(-0.5 * ((rmse_vector[-1]) / sigma)^2)
+  rmse_vector <- rmse_vector[-1]
+  likelihood <- exp(-0.5 * (rmse_vector / sensitivity_value) ^ 2)
 
   # Computing unnormalized posterior scores
   # Currently only computing posterior scores using uniform prior.
   # uniform prior is calculated as 1/length(likelihood) which is
   # the same as 1 / # of runs.
-  posterior <- likelihood * (1 / length(likelihood))
+  # posterior <-
+  #  likelihood * (1 / length(likelihood)) ### DONT THINK THIS IS A NECESSARY STEP MAY JUST WANT TO RETURN "LIKELIHOOD" ###
 
   # Create data frame of results - get run_numbers from the list where RMSE values
   # are computed (names of the split_list components)
-  return(posterior)
+  return(likelihood)
 }
diff --git a/man/RMSE_calc.Rd b/man/RMSE_calc.Rd
diff --git a/man/score_bayesian.Rd b/man/score_bayesian.Rd
diff --git a/tests/testthat/test-RMSE_calc.R b/tests/testthat/test-RMSE_calc.R
@@ -7,11 +7,21 @@ test_that("NAs passed to function are treated correctly", {
 # test RMSE functionality
 
 test_that("RMSE is computed accurately", {
-  expect_equal(RMSE_calc(1, 5), sqrt(mean(1 - 5)^2))
+  expect_equal(RMSE_calc(1, 5), sqrt((mean(1-5)/sd(5)^2)))
 })
 
 # test return is vector
 
 test_that("RMSE return is a vector", {
   expect_vector(RMSE_calc(x = 1:3, y = 3:5))
 })
+
+# tests for sigma
+test_that("sigma is a single value", {
+  expect_equal(RMSE_calc(1, 5, sigma = 1), sqrt(((mean(1-5)/1)^2)))
+})
+
+test_that("error if sigma is a vector of a different length than y", {
+  expect_error(RMSE_calc(1, 5, sigma = c(1, 2, 3)),
+               regexp = "Length of sigma must be a single value or a vector matching the length of observed data = 1")
+})
diff --git a/tests/testthat/test-score_bayesian.R b/tests/testthat/test-score_bayesian.R
@@ -4,11 +4,11 @@ m <- matrix(c(rep(1, 6), rep(2, 3), rep(3, 3)), nrow = 3, ncol = 4)
 # Testing error/warning cases
 
 test_that("function stops and produces error messages", {
-  # error when matrix has less than two columns
+  # error when matrix has less than three columns
   # m with only 2 cols
-  m_2row <- matrix(data = c(1:2), nrow = 2, ncol = 2)
-  expect_error(score_bayesian(m_2row),
-    regexp = "More than 2 columns must be included in input matrix"
+  m_2col <- matrix(data = c(1:6), nrow = 3, ncol = 2)
+  expect_error(score_bayesian(m_2col),
+    regexp = "More than 3 columns must be included in input matrix"
   )
 
   # error when entire m is NA
@@ -18,34 +18,30 @@ test_that("function stops and produces error messages", {
     regexp = "No non-NA values present in observed data"
   )
 
-  # warning when user supplies negative sigma values
-  expect_warning(score_bayesian(m, -1),
-    regexp = "sigma value cannot be negative."
+  # error when user supplies negative sensitivity value
+  expect_error(score_bayesian(m, sensitivity = -1),
+    regexp = "Sensitivity cannot be negative."
   )
+
+  # error when user supplies more than one sensitivity value
+  expect_error(score_bayesian(m, sensitivity = c(1, 2)),
+               regexp = "Sensitivity must be a single value.")
 })
 
 # Testing output accuracy
 
 test_that("scores assessed correctly", {
   # If entire column of modeled data is NA, set RMSE value to NA
-  m_NA <- matrix(data = c(rep(1, 3), rep(1, 3), rep(NA, 3)), nrow = 3, ncol = 3)
-  expect_equal(score_bayesian(m_NA, 2), c(0.5, NA))
+  m_NA <- matrix(data = c(rep(1, 3), rep(1, 3), rep(2, 3), rep(NA, 3)), nrow = 3, ncol = 4)
+  test_result_1 <- score_bayesian(m_NA, sigma = 1)
+  expect_equal(test_result_1 [3], as.numeric(NA))
 
   # scores equal when calculated RMSE = 0
-  m2 <- matrix(data = c(1, 1, 1), nrow = 1, ncol = 3)
-  expect_equal(score_bayesian(m2, 2), c(0.5, 0.5))
-
-  # when sigma = 0 all scores are identical
-  m3 <- matrix(data = c(1:3), nrow = 1, ncol = 3)
-  result <- score_bayesian(m3, 0)
-  expect_identical(result, c(rep(result[1], length(result))))
-
-  # expected output of function matches gaussian likelihood
-  m4 <- matrix(data = c(1, 2, 2), nrow = 1, ncol = 3)
-  sigma4 <- 2
-  expected_likelihood <- exp(-0.5 * (m4[1]^2) / sigma4^2) / m4[1, -1]
-  expect_equal(
-    score_bayesian(m4, sigma4),
-    expected_likelihood
-  )
-})
+  m2 <- matrix(data = c(rep(1,3), rep(1, 3), rep(1, 3), rep(1, 3)), nrow = 3, ncol = 4)
+  expect_equal(score_bayesian(m2, sigma = 1), c(1, 1, 1))
+
+  # when sensitivity = 0 all scores are identical
+  m3 <- matrix(data = c(1:4), nrow = 1, ncol = 4)
+  expect_equal(score_bayesian(m3, sigma = 1, sensitivity = 0), c(1, 1, 1))
+
+ })