## ----include = FALSE---------------------------------------------------------- is_check <- ("CheckExEnv" %in% search()) || any(c("_R_CHECK_TIMINGS_", "_R_CHECK_LICENSE_") %in% names(Sys.getenv())) knitr::opts_chunk$set( collapse = TRUE, comment = "#>"#,eval = !is_check ) ## ----setup-------------------------------------------------------------------- library(ezECM) ## ----datagen------------------------------------------------------------------ data_gen <- function(p = NULL, K = 3, Ntest = NULL, Ntrain = NULL, Ntrain_missing = NULL, tst_missing = NULL, trn_missing = NULL){ mu_use <- matrix(rnorm(n = p*K, sd = 0.5), ncol = K, nrow = p) Y <- list() S <- list() ## random number in each class N <- LaplacesDemon::rcat(n = Ntest + Ntrain + Ntrain_missing, p = rep(1/3, times = K)) N <- as.vector(table(N)) ## random very important category vic <- sample(1:K, size = 1) for(k in 1:K){ ## random number of blocks in kth category nblocks <- sample(1:2, size = 1) ## random covariance matrix for kth category S[[k]] <- LaplacesDemon::rinvwishart(nu = p + 4, S = diag(p)) ## If relevant, delete entries to form block covariance matrices of random sizes if(nblocks == 2){ nblock1 <- sample(1:(p-1), size = 1) block1_members <- sample(1:p, size = nblock1, replace = FALSE) block2_members <- (1:p)[-block1_members] zero_elements <- expand.grid(block1_members, block2_members) S[[k]][zero_elements$Var1, zero_elements$Var2] <- 0 S[[k]][zero_elements$Var2, zero_elements$Var1] <- 0 } ## data for kth class is drawn from a MVN, given the mean and covariance Y[[k]] <- as.data.frame(LaplacesDemon::rmvn(n = N[k], mu = mu_use[,k], Sigma= S[[k]])) ## data is transformed to (0,1) using the logistic function to run a check Ytemp<- 1/(1+ exp(-Y[[k]])) ## if machine precision causes the output of the logistic function to round to 1, the experiment is stopped ## this has never happened if(max(apply(Ytemp,2,function(X){range(X)})) == 1){ stop() }else{ Y[[k]]<- 1/(1+ exp(-Y[[k]])) } ## a column for the event class is appended to the data.frame Y[[k]] <- cbind(Y[[k]], as.character(k)) names(Y[[k]]) <- c(paste("p",as.character(1:p), sep = ""), "event") } Y <- do.call("rbind", Y) ## A random sample of the data is taken to be the testing set test_index <- sample(1:nrow(Y), size = Ntest) testing <- Y[test_index,] ## The remainder is slated for training training <- Y[-test_index, ] ## A random sample of the training set is set aside to have missing entries, ## the remainder is set aside to be the fully observed training set train_full_index <- sample(1:nrow(training), size = Ntrain) train_missing <- training[-train_full_index, ] train_full <- training[train_full_index, ] ## If all event categories are not represented in the training set of full observations, ## the sampling scheme repeats if(any(table(train_full$event) <= 1) | length(table(train_full$event)) <= (K-1)){ while(any(table(train_full$event) <= 1 | length(table(train_full$event)) <= (K-1))){ test_index <- sample(1:nrow(Y), size = Ntest) testing <- Y[test_index,] training <- Y[-test_index, ] train_full_index <- sample(1:nrow(training), size = Ntrain) train_missing <- training[-train_full_index, ] train_full <- training[train_full_index, ] } } ## the true event category for the testing set is saved as a seperate variable, ## and deleted from the data.frame containing the data test_truth <- testing$event testing$event <- NULL ## Entries are randomly selected for deletion from the testing set. ## This scheme ensures a single discriminant for each observation ## not deleted in order to reduce computation time. abs_present <- sample(size = Ntest, 1:p, replace = TRUE) missing_pool <- matrix(1:p, ncol = p, nrow = Ntest, byrow = TRUE) missing_pool <- t(apply(cbind(missing_pool, abs_present), 1, function(X,pp){ X[-c(X[pp + 1], pp +1)] }, pp = p)) missing_pool_save <- missing_pool frac_missing <- (p*tst_missing)/(p-1) # sample which of the remaining elements will be missing missing_sample <- sample(1:(nrow(missing_pool)*ncol(missing_pool)), size = floor(nrow(missing_pool)*ncol(missing_pool)*(frac_missing)), replace = FALSE) missing_pool_save[missing_sample] <- NA saved_data <- apply(cbind(missing_pool_save, unname(abs_present)), 1, function(X){ X[-which(is.na(X))] }) for(j in 1:nrow(testing)){ testing[j,-c(saved_data[[j]])] <- NA } ## Entries are randomly selected for deletion from the training set. ## This scheme ensures a single discriminant for each observation ## not deleted in order to reduce computation time. abs_present <- sample(size = Ntrain_missing, 1:p, replace = TRUE) abs_missing <- matrix(1:p, ncol = p, nrow = Ntrain_missing, byrow = TRUE) abs_missing <- t(apply(cbind(abs_missing, abs_present), 1, function(X,pp){ X[-c(X[pp + 1], pp +1)] }, pp = p)) abs_missing <- apply(abs_missing, 1, function(X){ sample(X, size = 1) }) missing_pool <- matrix(1:p, ncol = p, nrow = Ntrain_missing, byrow = TRUE) missing_pool <- t(apply(cbind(missing_pool, abs_present, abs_missing), 1, function(X,pp){ X[-c(X[pp + 1], X[pp + 2], pp +1, pp + 2)] }, pp = p)) missing_pool_save <- missing_pool frac_missing <- (p*trn_missing - 1)/(p-2) # sample which of the remaining elements will be missing missing_sample <- sample(1:(nrow(missing_pool)*ncol(missing_pool)), size = floor(nrow(missing_pool)*ncol(missing_pool)*(frac_missing)), replace = FALSE) missing_pool_save[missing_sample] <- NA saved_data <- apply(cbind(missing_pool_save, unname(abs_present)), 1, function(X){ X[-which(is.na(X))] }) for(j in 1:nrow(train_missing)){ train_missing[j,-c(saved_data[[j]], p+1)] <- NA } return(list(Y = list(train_full = train_full, train_missing = train_missing, testing = testing, test_truth = test_truth), params = list(mu = mu_use, Sig = S, vic = vic))) } ## ----------------------------------------------------------------------------- ## Data Parameters tst_missing <- 0.5 trn_missing <- 0.5 Ntrain <- 25 Ntest <- 100 Ntrain_missing <- 5 * Ntrain K <- 3 ## ----------------------------------------------------------------------------- P <- c(4,6) #P <- c(4,6,8,10) ## ----------------------------------------------------------------------------- alphatilde <- 0.05 ## ----------------------------------------------------------------------------- mixture_weights <- "training" ## ----------------------------------------------------------------------------- C01 <- matrix(c(0,1,1, 0), ncol = 2, nrow = 2) C01 ## ----------------------------------------------------------------------------- Cfneg <- C01 Cfneg[1,2] <- 2 Cfneg ## ----------------------------------------------------------------------------- Ccat <- matrix(1, ncol = 3, nrow = 3) - diag(3) Ccat ## ----------------------------------------------------------------------------- iters <- 3#250 ## ----------------------------------------------------------------------------- ## MCMC parameters BT <- c(150, 2150) #BT <- c(500, 50500) ## ----------------------------------------------------------------------------- thinning <- 5 ## ----------------------------------------------------------------------------- ## Experimental Parameters verb <- FALSE #verb <- TRUE ## ----------------------------------------------------------------------------- ## Data structures for saving progress cECM_recfp <- cECM_recfn <- bayes_rec <- cECM_rec <- matrix(NA, ncol = length(P), nrow = iters) Nvic <- rep(0, times = length(P)) exp_out <- list() method_template <- data.frame(matrix(NA, ncol = 3, nrow = iters)) names(method_template) <- c("accurate", "FN", "FP") data_template <- data.frame(matrix(NA, ncol = 2, nrow = iters)) names(data_template) <- c("Ntest", "Nvic") data_template$Ntest <- Ntest p_template <- list(cECM = method_template, becm = method_template, mbecm = method_template, mbecm_Cfn = method_template, mbecm_cat = method_template, data = data_template) bayes_rec <- cECM_rec <- matrix(NA, ncol = length(P), nrow = iters) if(verb){ toc <- Sys.time() } ## ----------------------------------------------------------------------------- ## Iterates over the differing numbers of total discriminants set for the experiment. for(p in P){ ## Builds the list for saving the results using a template list exp_out[[p]] <- p_template ## Runs each model for `iters` number of independent data sets. for(i in 1:iters){ ## The i^{th} run for p discriminants if(verb){ ## set the experimental parameter verb <- TRUE to print progress print(paste0("i = ", i, ", p = ", p, ", ", round(Sys.time() - toc, digits = 2), " ", units(Sys.time() - toc), " elapsed")) } ## Generate random data set Ylist <- data_gen(p = p, K = K, Ntest = Ntest, Ntrain = Ntrain, Ntrain_missing = Ntrain_missing, tst_missing = tst_missing, trn_missing = trn_missing) ## Saves the random data information to the environment train_full<- Ylist$Y$train_full train_missing <- Ylist$Y$train_missing testing <- Ylist$Y$testing test_truth <- Ylist$Y$test_truth ## Which category is the important one this time? vic <- Ylist$params$vic ## Save the true total number of `vic` events in the testing data to be used ## later for analyzing performance. exp_out[[p]][["data"]]$Nvic[i] <- sum(test_truth == as.character(vic)) ## Fit the classical ECM model, apply the decision framework with the ## `cECM_decision()` function, then save the results cECM <- cECM(x = train_full, transform = TRUE, newdata = testing) cECM_out <- apply(cECM_decision(pval = cECM, alphatilde = alphatilde, vic = as.character(vic), cat_truth = test_truth)$events[,1:3] ,2, sum, na.rm = TRUE) exp_out[[p]][["cECM"]][i,] <- unname(cECM_out) ## Fit the B-ECM model, using only full p observations bayes_fit <- BayesECM(Y = train_full) ## Run the predict function on the testing set. ## If there were multiple testing sets, the same model fit could be used on ## each one without having to rerun the `BayesECM()` function. This ## functionality is more important when using training data with missing ## entries. bayes_pred <- predict(bayes_fit, Ytilde = testing, mixture_weights = mixture_weights) ## The "becm_desision()" function applies the decision theoretic framework ## to the training and testing data. For one training and one testing set, ## where the user wants to try different values of `alphatilde` and `C`, it is ## not necessary to rerun the `BayesECM()` function or the `predict()` ## function. becm_out <- becm_decision(bayes_pred = bayes_pred, alphatilde = alphatilde, vic = as.character(vic), cat_truth = test_truth, pn = TRUE, C = C01) ## Summarize and save the data. becm_out <- apply(becm_out$results,2, sum, na.rm = TRUE) exp_out[[p]][["becm"]][i,] <- unname(becm_out) ## Fit and save the B-ECM model that includes missing data bayes_fit_missing <- BayesECM(Y = rbind(train_full, train_missing), BT = BT, verb = verb) bayes_pred_missing <- predict(bayes_fit_missing, Ytilde = testing, thinning = thinning, mixture_weights = mixture_weights) missing_out <- becm_decision(bayes_pred = bayes_pred_missing, alphatilde = alphatilde, vic = as.character(vic), cat_truth = test_truth, pn = TRUE, C = C01) mbecm_out <- apply(missing_out$results,2, sum, na.rm = TRUE) exp_out[[p]][["mbecm"]][i,] <- unname(mbecm_out) ## The rest of the B-ECM variants are different through decision theory, ## not the model fit. All use partial observations for training. ## Note that the rej argument is supplied to becm_decision to reduce computation time ## Record the decision when the loss matrix is adjusted to target ## false negatives. Cfn_out <- becm_decision(bayes_pred = bayes_pred_missing, alphatilde = alphatilde, vic = as.character(vic), cat_truth = test_truth, pn = TRUE, C = Cfneg, rej = missing_out$rej) becm_Cfn_out <- apply(Cfn_out$results,2, sum, na.rm = TRUE) exp_out[[p]][["mbecm_Cfn"]][i,] <- unname(becm_Cfn_out) ## Record the decisions when full class (K = 3) categorization is used ## instead of binary categorization cat_out <- becm_decision(bayes_pred = bayes_pred_missing, alphatilde = alphatilde, vic = as.character(vic), cat_truth = test_truth, pn = TRUE, C = Ccat, rej = missing_out$rej) becm_cat_out <- apply(cat_out$results,2, sum, na.rm = TRUE) exp_out[[p]][["mbecm_cat"]][i,] <- unname(becm_cat_out) } } ## ----------------------------------------------------------------------------- ECM_boxplot <- function(exp_out, P = P, models = c("cECM", "becm", "mbecm", "mbecm_Cfn", "mbecm_cat"), cols = NULL, legend_text = c("C-ECM", "B-ECM", "M-B-ECM", bquote("M-B-ECM, " * C["1,2"] == 2), "M-B-ECM Cat"), metric = "accurate"){ if(metric == "accurate"){ divisor <- function(exp_out, p){ return(exp_out[[p]]$data$Ntest) } ylab <- "Model Accuracy" }else if(metric == "FN"){ divisor <- function(exp_out,p){ return(exp_out[[p]]$data$Nvic) } ylab <- "False Negative Rate" }else if(metric == "FP"){ divisor <- function(exp_out, p){ return(exp_out[[p]]$data$Ntest - exp_out[[p]]$data$Nvic) } ylab <- "False Positive Rate" }else{ stop("Argument 'metric' must be one of the following case sensitive character strings: 'accurate', 'FN', or 'FP'.") } boxplotdf <- do.call("cbind", lapply(exp_out[[P[1]]][models], function(X, m = metric){ X[[m]] }))/divisor(exp_out, p = P[1]) for(p in P[-1]){ boxplotdf <- cbind(boxplotdf,do.call("cbind", lapply(exp_out[[p]][models], function(X, m = metric){ X[[m]] }))/divisor(exp_out, p = p)) } boxplotdf <- boxplotdf if(max(boxplotdf) > 65){ ylim <- c(min(boxplotdf), 1.1) }else{ ylim <- range(boxplotdf) * c(0.9,1.2) } if(is.null(cols)){ if(length(models) == 5){ pltcols <- hcl.colors(44, palette = "viridis", rev = TRUE)[c(3, 10, 20, 30, 37)] }else if(length(models) > 10){ warning("You should consider recoding this function with a different way to select the colors used for the plot.") pltcols <- hcl.colors(44, palette = "viridis", rev = TRUE)[seq(from = 2, to = 38, length.out = length(models))] }else{ pltcols <- hcl.colors(44, palette = "viridis", rev = TRUE)[seq(from = 2, to = 38, length.out = length(models))] } }else{ if(length(cols) != length(models)){ stop("If supplying colors, a vector the same length as the 'models' argument must be used.") } } opar <- par(no.readonly = TRUE) on.exit(expr = suppressWarnings(par(opar))) par(mar = c(4.25,3.85,1,0.5)) lmodels <- length(models) graphics::boxplot(boxplotdf, at = (1:((lmodels + 1)*length(P)))[-seq(from = lmodels + 1, to = length(P)*(lmodels + 1), by = lmodels + 1)], xaxt = "n", yaxt = "n", ylim = ylim, col = pltcols, xlab = "Number of Discriminants", ylab = ylab) py <- pretty(boxplotdf) graphics::axis(2, py) graphics::axis(1, at =seq(from = 1, to = length(P)*(lmodels + 1), by = lmodels + 1) + 1, labels = paste0("p = ", P) ) graphics::legend("topleft", bty ="n", legend = legend_text, fill = pltcols, horiz = FALSE, ncol = 3, y.intersp = 1.25) } ## ----------------------------------------------------------------------------- ECM_boxplot(exp_out = exp_out, P = P, metric = "accurate") ## ----------------------------------------------------------------------------- ECM_boxplot(exp_out = exp_out, P = P, metric = "FN")