## ----setup, include=FALSE-----------------------------------------------------
source("setup.R")

rdoc_url <- function(names, ...) names


## ----overview_modelinfo, echo=FALSE, message=FALSE----------------------------
library(MachineShop)
info <- modelinfo()


## ----overview_install, eval = FALSE-------------------------------------------
# # Current release from CRAN
# install.packages("MachineShop")
#
# # Development version from GitHub
# # install.packages("devtools")
# devtools::install_github("brian-j-smith/MachineShop")
#
# # Development version with vignettes
# devtools::install_github("brian-j-smith/MachineShop", build_vignettes = TRUE)


## ----overview_docs, eval = FALSE, message = FALSE-----------------------------
# library(MachineShop)
#
# # Package help summary
# ?MachineShop
#
# # Vignette
# RShowDoc("UserGuide", package = "MachineShop")


## ----using_example_melanoma---------------------------------------------------
## Analysis libraries and dataset
library(MachineShop)
library(survival)
data(Melanoma, package = "MASS")

## Malignant melanoma analysis dataset
surv_df <- within(Melanoma, {
  y <- Surv(time, status != 2)
  remove(time, status)
})


## ----using_example_summary, echo=FALSE----------------------------------------
surv_stats <- list(
  list("Number of subjects" = ~ length(y)),
  "time" = list("Median (Range)" = ~ median_range(y[, "time"])),
  "status" = list("1 = Dead" = ~ n_perc(y[, "status"] == 1),
                  "0 = Alive" = ~ n_perc(y[, "status"] == 0)),
  "sex" = list("1 = Male" = ~ n_perc(sex == 1),
               "0 = Female" = ~ n_perc(sex == 0)),
  "age" = list("Median (Range)" = ~ median_range(age)),
  "year" = list("Median (Range)" = ~ median_range(year)),
  "thickness" = list("Median (Range)" = ~ median_range(thickness)),
  "ulcer" = list("1 = Presence" = ~ n_perc(ulcer == 1),
                 "0 = Absence" = ~ n_perc(ulcer == 0))
)

summary_kbl(surv_stats, surv_df)


## ----using_example_survfit, echo=FALSE----------------------------------------
library(ggplot2)

col <- "#F8766D"
survfit(y ~ 1, data = surv_df) %>%
  with(data.frame(time, surv, lower, upper,
                  censor = ifelse(n.censor > 0, time, NA))) %>%
  ggplot(aes(x = time, y = surv)) +
  geom_ribbon(aes(ymin = lower, ymax = upper), fill = col, alpha = 0.2) +
  geom_step(color = col) +
  geom_point(aes(x = censor), shape = 3, color = col) +
  coord_cartesian(ylim = c(0, 1)) +
  labs(x = "Follow-Up Time (Days)", y = "Overall Survival Probability",
       title = "Kaplan-Meier survival plot")


## ----using_example_datasets---------------------------------------------------
## Training and test sets
set.seed(123)
train_indices <- sample(nrow(surv_df), nrow(surv_df) * 2 / 3)
surv_train <- surv_df[train_indices, ]
surv_test <- surv_df[-train_indices, ]

## Global formula for the analysis
surv_fo <- y ~ sex + age + year + thickness + ulcer


## ----using_fit_modelinfo------------------------------------------------------
## All available models
modelinfo() %>% names


## ----using_fit_modelinfo_gbmmodel---------------------------------------------
## Model-specific information
modelinfo(GBMModel)


## ----using_fit_gbmmodel-------------------------------------------------------
GBMModel


## ----using_fit_modelinfo_type-------------------------------------------------
## All survival response-specific models
modelinfo(Surv(0)) %>% names

## Identify survival response-specific models
modelinfo(Surv(0), CoxModel, GBMModel, SVMModel) %>% names


## ----using_fit_modelinfo_response---------------------------------------------
## Models for a response variable
modelinfo(surv_df$y) %>% names
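

## ----added_modelinfo_fields----------------------------------------------------
## Added illustration (not part of the original vignette): each modelinfo()
## element is a named list of metadata, so individual fields, such as the
## supported response types of GBMModel, can be extracted with base R:
modelinfo(GBMModel)[[1]]$response_types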
## ----using_fit_function, results="hide"---------------------------------------
## Generalized boosted regression fit

## Model function
surv_fit <- fit(surv_fo, data = surv_train, model = GBMModel)

## Model function name
fit(surv_fo, data = surv_train, model = "GBMModel")

## Model object
fit(surv_fo, data = surv_train,
    model = GBMModel(n.trees = 100, interaction.depth = 1))


## ----using_fit_dynamic, results="hide"-----------------------------------------
## Dynamic model parameter k = log number of observations

## Number of observations: nobs
fit(surv_fo, data = surv_train, model = CoxStepAICModel(k = .(log(nobs))))

## Response variable: y
fit(surv_fo, data = surv_train, model = CoxStepAICModel(k = .(log(length(y)))))


## ----using_predict_function----------------------------------------------------
## Predicted survival means (default: Weibull distribution)
predict(surv_fit, newdata = surv_test)

## Predicted survival means (empirical distribution)
predict(surv_fit, newdata = surv_test, distr = "empirical")


## ----using_predict_function_times-----------------------------------------------
## Predict survival probabilities and events at specified follow-up times
surv_times <- 365 * c(5, 10)

predict(surv_fit, newdata = surv_test, times = surv_times, type = "prob")

predict(surv_fit, newdata = surv_test, times = surv_times, cutoff = 0.7)


## ----using_variables_formula----------------------------------------------------
## Datasets
data(Pima.te, package = "MASS")
data(Pima.tr, package = "MASS")

## Formula specification
model_fit <- fit(type ~ ., data = Pima.tr, model = GBMModel)
predict(model_fit, newdata = Pima.te) %>% head


## ----using_variables_formula_RHS-------------------------------------------------
settings("RHS.formula")
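

## ----added_formula_inline--------------------------------------------------------
## Added sketch, assuming log() is among the allowed RHS.formula functions
## listed above: in-line functions from that list can transform predictors
## directly within a traditional formula specification.
model_fit <- fit(type ~ log(glu) + bmi + age, data = Pima.tr, model = GBMModel)
predict(model_fit, newdata = Pima.te) %>% head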
## ----using_variables_matrix------------------------------------------------------
## Example design matrix and response object
x <- model.matrix(type ~ . - 1, data = Pima.tr)
y <- Pima.tr$type

## Design matrix specification
model_fit <- fit(x, y, model = GBMModel)
predict(model_fit, newdata = Pima.te) %>% head


## ----using_variables_modelframe---------------------------------------------------
## Model frame specification

## Formula
mf <- ModelFrame(type ~ ., data = Pima.tr)
model_fit <- fit(mf, model = GBMModel)
predict(model_fit, newdata = Pima.te) %>% head

## Design matrix
mf <- ModelFrame(x, y)
model_fit <- fit(mf, model = GBMModel)
predict(model_fit, newdata = Pima.te) %>% head


## ----using_variables_modelframe_weights, results="hide"----------------------------
## Model frame specification with case weights
mf <- ModelFrame(ncases / (ncases + ncontrols) ~ agegp + tobgp + alcgp,
                 data = esoph, weights = ncases + ncontrols)
fit(mf, model = GBMModel)


## ----using_variables_recipe---------------------------------------------------------
## Recipe specification
library(recipes)

rec <- recipe(type ~ ., data = Pima.tr)
model_fit <- fit(rec, model = GBMModel)
predict(model_fit, newdata = Pima.te) %>% head


## ----using_variables_recipe_weights, results="hide"-----------------------------------
## Recipe specification with case weights
df <- within(esoph, {
  y <- ncases / (ncases + ncontrols)
  weights <- ncases + ncontrols
  remove(ncases, ncontrols)
})

rec <- recipe(y ~ agegp + tobgp + alcgp + weights, data = df) %>%
  role_case(weight = weights, replace = TRUE) %>%
  step_ordinalscore(agegp, tobgp, alcgp)
fit(rec, model = GBMModel)
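

## ----added_recipe_roles----------------------------------------------------------------
## Added sketch: the variable roles assigned in a recipe, including the case
## weight role above, can be reviewed with the recipes summary() method before
## fitting.
summary(rec)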
## ----using_variables_summary, echo=FALSE--------------------------------------
df <- data.frame(
  "Specification" = c("Traditional Formula", "Design Matrix",
                      "Traditional Formula", "Design Matrix", "Recipe"),
  "Preprocessing" = factor(c("manual", "manual", "manual", "manual",
                             "automatic"),
                           levels = c("manual", "automatic")),
  "In-line Functions" = factor(c("yes", "no", "yes", "no", "no"),
                               levels = c("no", "yes")),
  "Case Weights" = factor(c("equal", "equal", "user", "user", "user"),
                          levels = c("equal", "user")),
  "Resampling Strata" = factor(c("response", "response", "user", "user",
                                 "user"),
                               levels = c("response", "user")),
  "Computational Overhead" = factor(c("medium", "low", "medium", "low",
                                      "high"),
                                    levels = c("high", "medium", "low")),
  check.names = FALSE
)

bg_colors <- c("orange", "blue", "green")
df[-1] <- lapply(df[-1], function(x) {
  bg_colors <- if (nlevels(x) == 2) bg_colors[c(1, 3)] else bg_colors
  cell_spec(x, color = "white", background = bg_colors[x])
})

kable(df, align = c("l", rep("c", ncol(df) - 1)), escape = FALSE,
      caption = "Table 2. Characteristics of available variable specification approaches.") %>%
  kable_styling(c("striped", "condensed"), full_width = FALSE,
                position = "center") %>%
  column_spec(1, bold = TRUE) %>%
  kableExtra::group_rows("Model Frame", 3, 4)


## ----using_responses_factor----------------------------------------------------
## Iris flowers species (3-level factor)
model_fit <- fit(Species ~ ., data = iris, model = GBMModel)
predict(model_fit) %>% head
predict(model_fit, type = "prob") %>% head


## ----using_responses_factor_binary----------------------------------------------
## Pima Indians diabetes statuses (binary factor)
data(Pima.te, package = "MASS")
data(Pima.tr, package = "MASS")

model_fit <- fit(type ~ ., data = Pima.tr, model = GBMModel)
predict(model_fit, newdata = Pima.te) %>% head
predict(model_fit, newdata = Pima.te, type = "prob") %>% head


## ----using_responses_ordered-----------------------------------------------------
## Iowa City housing prices (ordered factor)
df <- within(ICHomes, {
  sale_amount <- cut(sale_amount, breaks = 3,
                     labels = c("Low", "Medium", "High"),
                     ordered_result = TRUE)
})

model_fit <- fit(sale_amount ~ ., data = df, model = GBMModel)
predict(model_fit) %>% head
predict(model_fit, type = "prob") %>% head


## ----using_responses_numeric------------------------------------------------------
## Iowa City housing prices
model_fit <- fit(sale_amount ~ ., data = ICHomes, model = GBMModel)
predict(model_fit) %>% head
predict(model_fit, type = "numeric") %>% head


## ----using_responses_matrix--------------------------------------------------------
## Anscombe's multiple regression models dataset

## Numeric matrix response formula
model_fit <- fit(cbind(y1, y2, y3) ~ x1, data = anscombe, model = LMModel)
predict(model_fit) %>% head


## ----using_responses_matrix_recipe---------------------------------------------------
## Numeric matrix response recipe

## Defined in a recipe formula
rec <- recipe(y1 + y2 + y3 ~ x1, data = anscombe)

## Defined within a data frame
df <- within(anscombe, {
  y <- cbind(y1, y2, y3)
  remove(y1, y2, y3)
})
rec <- recipe(y ~ x1, data = df)

model_fit <- fit(rec, model = LMModel)
predict(model_fit) %>% head


## ----using_responses_surv, results="hide"---------------------------------------------
## Survival response formula
library(survival)

fit(Surv(time, status) ~ ., data = veteran, model = GBMModel)


## ----using_responses_surv_recipe, results="hide"----------------------------------------
## Survival response recipe

## Defined in a recipe formula
rec <- recipe(time + status ~ ., data = veteran) %>%
  role_surv(time = time, event = status)

## Defined within a data frame
df <- within(veteran, {
  y <- Surv(time, status)
  remove(time, status)
})
rec <- recipe(y ~ ., data = df)

fit(rec, model = GBMModel)


## ----using_performance_function----------------------------------------------------------
## Survival performance metrics

## Observed responses
obs <- response(surv_fit, newdata = surv_test)

## Predicted survival means
pred_means <- predict(surv_fit, newdata = surv_test)
performance(obs, pred_means)

## Predicted survival probabilities
pred_probs <- predict(surv_fit, newdata = surv_test, times = surv_times,
                      type = "prob")
performance(obs, pred_probs)

## Predicted survival events
pred_events <- predict(surv_fit, newdata = surv_test, times = surv_times)
performance(obs, pred_events)
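

## ----added_metric_lookup-------------------------------------------------------------------
## Added illustration: metricinfo() accepts observed and predicted responses,
## so the metrics applicable to a given pairing can be listed before calling
## performance(); the same lookup appears again in the metrics section below.
metricinfo(obs, pred_means) %>% names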
"cindex") # # ## List of metric functions # performance(obs, pred_means, metrics = c(cindex, rmse, rmsle)) # # ## Named list of metric functions # performance(obs, pred_means, # metrics = c("CIndex" = cindex, "RMSE" = rmse, "RMSLE" = rmsle)) ## ----using_performance_function_cutoff---------------------------------------- ## User-specified survival probability metrics performance(obs, pred_probs, metrics = c(sensitivity, specificity), cutoff = 0.7) ## ----using_metrics_functions-------------------------------------------------- ## Metric functions for survival means cindex(obs, pred_means) rmse(obs, pred_means) rmsle(obs, pred_means) ## Metric functions for survival probabilities sensitivity(obs, pred_probs) specificity(obs, pred_probs) ## ----using_metrics_metricinfo------------------------------------------------- ## All available metrics metricinfo() %>% names ## ----using_metrics_metricinfo_cindex------------------------------------------ ## Metric-specific information metricinfo(cindex) ## ----using_metrics_cindex----------------------------------------------------- cindex ## ----using_metrics_metricinfo_type-------------------------------------------- ## Metrics for observed and predicted response variable types metricinfo(Surv(0)) %>% names metricinfo(Surv(0), numeric(0)) %>% names metricinfo(Surv(0), SurvEvents(0)) %>% names metricinfo(Surv(0), SurvProbs(0)) %>% names ## Identify survival-specific metrics metricinfo(Surv(0), auc, cross_entropy, cindex) %>% names ## ----using_metrics_metricinfo_response---------------------------------------- ## Metrics for observed and predicted responses from model fits metricinfo(obs, pred_means) %>% names metricinfo(obs, pred_probs) %>% names ## ----using_metrics_conf, echo=FALSE------------------------------------------- conf <- matrix(c("True Negative (TN)", "False Positive (FP)", "False Negative (FN)", "True Positive (TP)"), 2, 2, dimnames = list("Predicted Response" = c("Negative", "Positive"), "Observed Response" = c("Negative", "Positive"))) kable(conf, caption = "Table 4. Confusion matrix of observed and predicted response classifications.", align = c("c", "c")) %>% kable_styling(full_width = FALSE, position = "center") %>% add_header_above(c("Predicted Response" = 1, "Observed Response" = 2)) ## ----using_metrics_conf_surv, echo=FALSE-------------------------------------- conf <- matrix( c( "$TN = \\Pr(\\hat{S}(t) \\ge \\text{cutoff} \\cap T \\gt t)$", "$FP = \\Pr(\\hat{S}(t) \\lt \\text{cutoff} \\cap T \\gt t)$", "$FN = \\Pr(\\hat{S}(t) \\ge \\text{cutoff} \\cap T \\le t)$", "$TP = \\Pr(\\hat{S}(t) \\lt \\text{cutoff} \\cap T \\le t)$" ), 2, 2, dimnames = list( "Predicted Response" = c("Non-Event", "Event"), "Observed Response" = c("Non-Event", "Event") ) ) kable(conf, caption = "Table 5. 
## ----using_metrics_conf_surv, echo=FALSE--------------------------------------
conf <- matrix(
  c(
    "$TN = \\Pr(\\hat{S}(t) \\ge \\text{cutoff} \\cap T \\gt t)$",
    "$FP = \\Pr(\\hat{S}(t) \\lt \\text{cutoff} \\cap T \\gt t)$",
    "$FN = \\Pr(\\hat{S}(t) \\ge \\text{cutoff} \\cap T \\le t)$",
    "$TP = \\Pr(\\hat{S}(t) \\lt \\text{cutoff} \\cap T \\le t)$"
  ),
  2, 2,
  dimnames = list(
    "Predicted Response" = c("Non-Event", "Event"),
    "Observed Response" = c("Non-Event", "Event")
  )
)

kable(conf,
      caption = "Table 5. Confusion matrix of observed and predicted survival response classifications.",
      align = c("c", "c"), escape = FALSE) %>%
  kable_styling(full_width = FALSE, position = "center") %>%
  add_header_above(c("Predicted Response" = 1, "Observed Response" = 2))


## ----using_resample_control-----------------------------------------------------
## Control structures for K-fold cross-validation

## Prediction of survival means
surv_means_control <- CVControl(folds = 5, repeats = 3, seed = 123)

## Prediction of survival probabilities
surv_probs_control <- CVControl(folds = 5, repeats = 3, seed = 123) %>%
  set_predict(times = surv_times)

## User-specification of the default control structure
MachineShop::settings(control = CVControl(folds = 5, seed = 123))

## Package default
# MachineShop::settings(reset = "control")


## ----using_resample_parallel------------------------------------------------------
## Register multiple cores for parallel computations
library(doParallel)
registerDoParallel(cores = 2)


## ----using_resample_function--------------------------------------------------------
## Resample estimation for survival means and probabilities
(res_means <- resample(surv_fo, data = surv_train, model = GBMModel,
                       control = surv_means_control))

(res_probs <- resample(surv_fo, data = surv_train, model = GBMModel,
                       control = surv_probs_control))


## ----using_resample_summary----------------------------------------------------------
## Summary of survival means metric
summary(res_means)

## Summary of survival probability metrics
summary(res_probs)


## ----using_resample_summary_performance------------------------------------------------
## Resample-specific metrics
metricinfo(res_means) %>% names

## User-specified survival means metrics
summary(performance(res_means, metrics = c(cindex, rmse)))


## ----using_resample_summary_stats--------------------------------------------------------
## User-defined statistics function
percentiles <- function(x) quantile(x, probs = c(0.25, 0.50, 0.75))
summary(res_means, stats = percentiles)

## User-defined list of statistics functions
summary(res_means, stats = c(Mean = mean, Percentile = percentiles))


## ----using_resample_plots-----------------------------------------------------------------
## Libraries for plot annotation and formatting
library(ggplot2)
library(gridExtra)

## Individual ggplots
p1 <- plot(res_means)
p2 <- plot(res_means, type = "density")
p3 <- plot(res_means, type = "errorbar")
p4 <- plot(res_means, type = "violin")

## Grid of plots
grid.arrange(p1, p2, p3, p4, nrow = 2)


## ----using_resample_strata, results="hide"--------------------------------------------------
## Model frame with response variable stratification
mf <- ModelFrame(surv_fo, data = surv_train, strata = surv_train$y)
resample(mf, model = GBMModel)

## Recipe with response variable stratification
rec <- recipe(y ~ ., data = surv_train) %>%
  role_case(stratum = y)
resample(rec, model = GBMModel)


## ----using_resample_dynamic, results="hide"--------------------------------------------------
## Dynamic model parameter k = log number of training set observations
resample(surv_fo, data = surv_train,
         model = CoxStepAICModel(k = .(log(nobs))))
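

## ----added_parallel_cleanup--------------------------------------------------------------------
## Added sketch: when the parallel resampling computations are finished, the
## doParallel backend registered earlier can be unregistered with foreach's
## registerDoSEQ(), which restores sequential processing.
# registerDoSEQ()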
## ----using_resample_comparisons-------------------------------------------------
## Resample estimation
res1 <- resample(surv_fo, data = surv_train,
                 model = GBMModel(n.trees = 25), control = surv_means_control)
res2 <- resample(surv_fo, data = surv_train,
                 model = GBMModel(n.trees = 50), control = surv_means_control)
res3 <- resample(surv_fo, data = surv_train,
                 model = GBMModel(n.trees = 100), control = surv_means_control)

## Combine resample output for comparison
(res <- c(GBM1 = res1, GBM2 = res2, GBM3 = res3))

summary(res)

plot(res)


## ----using_resample_diff----------------------------------------------------------
## Pairwise model comparisons
(res_diff <- diff(res))

summary(res_diff)

plot(res_diff)


## ----using_resample_diff_test--------------------------------------------------------
t.test(res_diff)


## ----using_analyses_vi-----------------------------------------------------------------
## Predictor variable importance
(vi <- varimp(surv_fit, method = "model"))

plot(vi)


## ----using_analysis_vi_info--------------------------------------------------------------
SVMModel

modelinfo(SVMModel)[[1]]$varimp


## ----using_analysis_vi_permute-------------------------------------------------------------
## Permutation-based variable importance
varimp(surv_fit)


## ----using_analysis_rfe----------------------------------------------------------------------
## Recursive feature elimination
(surv_rfe <- rfe(surv_fo, data = surv_train, model = GBMModel,
                 control = surv_means_control))

rfe_summary <- summary(surv_rfe)
rfe_summary$terms[rfe_summary$selected]


## ----using_analyses_pd, results = "hide"--------------------------------------------------------
## Partial dependence plots
pd <- dependence(surv_fit, select = c(thickness, age))
plot(pd)


## ----using_analyses_pd_data, results = "hide"----------------------------------------------------
pd <- dependence(surv_fit, data = surv_test, select = thickness, n = 20,
                 intervals = "quantile")
plot(pd)


## ----using_analyses_cal, results="hide"----------------------------------------------------------
## Binned calibration curves
cal <- calibration(res_probs, breaks = 10)
plot(cal, se = TRUE)


## ----using_analyses_cal_smoothed, results="hide"--------------------------------------------------
## Smoothed calibration curves
cal <- calibration(res_probs, breaks = NULL)
plot(cal)


## ----using_analyses_conf---------------------------------------------------------------------------
## Confusion matrices
(conf <- confusion(res_probs, cutoff = 0.7))


## ----using_analyses_conf_plot, results="hide"-------------------------------------------------------
plot(conf)


## ----using_analyses_conf_summary---------------------------------------------------------------------
## Summary performance metrics
summary(conf)


## ----using_analyses_conf_performance------------------------------------------------------------------
## Confusion matrix-specific metrics
metricinfo(conf) %>% names

## User-specified metrics
performance(conf, metrics = c("Accuracy" = accuracy,
                              "Sensitivity" = sensitivity,
                              "Specificity" = specificity))


## ----using_analyses_roc--------------------------------------------------------------------------------
## ROC curves
roc <- performance_curve(res_probs)
plot(roc, diagonal = TRUE)


## ----using_analyses_roc_cutoffs--------------------------------------------------------------------------
plot(roc, type = "cutoffs")


## ----using_analyses_roc_auc------------------------------------------------------------------------------
auc(roc)


## ----using_analyses_pr------------------------------------------------------------------------------------
## Precision-recall curves
pr <- performance_curve(res_probs, metrics = c(precision, recall))
plot(pr)


## ----using_analyses_pr_auc----------------------------------------------------------------------------------
auc(pr)


## ----using_analyses_lift------------------------------------------------------------------------------------
## Lift curves
lf <- lift(res_probs)
plot(lf, find = 0.75)
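

## ----added_cutoff_grid---------------------------------------------------------------------------------------
## Added sketch: the classification cutoff can be varied over a grid of values
## to see its effect on summary metrics, reusing the confusion() and summary()
## calls from above.
for (cutoff in c(0.5, 0.7, 0.9)) {
  print(summary(confusion(res_probs, cutoff = cutoff)))
}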
## ----using_strategies_TunedInput1, eval=FALSE----------------------------------
# ## Preprocessing recipe with PCA steps
# pca_rec <- recipe(y ~ ., data = surv_train) %>%
#   role_case(stratum = y) %>%
#   step_center(all_predictors()) %>%
#   step_scale(all_predictors()) %>%
#   step_pca(all_predictors(), id = "PCA")
#
# ## Tuning grid of number of PCA components
# pca_grid <- expand_steps(
#   PCA = list(num_comp = 1:3)
# )
#
# ## Tuning specification
# tun_rec <- TunedInput(pca_rec, grid = pca_grid)


## ----using_strategies_TunedInput2, eval=FALSE----------------------------------
# ## Input-tuned model fit and final trained model
# model_fit <- fit(tun_rec, model = GBMModel)
# as.MLModel(model_fit)
# #> --- MLModel object ----------------------------------------------------------
# #>
# #> Model name: GBMModel
# #> Label: Trained Generalized Boosted Regression
# #> Package: gbm
# #> Response types: factor, numeric, PoissonVariate, Surv
# #> Case weights support: TRUE
# #> Missing case removal: response
# #> Tuning grid: TRUE
# #> Variable importance: TRUE
# #>
# #> Parameters:
# #> List of 5
# #>  $ n.trees          : num 100
# #>  $ interaction.depth: num 1
# #>  $ n.minobsinnode   : num 10
# #>  $ shrinkage        : num 0.1
# #>  $ bag.fraction     : num 0.5
# #>
# #> === $TrainingStep1 ==========================================================
# #> === TrainingStep object ===
# #>
# #> Optimization method: Grid Search
# #> TunedModelRecipe log:
# #> # A tibble: 3 × 4
# #>   name          selected params$PCA$num_comp metrics$`C-Index`
# #>   <chr>         <lgl>                  <int>             <dbl>
# #> 1 ModelRecipe.1 TRUE                       1             0.740
# #> 2 ModelRecipe.2 FALSE                      2             0.720
# #> 3 ModelRecipe.3 FALSE                      3             0.725
# #>
# #> Selected row: 1
# #> Metric: C-Index = 0.7399641


## ----using_strategies_SelectedInput1, eval=FALSE------------------------------
# ## Preprocessing recipes without PCA steps
# rec1 <- recipe(y ~ sex + age + year + thickness + ulcer,
#                data = surv_train) %>%
#   role_case(stratum = y)
# rec2 <- recipe(y ~ sex + age + year, data = surv_train) %>%
#   role_case(stratum = y)
#
# ## Selection among recipes with and without PCA steps
# sel_rec <- SelectedInput(
#   rec1,
#   rec2,
#   TunedInput(pca_rec, grid = pca_grid)
# )


## ----using_strategies_SelectedInput2, eval=FALSE------------------------------
# ## Input-selected model fit and final trained model
# model_fit <- fit(sel_rec, model = GBMModel)
# as.MLModel(model_fit)
# #> --- MLModel object ----------------------------------------------------------
# #>
# #> Model name: GBMModel
# #> Label: Trained Generalized Boosted Regression
# #> Package: gbm
# #> Response types: factor, numeric, PoissonVariate, Surv
# #> Case weights support: TRUE
# #> Missing case removal: response
# #> Tuning grid: TRUE
# #> Variable importance: TRUE
# #>
# #> Parameters:
# #> List of 5
# #>  $ n.trees          : num 100
# #>  $ interaction.depth: num 1
# #>  $ n.minobsinnode   : num 10
# #>  $ shrinkage        : num 0.1
# #>  $ bag.fraction     : num 0.5
# #>
# #> === $TrainingStep1 ==========================================================
# #> === TrainingStep object ===
# #>
# #> Optimization method: Grid Search
# #> SelectedModelRecipe log:
# #> # A tibble: 3 × 4
# #>   name             selected params$id  metrics$`C-Index`
# #>   <chr>            <lgl>    <chr>                  <dbl>
# #> 1 ModelRecipe.1    FALSE    input.azpi             0.761
# #> 2 ModelRecipe.2    FALSE    input.aLHo             0.643
# #> 3 TunedModelRecipe TRUE     input.W4KN             0.796
# #>
# #> Selected row: 3
# #> Metric: C-Index = 0.7960841
# #>
# #> === $TrainingStep2 ==========================================================
# #> === TrainingStep object ===
# #>
# #> Optimization method: Grid Search
# #> TunedModelRecipe log:
# #> # A tibble: 3 × 4
# #>   name          selected params$PCA$num_comp metrics$`C-Index`
# #>   <chr>         <lgl>                  <int>             <dbl>
# #> 1 ModelRecipe.1 TRUE                       1             0.740
# #> 2 ModelRecipe.2 FALSE                      2             0.720
# #> 3 ModelRecipe.3 FALSE                      3             0.725
# #>
# #> Selected row: 1
# #> Metric: C-Index = 0.7399641
## ----using_strategies_SelectedInput3, eval=FALSE------------------------------
# ## Traditional formulas
# fo1 <- y ~ sex + age + year + thickness + ulcer
# fo2 <- y ~ sex + age + year
#
# ## Selection among formulas
# sel_fo <- SelectedInput(fo1, fo2, data = surv_train)
#
# ## Input-selected model fit and final trained model
# model_fit <- fit(sel_fo, model = GBMModel)
# as.MLModel(model_fit)


## ----using_strategies_SelectedInput4, eval=FALSE------------------------------
# ## Different combinations of inputs and models
# sel_mfo <- SelectedInput(
#   ModelSpecification(fo1, data = surv_train, model = CoxModel),
#   ModelSpecification(fo2, data = surv_train, model = GBMModel)
# )
#
# ## Input-selected model fit and final trained model
# model_fit <- fit(sel_mfo)
# as.MLModel(model_fit)


## ----using_strategies_tune, eval=FALSE----------------------------------------
# ## Tune over automatic grid of model parameters
# model_fit <- fit(surv_fo, data = surv_train,
#                  model = TunedModel(
#                    GBMModel,
#                    grid = 3,
#                    control = surv_means_control,
#                    metrics = c("CIndex" = cindex, "RMSE" = rmse)
#                  ))
# (trained_model <- as.MLModel(model_fit))
# #> --- MLModel object ----------------------------------------------------------
# #>
# #> Model name: GBMModel
# #> Label: Trained Generalized Boosted Regression
# #> Package: gbm
# #> Response types: factor, numeric, PoissonVariate, Surv
# #> Case weights support: TRUE
# #> Missing case removal: response
# #> Tuning grid: TRUE
# #> Variable importance: TRUE
# #>
# #> Parameters:
# #> List of 5
# #>  $ n.trees          : int 50
# #>  $ interaction.depth: int 1
# #>  $ n.minobsinnode   : num 10
# #>  $ shrinkage        : num 0.1
# #>  $ bag.fraction     : num 0.5
# #>
# #> === $TrainingStep1 ==========================================================
# #> === TrainingStep object ===
# #>
# #> Optimization method: Grid Search
# #> TunedModel log:
# #> # A tibble: 9 × 4
# #>   name       selected params$n.trees $interaction.depth metrics$CIndex  $RMSE
# #>   <chr>      <lgl>             <int>              <int>          <dbl>  <dbl>
# #> 1 GBMModel.1 TRUE                 50                  1          0.765  3869.
# #> 2 GBMModel.2 FALSE                50                  2          0.750  5450.
# #> 3 GBMModel.3 FALSE                50                  3          0.739  7877.
# #> 4 GBMModel.4 FALSE               100                  1          0.746  3919.
# #> 5 GBMModel.5 FALSE               100                  2          0.730  5262.
# #> 6 GBMModel.6 FALSE               100                  3          0.720 10596.
# #> 7 GBMModel.7 FALSE               150                  1          0.726  4167.
# #> 8 GBMModel.8 FALSE               150                  2          0.712  5428.
# #> 9 GBMModel.9 FALSE               150                  3          0.699 16942.
# #>
# #> Selected row: 1
# #> Metric: CIndex = 0.7647633


## ----using_strategies_tune_grid, eval=FALSE------------------------------------
# ## Tune over randomly sampled grid points
# fit(surv_fo, data = surv_train,
#     model = TunedModel(
#       GBMModel,
#       grid = TuningGrid(size = 100, random = 10),
#       control = surv_means_control
#     ))
#
# ## Tune over user-specified grid points
# fit(surv_fo, data = surv_train,
#     model = TunedModel(
#       GBMModel,
#       grid = expand_params(n.trees = c(25, 50, 100),
#                            interaction.depth = 1:3),
#       control = surv_means_control
#     ))
## ----using_strategies_tune_summary, eval=FALSE---------------------------------
# summary(trained_model)
# #> --- $TrainingStep1 ----------------------------------------------------------
# #> # A tibble: 9 × 4
# #>   name       selected params$n.trees $interaction.depth metrics$CIndex  $RMSE
# #>   <chr>      <lgl>             <int>              <int>          <dbl>  <dbl>
# #> 1 GBMModel.1 TRUE                 50                  1          0.765  3869.
# #> 2 GBMModel.2 FALSE                50                  2          0.750  5450.
# #> 3 GBMModel.3 FALSE                50                  3          0.739  7877.
# #> 4 GBMModel.4 FALSE               100                  1          0.746  3919.
# #> 5 GBMModel.5 FALSE               100                  2          0.730  5262.
# #> 6 GBMModel.6 FALSE               100                  3          0.720 10596.
# #> 7 GBMModel.7 FALSE               150                  1          0.726  4167.
# #> 8 GBMModel.8 FALSE               150                  2          0.712  5428.
# #> 9 GBMModel.9 FALSE               150                  3          0.699 16942.
#
# performance(trained_model) %>% lapply(summary)
# #> $TrainingStep1
# #> , , Metric = CIndex
# #>
# #>             Statistic
# #> Model             Mean    Median         SD       Min       Max NA
# #>   GBMModel.1 0.7647633 0.7630332 0.05151443 0.6497797 0.8632075  0
# #>   GBMModel.2 0.7502765 0.7549020 0.04636802 0.6688742 0.8349057  0
# #>   GBMModel.3 0.7388766 0.7500000 0.04594197 0.6621622 0.8197425  0
# #>   GBMModel.4 0.7463392 0.7601810 0.05087334 0.6497797 0.8160377  0
# #>   GBMModel.5 0.7301734 0.7345133 0.04361416 0.6621622 0.7847222  0
# #>   GBMModel.6 0.7199483 0.7122642 0.05168649 0.6216216 0.7889908  0
# #>   GBMModel.7 0.7263693 0.7385321 0.05069831 0.6351351 0.7896996  0
# #>   GBMModel.8 0.7121314 0.7169811 0.05331152 0.6199095 0.7847222  0
# #>   GBMModel.9 0.6994678 0.7156863 0.05314909 0.6081081 0.7639485  0
# #>
# #> , , Metric = RMSE
# #>
# #>             Statistic
# #> Model             Mean    Median        SD      Min       Max NA
# #>   GBMModel.1  3868.629  3321.249  1431.839 2123.875  7401.789  0
# #>   GBMModel.2  5450.093  4741.129  2438.173 2385.426 11695.273  0
# #>   GBMModel.3  7877.487  5827.739  5036.837 4192.463 20009.854  0
# #>   GBMModel.4  3919.126  3535.514  1454.999 2373.413  7558.280  0
# #>   GBMModel.5  5262.122  5052.969  1900.353 2280.193  8212.727  0
# #>   GBMModel.6 10595.888  8308.733  7057.327 4996.420 28610.367  0
# #>   GBMModel.7  4167.128  3789.596  1115.371 2587.440  6701.087  0
# #>   GBMModel.8  5427.518  6129.309  2313.742 2039.894  9212.369  0
# #>   GBMModel.9 16941.696 10865.160 16841.774 4914.743 71150.177  0


## ----using_strategies_tune_plot, eval=FALSE------------------------------------
# plot(trained_model, type = "line")
# #> $TrainStep1


## ----using_strategies_tune_png, echo=FALSE-------------------------------------
knitr::include_graphics("img/using_strategies_tune_plot-1.png")


## ----using_strategies_select, results="hide", eval=FALSE-----------------------
# ## Model interface for model selection
# sel_model <- SelectedModel(
#   expand_model(GBMModel, n.trees = c(50, 100), interaction.depth = 1:2),
#   GLMNetModel(lambda = 0.01),
#   CoxModel,
#   SurvRegModel
# )
#
# ## Fit the selected model
# fit(surv_fo, data = surv_train, model = sel_model)


## ----using_strategies_select_tune, results="hide", eval=FALSE------------------
# ## Model interface for selection among tuned models
# sel_tun_model <- SelectedModel(
#   TunedModel(GBMModel, control = surv_means_control),
#   TunedModel(GLMNetModel, control = surv_means_control),
#   TunedModel(CoxModel, control = surv_means_control)
# )
#
# ## Fit the selected tuned model
# fit(surv_fo, data = surv_train, model = sel_tun_model)


## ----using_strategies_ensembles, eval=FALSE------------------------------------
# ## Stacked regression
# stackedmodel <- StackedModel(CoxModel, CForestModel, GLMBoostModel)
# res_stacked <- resample(surv_fo, data = surv_train, model = stackedmodel)
# summary(res_stacked)
# #>          Statistic
# #> Metric        Mean   Median        SD       Min       Max NA
# #>   C-Index 0.7294869 0.762963 0.1275484 0.5194805 0.8556701  0
#
# ## Super learner
# supermodel <- SuperModel(CoxModel, CForestModel, GLMBoostModel,
#                          model = GBMModel)
# res_super <- resample(surv_fo, data = surv_train, model = supermodel)
# summary(res_super)
# #>          Statistic
# #> Metric        Mean    Median        SD       Min       Max NA
# #>   C-Index 0.7534803 0.8325472 0.1243104 0.5748899 0.8454545  0
## ----using_strategies_methods, eval=FALSE--------------------------------------
# ## Preprocessing recipe with PCA steps
# pca_rec <- recipe(y ~ ., data = surv_train) %>%
#   role_case(stratum = y) %>%
#   step_center(all_predictors()) %>%
#   step_scale(all_predictors()) %>%
#   step_pca(all_predictors(), id = "PCA")
#
# ## Tuning grid of number of PCA components
# pca_grid <- expand_steps(
#   PCA = list(num_comp = 1:3)
# )
#
# ## Input specification
# tun_rec <- TunedInput(pca_rec, grid = pca_grid)
#
# ## Model specification
# sel_model <- SelectedModel(
#   GBMModel,
#   TunedModel(GBMModel),
#   StackedModel(CoxModel, TunedModel(CForestModel), TunedModel(GBMModel))
# )
#
# ## Model fit and final trained model
# model_fit <- fit(tun_rec, model = sel_model)
# as.MLModel(model_fit)


## ----using_strategies_dag, echo = FALSE, out.width = "100%"--------------------
knitr::include_graphics("img/FigModelDAG.png")


## ----using_strategies_nestedcv, echo = FALSE, out.width = "100%"---------------
knitr::include_graphics("img/FigNestedCV.png")


## ----using_strategies_methods1, eval=FALSE-------------------------------------
# #> === $TrainingStep1 ==========================================================
# #> === TrainingStep object ===
# #>
# #> Optimization method: Grid Search
# #> TunedModelRecipe log:
# #> # A tibble: 3 × 4
# #>   name          selected params$PCA$num_comp metrics$`C-Index`
# #>   <chr>         <lgl>                  <int>             <dbl>
# #> 1 ModelRecipe.1 TRUE                       1             0.741
# #> 2 ModelRecipe.2 FALSE                      2             0.730
# #> 3 ModelRecipe.3 FALSE                      3             0.702
# #>
# #> Selected row: 1
# #> Metric: C-Index = 0.7405534


## ----using_strategies_methods2, eval=FALSE-------------------------------------
# #> === $TrainingStep2 ==========================================================
# #> === TrainingStep object ===
# #>
# #> Optimization method: Grid Search
# #> SelectedModel log:
# #> # A tibble: 3 × 4
# #>   name         selected params$id  metrics$`C-Index`
# #>   <chr>        <lgl>    <chr>                  <dbl>
# #> 1 GBMModel     TRUE     model.1mQk             0.740
# #> 2 TunedModel   FALSE    model.hRF5             0.735
# #> 3 StackedModel FALSE    model.CYj0             0.653
# #>
# #> Selected row: 1
# #> Metric: C-Index = 0.7399641


## ----using_strategies_methods0, eval=FALSE-------------------------------------
# #> --- MLModel object ----------------------------------------------------------
# #>
# #> Model name: GBMModel
# #> Label: Trained Generalized Boosted Regression
# #> Package: gbm
# #> Response types: factor, numeric, PoissonVariate, Surv
# #> Case weights support: TRUE
# #> Missing case removal: response
# #> Tuning grid: TRUE
# #> Variable importance: TRUE
# #>
# #> Parameters:
# #> List of 5
# #>  $ n.trees          : num 100
# #>  $ interaction.depth: num 1
# #>  $ n.minobsinnode   : num 10
# #>  $ shrinkage        : num 0.1
# #>  $ bag.fraction     : num 0.5


## ----eval=FALSE-----------------------------------------------------------------
# ## Generalization performance of the modeling strategy
# resample(tun_rec, model = sel_model)


## ----using_settings----------------------------------------------------------------
## Change settings
presets <- settings(control = "BootControl", grid = 10)

## View one setting
settings("control")

## View multiple settings
settings("control", "grid")

## Restore the previous settings
settings(presets)
## ----using_extensions_mlmodel--------------------------------------------------
## Logistic regression model extension
LogisticModel <- MLModel(
  name = "LogisticModel",
  label = "Logistic Model",
  response_types = "binary",
  weights = TRUE,
  fit = function(formula, data, weights, ...) {
    glm(formula, data = as.data.frame(data), weights = weights,
        family = binomial, ...)
  },
  predict = function(object, newdata, ...) {
    predict(object, newdata = as.data.frame(newdata), type = "response")
  },
  varimp = function(object, ...) {
    pchisq(coef(object)^2 / diag(vcov(object)), 1)
  }
)


## ----using_extensions_mlmetric---------------------------------------------------
## F2 score metric extension
f2_score <- MLMetric(
  function(observed, predicted, ...) {
    f_score(observed, predicted, beta = 2, ...)
  },
  name = "f2_score",
  label = "F2 Score",
  maximize = TRUE
)


## ----using_extensions_usage--------------------------------------------------------
## Logistic regression analysis
data(Pima.tr, package = "MASS")
res <- resample(type ~ ., data = Pima.tr, model = LogisticModel)
summary(performance(res, metrics = f2_score))
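

## ----added_extension_fit-------------------------------------------------------------
## Added sketch: the extension behaves like a built-in model, so it can also be
## passed to fit(), and its custom varimp function is applied when
## model-specific variable importance is requested.
model_fit <- fit(type ~ ., data = Pima.tr, model = LogisticModel)
varimp(model_fit, method = "model")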
## ----reference_models, echo = FALSE--------------------------------------------
library(MachineShop)

info <- modelinfo()

types <- c("binary" = "b", "factor" = "f", "matrix" = "m", "numeric" = "n",
           "ordered" = "o", "Surv" = "S")
x <- lapply(names(info), function(modelname) {
  c(modelname, ifelse(names(types) %in% info[[modelname]]$response_types,
                      types, NA))
})
df <- as.data.frame(do.call(rbind, x), stringsAsFactors = FALSE)
names(df) <- c("Function", names(types))

rdoc_names <- sapply(df$Function, function(name) {
  switch(name,
         "CoxStepAICModel" = "CoxModel",
         "GLMStepAICModel" = "GLMModel",
         "PDAModel" = "FDAModel",
         "SurvRegStepAICModel" = "SurvRegModel",
         "SVMANOVAModel" = "SVMModel",
         "SVMBesselModel" = "SVMModel",
         "SVMLaplaceModel" = "SVMModel",
         "SVMLinearModel" = "SVMModel",
         "SVMPolyModel" = "SVMModel",
         "SVMRadialModel" = "SVMModel",
         "SVMSplineModel" = "SVMModel",
         "SVMTanhModel" = "SVMModel",
         "XGBDARTModel" = "XGBModel",
         "XGBLinearModel" = "XGBModel",
         "XGBTreeModel" = "XGBModel",
         name)
})

toString2 <- function(x) toString(na.omit(x))
df_classes <- data.frame(
  Function = rdoc_url(df$Function, rdoc_names),
  Label = sapply(info, getElement, name = "label"),
  Categorical = apply(df[c("binary", "factor", "ordered")], 1, toString2),
  Continuous = apply(df[c("matrix", "numeric")], 1, toString2),
  Survival = apply(df["Surv"], 1, toString2)
)
names(df_classes)[3:5] <- paste0(names(df_classes)[3:5],
                                 footnote_marker_number(1:3))

kable(df_classes,
      caption = "Package-supplied model constructor functions and supported response variable types.",
      align = c("l", "l", "c", "c", "c"), row.names = FALSE,
      escape = FALSE) %>%
  kable_styling(c("striped", "condensed"), full_width = FALSE,
                position = "center") %>%
  add_header_above(c(" " = 1, " " = 1, "Response Variable Types" = 3)) %>%
  footnote(number = c("b = binary factor, f = factor, o = ordered factor",
                      "m = matrix, n = numeric",
                      "S = Surv"))


## ----reference_metrics, echo=FALSE----------------------------------------------
library(MachineShop)

f <- function(x) {
  types <- x$response_types
  is_type <- function(observed, predicted) {
    any(types$observed == observed & types$predicted == predicted)
  }
  categorical <- if (is_type("factor", "matrix")) "f" else
    if (is_type("factor", "numeric")) "b" else NULL
  if (is_type("ordered", "ordered")) categorical <- c(categorical, "o")
  continuous <- NULL
  if (is_type("matrix", "matrix")) continuous <- "m"
  if (is_type("numeric", "numeric")) continuous <- c(continuous, "n")
  survival <- NULL
  if (any(mapply(is_type, "Surv", c("numeric", "SurvEvents", "SurvProbs")))) {
    survival <- "S"
  }
  data.frame(
    Label = x$label,
    Categorical = toString(categorical),
    Continuous = toString(continuous),
    Survival = toString(survival)
  )
}

info <- metricinfo()
df <- cbind("Function" = rdoc_url(names(info), "metrics"),
            do.call(rbind, lapply(info, f)))
names(df)[3:5] <- paste0(names(df)[3:5], footnote_marker_number(1:3))

kable(df,
      caption = "Package-supplied performance metric functions and supported response variable types.",
      align = c("l", "l", "c", "c", "c"), row.names = FALSE,
      escape = FALSE) %>%
  kable_styling(c("striped", "condensed"), full_width = FALSE,
                position = "center") %>%
  add_header_above(c(" " = 1, " " = 1, "Response Variable Types" = 3)) %>%
  footnote(number = c("b = binary factor, f = factor, o = ordered factor",
                      "m = matrix, n = numeric",
                      "S = Surv"))