library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.3 ✓ dplyr 1.0.2
## ✓ tidyr 1.1.1 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
df <- readr::read_rds("df.rds")
step_2_b_df <- readr::read_rds("step_2_b_df.rds")
step_2_a_df <- readr::read_rds("step_2_a_df.rds")
iii_models <- readr::read_rds("iii_models.rds")
iv_models <- readr::read_rds("iv_models.rds")
This R Markdown file tackles part v: evaluating the results from parts iii and iv, identifying the most important variables in the best performing models, visualizing the probability of failure as a function of those variables, and determining, from the visualizations, the input settings that minimize the failure probability.
response_1 as a feature

First, compare the results of the best performing models from parts iii and iv, based on AUC.
results <- resamples(list(iii_rf = iii_models$iii_rf,
                          iii_xgb = iii_models$iii_xgb,
                          iv_rf = iv_models$iv_rf,
                          iv_xgb = iv_models$iv_xgb))
Then we visually compare the performance metrics.
dotplot(results)
dotplot(results, metric = "ROC")
dotplot(results, metric = "Sens")
dotplot(results, metric = "Spec")
Based on AUC, the models that include response_1 as a feature (iii_rf and iii_xgb) perform better at predicting outcome_2, as seen from their higher ROC values in the dot plots.
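The numbers behind these dot plots can be pulled straight from the resamples object; a minimal sketch using caret's summary method:

# Resampled ROC/Sens/Spec summaries for all four models
summary(results)
# Mean resampled AUC only, best model first
summary(results)$statistics$ROC[, "Mean"] %>% sort(decreasing = TRUE)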
calc_accuracy <- function(model) {
  # confusionMatrix.train() averages the confusion matrix across resamples and
  # reports each cell as a percentage of all held-out predictions, so summing
  # the diagonal and dividing by 100 gives cross-validated accuracy as a proportion
  cf <- confusionMatrix.train(model)
  (cf$table[1, 1] + cf$table[2, 2]) / 100
}
models <- list(iii_rf = iii_models$iii_rf,
               iii_xgb = iii_models$iii_xgb,
               iv_rf = iv_models$iv_rf,
               iv_xgb = iv_models$iv_xgb)
accuracy_results <- purrr::map_dbl(models, calc_accuracy)
accuracy_results %>% sort(decreasing = TRUE)
## iv_rf iii_xgb iii_rf iv_xgb
## 0.7335320 0.7318430 0.7303527 0.7223050
Based on accuracy, excluding response_1 works slightly better for the random forest (iv_rf edges out iii_rf and is the best model overall), while including response_1 works slightly better for xgboost (iii_xgb outperforms iv_xgb); the differences are small in all cases.
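As a rough cross-check, accuracy can also be reconstructed from the resampled sensitivity and specificity weighted by class prevalence, since accuracy = Sens x prevalence + Spec x (1 - prevalence). The sketch below assumes outcome_2 is the response column in step_2_b_df and that its first factor level is the event class caret used for Sens.

# Sketch: accuracy from Sens/Spec and class prevalence (assumptions above)
prev <- mean(step_2_b_df$outcome_2 == levels(step_2_b_df$outcome_2)[1])
stats <- summary(results)$statistics
stats$Sens[, "Mean"] * prev + stats$Spec[, "Mean"] * (1 - prev)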
With response_1 as a feature

Plot variable importance based on rf model.

plot(varImp(iii_models$iii_rf))
Plot variable importance based on xgb model.
plot(varImp(iii_models$iii_xgb))
x07, x08 and response_1 seem to be the three most important inputs.
Without response_1 as a feature

Plot variable importance based on rf model.

plot(varImp(iv_models$iv_rf))
Plot variable importance based on xgb model.
plot(varImp(iv_models$iv_xgb))
x07 and x08 seem to be the two most important inputs.
In general, inputs x07 and x08 seem to be the most important variables. response_1 is also important for the models that include it as a feature.
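The rankings can also be read off numerically instead of from the plots; a sketch, assuming varImp() returns a single Overall importance column for these model types:

# Sketch: top five inputs by caret's scaled importance for the part iii rf model
varImp(iii_models$iii_rf)$importance %>%
  rownames_to_column("input") %>%   # tibble helper, attached via tidyverse
  arrange(desc(Overall)) %>%        # "Overall" column name is an assumption
  head(5)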
Fail probability against most important variables

library(pdp)
##
## Attaching package: 'pdp'
## The following object is masked from 'package:purrr':
##
## partial
# A custom prediction function wrapper is not needed here:
# pdp::partial() supports caret train objects directly.
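If a wrapper were ever needed (for example, to average predicted class probabilities rather than class labels), a working version might look like the sketch below; the "Fail" level name is an assumption about how outcome_2 is coded.

# Sketch of a wrapper for pdp's pred.fun argument: mean predicted probability
# of the event class over newdata ("Fail" is an assumed factor level)
pdp_pred <- function(object, newdata) {
  mean(predict(object, newdata, type = "prob")[["Fail"]])
}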
# Compute partial dependence values
pd_values_x07 <- partial(
iii_models$iii_rf,
train = step_2_b_df,
pred.var = "x07"
)
head(pd_values_x07) # take a peek
## x07 yhat
## 1 140.3138 0.09749985
## 2 140.4843 0.09718262
## 3 140.6547 0.08530345
## 4 140.8251 0.08752993
## 5 140.9955 0.18662981
## 6 141.1660 0.20324699
# Partial dependence plot
autoplot(pd_values_x07)
## Warning: Use of `object[[1L]]` is discouraged. Use `.data[[1L]]` instead.
## Warning: Use of `object[["yhat"]]` is discouraged. Use `.data[["yhat"]]`
## instead.
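One caveat: for classification models, pdp::partial() returns yhat on a centered logit scale by default. To read the values directly as Fail probabilities, the probability scale can be requested explicitly; a sketch, assuming the first factor level of outcome_2 is the Fail class:

pd_prob_x07 <- partial(
  iii_models$iii_rf,
  train = step_2_b_df,
  pred.var = "x07",
  prob = TRUE,      # probability scale instead of the default centered logit
  which.class = 1L  # assumes the first factor level is the Fail class
)
autoplot(pd_prob_x07)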
# Compute partial dependence values
pd_values_x08 <- partial(
iii_models$iii_rf,
train = step_2_b_df,
pred.var = "x08"
)
head(pd_values_x08) # take a peek
## x08 yhat
## 1 74.03725 0.1086069
## 2 74.21489 0.1235168
## 3 74.39253 0.1235168
## 4 74.57017 0.1264428
## 5 74.74781 0.1307514
## 6 74.92545 0.1931101
# Partial dependence plot
autoplot(pd_values_x08)
## Warning: Use of `object[[1L]]` is discouraged. Use `.data[[1L]]` instead.
## Warning: Use of `object[["yhat"]]` is discouraged. Use `.data[["yhat"]]`
## instead.
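Because x07 and x08 are both important, a joint partial dependence surface is worth a look to see whether their effects interact; a sketch, with a coarse grid to keep the random forest computation manageable:

# Sketch: two-way partial dependence over x07 and x08
pd_values_x07_x08 <- partial(
  iii_models$iii_rf,
  train = step_2_b_df,
  pred.var = c("x07", "x08"),
  grid.resolution = 20
)
autoplot(pd_values_x07_x08)  # drawn as a filled surface by autoplot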
Fail probability

Find the optimal settings for x07 and x08 that minimize the Fail probability, based on the partial dependence calculations.
pd_values_x07 %>%
  as_tibble() %>%
  filter(yhat == min(yhat)) %>%
  select(x07)
## # A tibble: 1 x 1
## x07
## <dbl>
## 1 144.
pd_values_x08 %>%
  as_tibble() %>%
  filter(yhat == min(yhat)) %>%
  select(x08)
## # A tibble: 1 x 1
## x08
## <dbl>
## 1 78.3
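If the two-way surface sketched earlier has been computed, the jointly optimal (x07, x08) setting can be read off in one step:

# Sketch: grid point of the joint partial dependence surface with lowest yhat
pd_values_x07_x08 %>%
  as_tibble() %>%
  slice_min(yhat, n = 1)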