library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.3 ✓ dplyr 1.0.2
## ✓ tidyr 1.1.1 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
df <- readr::read_rds("df.rds")
step_2_b_df <- readr::read_rds("step_2_b_df.rds")
step_2_a_df <- readr::read_rds("step_2_a_df.rds")
iii_models <- readr::read_rds("iii_models.rds")
iv_models <- readr::read_rds("iv_models.rds")
This R Markdown file tackles part v: evaluating the results from parts iii and iv, identifying the most important variables in the best performing models, visualizing the probability of failure as a function of those variables, and determining, from the visualizations, the input settings that minimize the failure probability.
Comparing models with and without response_1 as a feature
First, compare the results of the best performing models from parts iii and iv, based on AUC.
results <- resamples(list(iii_rf = iii_models$iii_rf,
iii_xgb = iii_models$iii_xgb,
iv_rf = iv_models$iv_rf,
iv_xgb = iv_models$iv_xgb))
Then we visually compare the performance metrics.
dotplot(results)
dotplot(results, metric = "ROC")
dotplot(results, metric = "Sens")
dotplot(results, metric = "Spec")
Based on AUC, the models that include response_1 as a feature predict outcome_2 better, as seen from the higher ROC values for both iii_rf and iii_xgb.
calc_accuracy <- function(model) {
  # confusionMatrix.train() averages the confusion matrices across
  # resamples; its cells are percentages, so the diagonal sum divided
  # by 100 gives the estimated accuracy
  cf <- confusionMatrix.train(model)
  (cf$table[1, 1] + cf$table[2, 2]) / 100
}
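Since the diagonal of the averaged confusion matrix holds the correctly classified percentages, summing diag() generalizes the same computation beyond two classes. A sketch of that variant (calc_accuracy_diag is just an illustrative name):
# Same accuracy estimate, written to work for any number of classes
calc_accuracy_diag <- function(model) {
  sum(diag(confusionMatrix.train(model)$table)) / 100
}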
models <- list(iii_rf = iii_models$iii_rf,
iii_xgb = iii_models$iii_xgb,
iv_rf = iv_models$iv_rf,
iv_xgb = iv_models$iv_xgb)
accuracy_results <- purrr::map_dbl(models, calc_accuracy)
accuracy_results %>% sort(decreasing = TRUE)
## iv_rf iii_xgb iii_rf iv_xgb
## 0.7335320 0.7318430 0.7303527 0.7223050
Based on accuracy, the best performing model is iv_rf, which excludes response_1: dropping response_1 slightly improves the random forest (iv_rf vs. iii_rf) but slightly worsens XGBoost (iv_xgb vs. iii_xgb).
Variable importance: models with response_1 as a feature
Plot variable importance based on the rf model.
plot(varImp(iii_models$iii_rf))
Plot variable importance based on the xgb model.
plot(varImp(iii_models$iii_xgb))
x07, x08, and response_1 seem to be the three most important inputs.
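The rankings can also be extracted programmatically instead of read off the plots. A minimal sketch, assuming the usual caret varImp() structure for these model types (a data frame of scores in an Overall column, with variables as row names):
# Top five inputs by importance score for the iii_rf model
varImp(iii_models$iii_rf)$importance %>%
  tibble::rownames_to_column("variable") %>%
  arrange(desc(Overall)) %>%
  head(5)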
Variable importance: models without response_1 as a feature
Plot variable importance based on the rf model.
plot(varImp(iv_models$iv_rf))
Plot variable importance based on the xgb model.
plot(varImp(iv_models$iv_xgb))
x07 and x08 seem to be the two most important inputs.
In general, x07 and x08 appear to be the most important variables. response_1 is also important for the models that include it as a feature.
Fail probability against the most important variables
library(pdp)
##
## Attaching package: 'pdp'
## The following object is masked from 'package:purrr':
##
## partial
# Custom prediction function wrapper (kept for reference; not needed
# here, since pdp's partial() supports caret train objects directly)
# pdp_pred <- function(object, newdata) {
#   results <- mean(as.vector(predict(object, newdata)))
#   return(results)
# }
# Compute partial dependence values
pd_values_x07 <- partial(
iii_models$iii_rf,
train = step_2_b_df,
pred.var = "x07"
)
head(pd_values_x07) # take a peek
## x07 yhat
## 1 140.3138 0.09749985
## 2 140.4843 0.09718262
## 3 140.6547 0.08530345
## 4 140.8251 0.08752993
## 5 140.9955 0.18662981
## 6 141.1660 0.20324699
# Partial dependence plot
autoplot(pd_values_x07)
## Warning: Use of `object[[1L]]` is discouraged. Use `.data[[1L]]` instead.
## Warning: Use of `object[["yhat"]]` is discouraged. Use `.data[["yhat"]]`
## instead.
# Compute partial dependence values
pd_values_x08 <- partial(
iii_models$iii_rf,
train = step_2_b_df,
pred.var = "x08"
)
head(pd_values_x08) # take a peek
## x08 yhat
## 1 74.03725 0.1086069
## 2 74.21489 0.1235168
## 3 74.39253 0.1235168
## 4 74.57017 0.1264428
## 5 74.74781 0.1307514
## 6 74.92545 0.1931101
# Partial dependence plot
autoplot(pd_values_x08)
## Warning: Use of `object[[1L]]` is discouraged. Use `.data[[1L]]` instead.
## Warning: Use of `object[["yhat"]]` is discouraged. Use `.data[["yhat"]]`
## instead.
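Since x07 and x08 are both highly important, their joint effect on the Fail probability may also be worth inspecting. A sketch of a two-variable partial dependence surface with the same model and training set; chull = TRUE restricts the grid to the convex hull of the observed (x07, x08) pairs to avoid extrapolating:
# Compute joint partial dependence over x07 and x08
pd_values_x07_x08 <- partial(
  iii_models$iii_rf,
  train = step_2_b_df,
  pred.var = c("x07", "x08"),
  chull = TRUE
)
# Two-dimensional partial dependence plot
autoplot(pd_values_x07_x08)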
Input settings minimizing Fail probability
Find the optimal settings of x07 and x08 that minimize the Fail probability, based on the partial dependence calculations.
pd_values_x07 %>% as_tibble() %>% filter(yhat == min(yhat)) %>% select(x07)
## # A tibble: 1 x 1
## x07
## <dbl>
## 1 144.
pd_values_x08 %>% as_tibble() %>% filter(yhat == min(yhat)) %>% select(x08)
## # A tibble: 1 x 1
## x08
## <dbl>
## 1 78.3
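The same lookup can be wrapped in a small helper so it applies to any partial dependence result. A hypothetical convenience function (optimal_setting is an illustrative name, not from any package), using dplyr's slice_min():
# Hypothetical helper: grid value(s) of `var` where yhat is lowest
optimal_setting <- function(pd, var) {
  pd %>%
    as_tibble() %>%
    slice_min(yhat, n = 1) %>%
    pull({{ var }})
}
optimal_setting(pd_values_x07, x07)
optimal_setting(pd_values_x08, x08)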