A tornado plot is a visualization of the range of outputs expected from a variety of inputs, or alternatively, the sensitivity of the output to the range of inputs. Tornado plots have a number of features:
An importance plot attempts to display the relative impact of each variable on the model fit. Traditionally, for linear models, the concept of importance was expressed as the percentage of total response variable variance that is explained by each variable, either alone, or in the presence of the other variables in the model. Each model type can have different measures of importance. In this package, the following methods are used:
- `caret` package models: The definition of variable importance for each model in the `caret` package is taken from the `varImp` method. See the `caret` package for specifics.

A few standard data sets are used in these examples:
- `mtcars`: Motor Trend Car Road Tests - predict Miles Per Gallon (mpg) based on other car factors
- `mtcars_w_factors`: the `mtcars` dataset with a few numeric variables replaced with categorical or factor variables where appropriate, e.g. Automatic vs Manual transmission
- `survival::ovarian`: Ovarian Cancer Survival Data - predict survival time
- `survival::bladder`: Bladder Cancer recurrences - risk of bladder cancer recurrence
- `iris`: Edgar Anderson’s Iris Data - predict the type of
iris from measurements

# Build a copy of mtcars in which selected numeric columns are converted to
# factors; the original mtcars data frame is left untouched.
mtcars_w_factors <- mtcars
# Automatic vs Manual
mtcars_w_factors$am <- factor(mtcars$am)
# V or Straight cylinder arrangement
mtcars_w_factors$vs <- factor(mtcars$vs)
# number of cylinders
mtcars_w_factors$cyl <- factor(mtcars$cyl)
# number of forward gears
mtcars_w_factors$gear <- factor(mtcars$gear)
# number of carburetors
mtcars_w_factors$carb <- factor(mtcars$carb)
# Linear model of mpg on cyl, wt, hp and all of their interactions.
lm1 <- lm(mpg ~ cyl * wt * hp, data = mtcars)

# Tornado plot of lm1 with type = "PercentChange" and alpha = 0.10.
torn1 <- tornado::tornado(lm1, type = "PercentChange", alpha = 0.10)
plot(torn1, xlabel = "MPG", geom_bar_control = list(width = 0.4))

# Same model, type = "ranges" (no alpha supplied).
torn2 <- tornado::tornado(lm1, type = "ranges")
plot(torn2, xlabel = "MPG", geom_bar_control = list(width = 0.4))

# Same model, type = "percentiles" with alpha = 0.05.
torn3 <- tornado::tornado(lm1, type = "percentiles", alpha = 0.05)
plot(torn3, xlabel = "MPG", geom_bar_control = list(width = 0.4))
# Additive linear model fitted on the data set that has factor columns
# (mtcars_w_factors), then plotted with type = "percentiles", alpha = 0.05.
lm4 <- lm(mpg ~ cyl + wt + hp + vs, data = mtcars_w_factors)
torn4 <- tornado::tornado(lm4, type = "percentiles", alpha = 0.05)
plot(torn4, xlabel = "MPG", geom_bar_control = list(width = 0.4))
# Pair each model variable name ("old") with a display name ("new") and hand
# the pairing to tornado() through its `dict` argument.
dict <- list(
  old = c("cyl", "wt", "hp", "vs"),
  new = c("Cylinders", "Weight", "Horsepower", "V_or_Straight")
)
torn5 <- tornado::tornado(lm4, type = "percentiles", alpha = 0.05, dict = dict)
plot(torn5, xlabel = "MPG", geom_bar_control = list(width = 0.4))
# NOTE(review): `dict` is (re)defined here but is not passed to tornado()
# below -- confirm whether `dict = dict` was intended.
dict <- list(
  old = c("cyl", "wt", "hp", "vs"),
  new = c("Cylinders", "Weight", "Horsepower", "V_or_Straight")
)
torn5 <- tornado::tornado(lm4, type = "percentiles", alpha = 0.05)

# Plot with custom bar width, sensitivity colours and point styling.
plot(
  torn5,
  xlabel = "MPG",
  geom_bar_control = list(width = 0.4),
  sensitivity_colors = c("#FC8D62", "#66C2A5"),
  geom_point_control = list(size = 3, fill = "purple", col = "purple")
)
Notes:
# With plot = FALSE the plot method's result is captured in `g` so that
# further ggplot2 layers can be added before drawing it explicitly.
g <- plot(
  torn5,
  plot = FALSE,
  xlabel = "MPG",
  geom_bar_control = list(width = 0.4),
  sensitivity_colors = c("#FC8D62", "#66C2A5"),
  geom_point_control = list(size = 3, fill = "purple", col = "purple")
)
g <- g + ggtitle("Test Plot")
# NOTE(review): the warning recorded below presumably comes from `lwd = 2`;
# confirm before switching to `linewidth`.
g <- g + geom_hline(yintercept = 0, col = "black", lwd = 2)
#> Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
#> ℹ Please use `linewidth` instead.
plot(g)
Predict if an engine is a V or Straight given covariates
# Logistic regression (binomial family, logit link) of vs on wt, disp and cyl,
# followed by a tornado plot with type = "ranges" and alpha = 0.10.
glm1 <- glm(
  vs ~ wt + disp + cyl,
  data = mtcars,
  family = binomial(link = "logit")
)
torn1 <- tornado::tornado(glm1, type = "ranges", alpha = 0.10)
plot(
  torn1,
  xlabel = "V or Straight Engine",
  geom_bar_control = list(width = 0.4)
)
# Weibull survival regression on survival::ovarian with scale fixed at 1.
survreg1 <- survival::survreg(
  survival::Surv(futime, fustat) ~ ecog.ps + rx + age + resid.ds,
  survival::ovarian,
  dist = "weibull",
  scale = 1
)

# The data set is supplied again through `modeldata` for the tornado call.
torn1 <- tornado::tornado(
  survreg1,
  modeldata = survival::ovarian,
  type = "PercentChange",
  alpha = 0.10
)
plot(torn1, xlabel = "Survival Time", geom_bar_control = list(width = 0.4))
# Cox proportional hazards model on survival::bladder.
coxph1 <- survival::coxph(
  survival::Surv(stop, event) ~ rx + size + number,
  survival::bladder
)

# The data set is supplied again through `modeldata` for the tornado call.
torn1 <- tornado::tornado(
  coxph1,
  modeldata = survival::bladder,
  type = "PercentChange",
  alpha = 0.10
)
plot(torn1, xlabel = "Risk", geom_bar_control = list(width = 0.4))
# Run the glmnet example only when `has_glmnet` is TRUE; otherwise print a
# notice. `} else {` is kept on one line so the chunk parses at top level.
if (has_glmnet) {
  # Build the design matrix for the interaction formula by hand.
  form <- formula(mpg ~ cyl * wt * hp)
  mf <- model.frame(form, data = mtcars)
  mm <- model.matrix(form, mf)
  gtest <- glmnet::cv.glmnet(x = mm, y = mtcars$mpg, family = "gaussian")

  # tornado() is given the raw data and the formula in addition to the fit.
  torn <- tornado::tornado(
    gtest,
    modeldata = mtcars,
    form = formula(mpg ~ cyl * wt * hp),
    s = "lambda.1se",
    type = "PercentChange",
    alpha = 0.10
  )
  plot(torn, xlabel = "MPG", geom_bar_control = list(width = 0.4))
} else {
  print("glmnet is not available for vignette rendering")
}
# Run the caret example only when `has_caret` is TRUE; otherwise print a
# notice.
if (has_caret) {
  # Train with method = "rf" on every column except mpg, with mpg as outcome.
  gtest <- caret::train(
    x = subset(mtcars_w_factors, select = -mpg),
    y = mtcars_w_factors$mpg,
    method = "rf"
  )
  torn <- tornado::tornado(gtest, type = "percentiles", alpha = 0.10)
  plot(torn, xlabel = "MPG")
} else {
  print("caret is not available for vignette rendering")
}
The `plot` method can also return a ggplot object
# Run the caret classifier example only when `has_caret` is TRUE; otherwise
# print a notice.
if (has_caret) {
  # Train with method = "rf" to predict Species from the other iris columns.
  gtest <- caret::train(
    x = subset(iris, select = -Species),
    y = iris$Species,
    method = "rf"
  )

  # class_number = 1; the axis label names this class as Setosa.
  torn <- tornado::tornado(
    gtest,
    type = "percentiles",
    alpha = 0.10,
    class_number = 1
  )
  g <- plot(
    torn,
    plot = FALSE,
    xlabel = "Probability of the Setosa Species"
  )
  g <- g +
    ggtitle("Classifier caret::train 'rf', 10th to 90th perc. of each var.")
  plot(g)

  # class_number = 2; the axis label names this class as versicolor.
  torn <- tornado::tornado(
    gtest,
    type = "percentiles",
    alpha = 0.10,
    class_number = 2
  )
  g <- plot(
    torn,
    plot = FALSE,
    xlabel = "Probability of the versicolor Species"
  )
  plot(g)
} else {
  print("caret is not available for vignette rendering")
}
# Full linear model and an intercept-only model; both are passed to
# importance().
gtest <- lm(mpg ~ cyl * wt * hp + gear + carb, data = mtcars)
gtestreduced <- lm(mpg ~ 1, data = mtcars)
imp <- tornado::importance(gtest, gtestreduced)
plot(imp)
# Pair variable names ("old") with display names ("new") and pass the pairing
# to importance() via `dict`; the plot colours are overridden as well.
dict <- list(
  old = c("cyl", "wt", "hp", "vs", "gear", "carb"),
  new = c(
    "Cylinders", "Weight", "Horsepower", "V_or_Straight",
    "Num Gears", "Num Carbs"
  )
)
imp <- tornado::importance(gtest, gtestreduced, dict = dict)
plot(
  imp,
  col_importance_alone = "#8DD3C7",
  col_importance_cumulative = "#FFFFB3"
)
# Logistic regression of vs on wt, disp and gear, with an intercept-only
# logistic model; both are passed to importance().
gtest <- glm(
  vs ~ wt + disp + gear,
  data = mtcars,
  family = binomial(link = "logit")
)
gtestreduced <- glm(vs ~ 1, data = mtcars, family = binomial(link = "logit"))
imp <- tornado::importance(gtest, gtestreduced)
plot(imp)
# Weibull survival regression with an ecog.ps-by-rx interaction plus age.
model_final <- survival::survreg(
  survival::Surv(futime, fustat) ~ ecog.ps * rx + age,
  data = survival::ovarian,
  dist = "weibull"
)

# Here importance() receives the data set as its second argument, with
# nperm = 100.
imp <- tornado::importance(model_final, survival::ovarian, nperm = 100)
plot(imp, geom_bar_control = list(width = 0.4, fill = "blue"))
The number of variables plotted can also be controlled
# Run the caret importance example only when `has_caret` is TRUE; otherwise
# print a notice.
if (has_caret) {
  # Train with method = "rf" on every column except mpg, with mpg as outcome.
  gtest <- caret::train(
    x = subset(mtcars_w_factors, select = -mpg),
    y = mtcars_w_factors$mpg,
    method = "rf"
  )
  imp <- tornado::importance(gtest)
  # nvar = 7 is passed to the plot method.
  plot(imp, nvar = 7)
} else {
  print("caret is not available for vignette rendering")
}
The `plot` method can also return a ggplot object
# Run the caret classifier importance example only when `has_caret` is TRUE;
# otherwise print a notice.
if (has_caret) {
  # Train with method = "rf" to predict Species from the other iris columns.
  gtest <- caret::train(
    x = subset(iris, select = -Species),
    y = iris$Species,
    method = "rf"
  )
  imp <- tornado::importance(gtest)

  # Capture the plot with plot = FALSE so a title can be added before drawing.
  g <- plot(imp, plot = FALSE)
  g <- g +
    ggtitle("Classifier caret::train randomforest: variable importance")
  plot(g)
} else {
  print("caret is not available for vignette rendering")
}