The multGLM function, now included in the fuzzySim package (Barbosa, 2015), automatically calculates binomial GLMs for one or more species (or other binary variables) in a data frame. Rather than forcing all variables into the models, the function can perform stepwise variable selection (as it does by default), starting either from the null model (the default, so selection starts forward) or from the full model (so selection starts backward), and using either AIC (Akaike’s Information Criterion) or BIC (Bayesian Information Criterion, also known as the Schwarz criterion, SBC or SBIC) as the selection criterion. Alternatively, or in addition, it can perform stepwise removal of non-significant variables from the models with the modelTrim function. There is also an optional preliminary selection of variables with a significant/informative bivariate relationship with the response, based on the false discovery rate (other correction methods are also available); this requires the FDR function, but note that some variables can be significant and informative in a multivariate model even if they would not have been selected by FDR. A preliminary selection of non-correlated variables is also possible, which requires the corSelect function, available in fuzzySim since version 1.5. If you want favourability (F) to be calculated (as it is by default), you need the Fav function as well. By default, all data are used for model training, but you can define an optional test.sample to be reserved for model testing afterwards.
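For example, assuming a data frame mydata whose first column is a row identifier, whose second and third columns hold binary species records, and whose remaining columns hold the candidate predictors (all names and column positions below are hypothetical), a minimal call could look like this:

library(fuzzySim)  # provides multGLM and its helpers (FDR, corSelect, modelTrim, Fav)
mods <- multGLM(data = mydata, sp.cols = 2:3, var.cols = 4:10, id.col = 1,
    step = TRUE, FDR = TRUE, trim = TRUE, Favourability = TRUE)
head(mods$predictions)     # predicted probability (P) and favourability (F) per species
summary(mods$models[[1]])  # the GLM for the first species
mods$variables             # variables retained in each final model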
multGLM <- function(data, sp.cols, var.cols, id.col = NULL, family = "binomial",
    test.sample = 0, FDR = FALSE, correction = "fdr", corSelect = FALSE,
    cor.thresh = 0.8, step = TRUE, trace = 0, start = "null.model",
    direction = "both", select = "AIC", trim = TRUE, Y.prediction = FALSE,
    P.prediction = TRUE, Favourability = TRUE, group.preds = TRUE,
    TSA = FALSE, coord.cols = NULL, degree = 3, verbosity = 2, ...)
{
# version 5.1 (6 Jan 2020)
start.time <- Sys.time()
on.exit(timer(start.time))
input.ncol <- ncol(data)
input.names <- colnames(data)
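# sanity checks on the input arguments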
stopifnot(
    as.vector(na.omit(as.matrix(data[, sp.cols]))) %in% c(0, 1),
    all(sp.cols %in% (1:input.ncol)) || all(sp.cols %in% input.names),
    all(var.cols %in% (1:input.ncol)) || all(var.cols %in% input.names),
    is.null(id.col) || (length(id.col) == 1 && (id.col %in% (1:input.ncol) || id.col %in% input.names)),
    is.null(coord.cols) || all(coord.cols %in% (1:input.ncol)) || all(coord.cols %in% input.names),
    family == "binomial",
    test.sample >= 0 | test.sample == "Huberty",
    length(test.sample) == 1 | (is.integer(test.sample) & test.sample > 0),
    length(test.sample) < nrow(data),
    is.logical(FDR),
    is.logical(step),
    start %in% c("null.model", "full.model"),
    direction %in% c("both", "backward", "forward"),
    select %in% c("AIC", "BIC"),
    is.logical(Y.prediction),
    is.logical(P.prediction),
    is.logical(Favourability),
    is.logical(group.preds),
    is.logical(trim),
    is.logical(TSA),
    !Favourability | exists("Fav"),
    !trim | exists("modelTrim")
)
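# define the test sample (if any); the remaining observations form the training sample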
data$sample <- "train"
n <- nrow(data)
data.row <- 1:n
test.sample.input <- test.sample
if (length(test.sample) == 1) {
if (test.sample == "Huberty") {
if (!FDR & !step & !trim) {
test.sample <- percentTestData(length(var.cols))/100
n.test <- round(n * test.sample)
if (verbosity > 0)
message("Following Huberty's rule, ", test.sample *
100, "% of observations\n (", n.test,
" out of ", n, ") set aside for model testing; ",
n - n.test, " observations used for model training.")
}
else stop("Sorry, Huberty's rule cannot be used with 'FDR', 'step' or 'trim',\n as these make the number of variables vary.\n Set these 3 parameters to FALSE,\n or use a different 'test.sample' option.")
}
else if (test.sample == 0) {
if (verbosity > 0)
message("All ", n, " observations used for model training;\n none reserved for model testing.")
n.test <- 0
}
else if (test.sample < 1) {
n.test <- round(n * test.sample)
if (verbosity > 0)
message(test.sample * 100, "% of observations (",
n.test, " out of ", n, ") set aside for model testing; ",
n - n.test, " observations used for model training.")
}
else if (test.sample >= 1) {
n.test <- test.sample
if (verbosity > 0)
message(n.test, " (out of ", n, ") observations set aside for model testing; ",
n - n.test, " observations used for model training.")
}
test.sample <- sample(data.row, size = n.test, replace = FALSE)
}
else if (length(test.sample) > 1) {
n.test <- length(test.sample)
if (verbosity > 0)
message(n.test, " (out of ", n, ") observations set aside for model testing; ",
n - n.test, " observations used for model training.")
}
data$sample[data.row %in% test.sample] <- "test"
train.data <- data[data$sample == "train", ]
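# favourability applies only to binomial models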
if (Favourability) {
if (family != "binomial") {
Favourability <- FALSE
warning("Favourability is only applicable to binomial responses,\n so it could not be calculated")
}
}
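# convert column names (if provided) to numeric column indices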
if (!is.numeric(sp.cols))
sp.cols <- which(colnames(data) %in% sp.cols)
if (!is.numeric(var.cols))
var.cols <- which(colnames(data) %in% var.cols)
if (!is.null(coord.cols) && !is.numeric(coord.cols))
coord.cols <- which(colnames(data) %in% coord.cols)
if (!is.null(id.col) && !is.numeric(id.col))
id.col <- which(colnames(data) %in% id.col)
keeP <- P.prediction
if (Favourability)
P.prediction <- TRUE
n.models <- length(sp.cols)
n.preds <- n.models * (Y.prediction + P.prediction + Favourability)
models <- vector("list", n.models)
predictions <- matrix(NA, nrow = nrow(data), ncol = n.preds)
colnames(predictions) <- rep("", n.preds)
model.count <- 0
pred.count <- 0
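# build one model for each response (species) column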
for (s in sp.cols) {
model.count <- model.count + 1
response <- colnames(train.data)[s]
if (verbosity > 0)
cat("\n\n=> Building model ", model.count, " of ",
n.models, " (", response, ")...\n\n", sep = "")
if (verbosity > 1)
cat(length(var.cols), "input predictor variable(s)\n\n")
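# if requested, add a spatial trend variable (computed with multTSA) to the candidate predictors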
if (TSA) {
if (verbosity > 1)
cat("...plus the spatial trend variable\n\n")
tsa <- suppressMessages(multTSA(data = data, sp.cols = s,
coord.cols = coord.cols, degree = degree, type = "Y"))
tsa_name <- paste("sptrend", response, sep = "_")
data <- data.frame(data, tsa[, ncol(tsa)])
names(data)[ncol(data)] <- tsa_name
train.data <- data.frame(train.data, tsa[which(data$sample ==
"train"), ncol(tsa)])
names(train.data)[ncol(train.data)] <- tsa_name
var.cols <- c(var.cols, ncol(train.data))
}
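# optional pre-selection of variables based on the false discovery rate (FDR function)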
if (FDR) {
fdr <- FDR(data = train.data, sp.cols = s, var.cols = var.cols,
correction = correction, verbose = FALSE)
if (nrow(fdr$select) == 0) {
message(paste0("No variables passed the FDR test (so no variables included in the model)\n for '",
response, "'. Consider using 'FDR = FALSE' or choosing a less stringent 'correction' procedure."))
}
if (verbosity > 1)
cat(length(var.cols) - nrow(fdr$select), "variable(s) excluded by 'FDR' function\n",
paste(row.names(fdr$exclude), collapse = ", "),
"\n\n")
sel.var.cols <- which(colnames(train.data) %in% rownames(fdr$select))
}
else sel.var.cols <- var.cols
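# optional exclusion of highly correlated variables (corSelect function)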
if (length(sel.var.cols) > 1 && corSelect == TRUE) {
corselect <- suppressMessages(corSelect(data = train.data,
sp.cols = s, var.cols = sel.var.cols, cor.thresh = cor.thresh,
use = "pairwise.complete.obs"))
corsel.var.cols <- corselect$selected.var.cols
if (verbosity > 1)
cat(length(sel.var.cols) - length(corsel.var.cols),
"variable(s) excluded by 'corSelect' function\n",
corselect$excluded.vars, "\n\n")
sel.var.cols <- corsel.var.cols
}
if (length(sel.var.cols) == 0)
model.vars <- 1
else model.vars <- colnames(train.data)[sel.var.cols]
model.formula <- with(train.data, as.formula(paste(response,
"~", paste(model.vars, collapse = "+"))))
model.expr <- expression(glm(model.formula, family = binomial))
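# stepwise selection with step(), penalizing by k = 2 (AIC) or k = log(n) (BIC)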
if (step && length(sel.var.cols) > 0) {
n.vars.start <- length(sel.var.cols)
if (select == "AIC")
K <- 2
else if (select == "BIC")
K <- log(n)
if (start == "full.model") {
model <- step(eval(model.expr), direction = direction,
trace = trace, k = K)
}
else if (start == "null.model") {
model.scope <- model.formula[-2]
null.formula <- as.formula(paste(response, "~", 1))
model <- step(glm(null.formula, family = binomial,
data = train.data), direction = direction,
scope = model.scope, trace = trace, k = K)
}
else stop("'start' must be either 'full.model' or 'null.model'")
n.vars.step <- length(model$coefficients) - 1
excluded.vars <- setdiff(colnames(data[, sel.var.cols]),
names(model$coefficients)[-1])
if (verbosity > 1)
cat(n.vars.start - n.vars.step, "variable(s) excluded by 'step' function\n",
paste(excluded.vars, collapse = ", "), "\n\n")
}
else model <- eval(model.expr, envir = train.data)
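# optional removal of non-significant variables (modelTrim function)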
if (trim && length(sel.var.cols) > 0) {
n.vars.start <- length(model$coefficients) - 1
names.vars.start <- names(model$coefficients)[-1]
model <- suppressMessages(modelTrim(model, ...))
n.vars.trim <- length(model$coefficients) - 1
excluded.vars <- setdiff(names.vars.start, names(model$coefficients)[-1])
if (verbosity > 1)
cat(n.vars.start - n.vars.trim, "variable(s) excluded by 'modelTrim' function\n",
paste(excluded.vars, collapse = ", "), "\n\n")
}
if (step || trim) {
sel.var.names <- names(model$coefficients)[-1]
if (verbosity > 1)
cat(length(sel.var.names), "variable(s) INCLUDED IN THE FINAL MODEL\n",
paste(sel.var.names, collapse = ", "))
}
if (verbosity > 1)
cat("\n\n")
models[[model.count]] <- model
names(models)[[model.count]] <- response
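# compute the requested predictions: Y (linear predictor), P (probability), F (favourability)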
if (Y.prediction) {
pred.count <- pred.count + 1
colnames(predictions)[pred.count] <- paste(response,
"Y", sep = "_")
predictions[, pred.count] <- predict(model, data)
}
if (P.prediction) {
pred.count <- pred.count + 1
colnames(predictions)[pred.count] <- paste(response,
"P", sep = "_")
predictions[, pred.count] <- predict(model, data,
type = "response")
}
if (Favourability) {
n1 <- sum(train.data[, s] == 1, na.rm = TRUE)
n0 <- sum(train.data[, s] == 0, na.rm = TRUE)
pred.count <- pred.count + 1
predictions[, pred.count] <- Fav(n1n0 = c(n1, n0),
pred = predictions[, pred.count - 1])
colnames(predictions)[pred.count] <- paste(response,
"F", sep = "_")
}
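# remove the temporary spatial trend column before modelling the next species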
if (TSA) {
train.data <- train.data[, -ncol(train.data)]
data <- data[, -ncol(data)]
var.cols <- var.cols[-length(var.cols)]
}
}
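# drop the P columns if probabilities were computed only as a step towards favourability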
if (P.prediction && !keeP) {
n.char <- nchar(colnames(predictions))
pred.suffix <- substr(colnames(predictions), n.char -
1, n.char)
P.cols <- grep("_P", pred.suffix)
predictions <- predictions[, -P.cols]
}
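# assemble the output predictions data frame, optionally grouping columns by prediction type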
n.pred.types <- sum(Y.prediction, keeP, Favourability)
if (n.pred.types == 0) {
predictions <- data.frame()
}
else {
predictions <- data.frame(data[, id.col], sample = data[,
"sample"], predictions)
if (n.pred.types == 1 || length(sp.cols) == 1)
group.preds <- FALSE
if (group.preds) {
first.pred.col <- ifelse(is.null(id.col), 2, 3)
pred1.cols <- seq(first.pred.col, ncol(predictions),
by = n.pred.types)
pred2.cols <- seq(first.pred.col + 1, ncol(predictions),
by = n.pred.types)
pred3.cols <- NULL
if (n.pred.types == 3) {
pred3.cols <- seq(first.pred.col + 2, ncol(predictions),
by = n.pred.types)
}
predictions <- data.frame(data[, id.col], sample = data$sample,
predictions[, pred1.cols], predictions[, pred2.cols],
predictions[, pred3.cols])
}
if (!is.null(id.col)) {
colnames(predictions)[1] <- input.names[id.col]
}
}
if (test.sample.input == 0) {
predictions <- predictions[, -match("sample", colnames(predictions))]
}
if (!is.null(id.col)) {
colnames(predictions)[1] <- colnames(data)[id.col]
}
selected_variables <- lapply(models, function(x) names(x$model)[-1])
message("Finished!")
return(list(predictions = predictions, models = models, variables = selected_variables))
}
REFERENCES:
Barbosa A.M. (2015) fuzzySim: applying fuzzy logic to binary similarity indices in ecology. Methods in Ecology and Evolution, 6: 853-858