Select among correlated variables based on a given criterion

Correlations among variables are problematic in multivariate models, as they inflate the variance of the coefficients and may thus bias the interpretation of the effects of those variables on the response. One strategy to circumvent this problem is to eliminate a priori one variable from each pair of correlated variables, but it is not always straightforward to decide which one to keep. The corSelect function selects among correlated variables based either on their (stepwise) variance inflation factor (VIF), or on their relationship with the response variable: it builds a bivariate model of each individual variable against the response and chooses, from each pair of correlated variables, the one with the lowest p-value, AIC or BIC.

This function is now included in fuzzySim and depends on other functions in the package, such as FDR and multicol. It is also available as a new option within the multGLM function of the same package, so that multiple models can be built based on uncorrelated variables chosen appropriately for each species.

corSelect <- function(data, sp.cols = NULL, var.cols, cor.thresh = 0.8, select = "p.value", ...) {

# version 1.6 (6 Jul 2017)

if (length(sp.cols) > 1) stop ("Sorry, 'corSelect' is currently implemented for only one 'sp.col' at a time.")

univar.criteria <- c("VIF")
 bivar.criteria <- c("p.value", "AIC", "BIC")

if (!(select %in% c(univar.criteria, bivar.criteria))) stop ("Invalid 'select' criterion.")

if (!is.null(sp.cols) & select %in% bivar.criteria) {
 n.in <- nrow(data)
 data <- data[is.finite(data[ , sp.cols]), ]
 n.out <- nrow(data)
 if (n.out < n.in) warning (n.in - n.out, " observations removed due to missing data in 'sp.cols'; ", n.out, " observations actually evaluated.")
 }

cor.mat <- cor(data[ , var.cols], ...)
 cor.mat[upper.tri(cor.mat, diag = TRUE)] <- NA
 high.cor.mat <- bivar.mat <- numeric(0)

if (max(abs(cor.mat), na.rm = TRUE) >= cor.thresh) {
 high.cor.rowcol <- as.matrix(which(abs(cor.mat) >= cor.thresh, arr.ind = TRUE))
 high.cor.inds <- sort(unique(as.vector(high.cor.rowcol)))
 high.cor.mat <- data.frame(var1 = rownames(cor.mat)[high.cor.rowcol[ , "row"]], var2 = colnames(cor.mat)[high.cor.rowcol[ , "col"]])
 high.cor.mat$corr <- NULL
 for (r in 1:nrow(high.cor.mat)) high.cor.mat$corr[r] <- cor.mat[high.cor.rowcol[ ,"row"][r], high.cor.rowcol[ ,"col"][r]]
 high.cor.mat <- high.cor.mat[order(abs(high.cor.mat$corr), decreasing = TRUE), ]

if (is.null(sp.cols) & select %in% bivar.criteria) {
 message("Bivariate relationships not assessable without a response variable ('sp.cols'). Returning high pairwise correlations among 'var.cols'.")
 return (high.cor.mat)
 }

high.cor.vars <- unique(rownames(cor.mat[high.cor.inds, high.cor.inds]))

if (select %in% bivar.criteria) {
 bivar.mat <- FDR(data = data, sp.cols = sp.cols, var.cols = match(high.cor.vars, colnames(data)), simplif = TRUE)[ , c("p.value", "AIC", "BIC")]
 if (identical(order(bivar.mat[ , "p.value"]), order(bivar.mat[ , "AIC"])) && identical(order(bivar.mat[ , "p.value"]), order(bivar.mat[ , "BIC"]))) message("Results identical using either p-value, AIC or BIC to select variables.\n")
 else message("Results NOT identical using either p-value, AIC or BIC to select variables.\n")
 } # end if select in bivar

data.remaining <- data[ , var.cols]
 while (max(abs(cor.mat), na.rm = TRUE) >= cor.thresh) {
 max.cor.ind <- which(abs(cor.mat) == max(abs(cor.mat), na.rm = TRUE), arr.ind = TRUE)
 v1 <- rownames(cor.mat)[max.cor.ind[1]]
 v2 <- colnames(cor.mat)[max.cor.ind[2]]
 if (select %in% univar.criteria) {
 vif <- multicol(data.remaining)
 targets <- vif[c(v1, v2), "VIF", drop = FALSE]
 } # end if univar
 else if (select %in% bivar.criteria) {
 targets <- bivar.mat[c(v1, v2), select, drop = FALSE]
 } # end if bivar

exclude <- which.max(as.matrix(targets))
 excl.name <- rownames(targets)[exclude]
 excl.ind <- match(excl.name, colnames(cor.mat))
 data.remaining <- data.remaining[ , -excl.ind, drop = FALSE]
 cor.mat <- cor.mat[-excl.ind, -excl.ind, drop = FALSE]
 if (all(is.na(cor.mat))) cor.mat <- numeric(0)
 if (length(cor.mat) == 0) break
 } # end while max

} # end if max > thresh

selected.vars <- colnames(cor.mat)
 selected.var.cols <- match(colnames(cor.mat), colnames(data))
 excluded.vars <- colnames(data)[var.cols][!(colnames(data)[var.cols] %in% colnames(cor.mat))]

vif2 <- multicol(data[ , selected.var.cols])

list(high.correlations = high.cor.mat,
 bivariate.significance = bivar.mat,
 excluded.vars = excluded.vars,
 selected.vars = selected.vars,
 selected.var.cols = selected.var.cols,
 strongest.remaining.corr = cor.mat[which.max(abs(cor.mat))],
 remaining.multicollinearity = vif2
 )
} # end corSelect function

 

You can install and load fuzzySim (version 1.5 or higher) and then check help(corSelect) for reproducible usage examples.
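For instance, a minimal usage sketch with the rotif.env dataset (using the same column layout as the stepByStep examples further below: species in column 18, variables in columns 5 to 17):

library(fuzzySim)
data(rotif.env)

# exclude, from each highly correlated pair, the variable with the higher VIF:
corSelect(data = rotif.env, sp.cols = 18, var.cols = 5:17, cor.thresh = 0.8, select = "VIF")

# or the one with the worse bivariate AIC against the response:
corSelect(data = rotif.env, sp.cols = 18, var.cols = 5:17, cor.thresh = 0.8, select = "AIC")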

Analyse and compare stepwise model predictions

This function builds a generalized linear model with forward stepwise inclusion of variables, using AIC as the selection criterion, and gives the values predicted at each step, as well as their correlation with the final model predictions. This can be useful to see if a model with just the first few variables could already provide predictions very close to the final ones (see e.g. Fig. 3 in Muñoz et al., 2005). This function is now included in the fuzzySim package (Barbosa, 2015; version 1.3 just released). The correlation coefficient (cor.method) is “pearson” by default, but can be set to “spearman” or “kendall” as well.

stepByStep <- function(data, # name of dataset
                       sp.col, # index of species (target variable) column
                       var.cols, # indices of variables columns
                       family = binomial(link = "logit"),
                       Favourability = FALSE, # used only if family=binomial(logit)
                       trace = 0, # argument for 'step'
                       cor.method = "pearson") {
 
# version 1.3 (22 Jul 2015)
 
n.init <- nrow(data)
data <- na.omit(data)
na.loss <- n.init - nrow(data)
if (na.loss > 0) message(na.loss, " cases excluded due to missing values.")
 
response <- colnames(data)[sp.col]
null.model.formula <- as.formula(paste(response, "~", 1))
scope.formula <- as.formula(paste("~", paste(colnames(data)[var.cols], collapse = "+")))
mod <- step(glm(null.model.formula, family = family, data = data), scope = scope.formula, direction = "forward", trace = trace)
pred.final <- mod$fitted.values
 
if (!all(c("binomial", "logit") %in% mod$family)) Favourability <- FALSE
if (Favourability) fav.final <- Fav(model = mod)
 
model.vars.split <- sapply(mod$anova[ , 1], strsplit, split = " ")
model.vars <- lapply(model.vars.split, `[`, 2)
model.vars <- as.character(model.vars)[-1]
n.steps <- length(model.vars)
 
preds <- favs <- as.data.frame(matrix(nrow = nrow(data), ncol = n.steps))
cor.P <- cor.F <- vector("numeric", n.steps)
names(model.vars) <- names(preds) <- names(favs) <- names(cor.P) <- names(cor.F) <- paste0("step", 1:n.steps)
 
for (s in 1:n.steps) {
step.vars <- model.vars[1:s]
mod.step <- glm(as.formula(paste(response, "~", paste(step.vars, collapse = "+"))), family = family, data = data)
if (Favourability) {
favs[ , s] <- Fav(model = mod.step)
cor.F[s] <- cor(favs[ , s], fav.final, method = cor.method)
}
else {
preds[ , s] <- mod.step$fitted.values
cor.P[s] <- cor(preds[ , s], pred.final, method = cor.method)
}
}; rm(s)
 
if (Favourability) result <- list(predictions = favs, correlations = cor.F, variables = model.vars, model = mod)
else result <- list(predictions = preds, correlations = cor.P, variables = model.vars, model = mod)
 
return(result)
}

Usage example:

library(fuzzySim)
data(rotif.env)
 
stepByStep(data = rotif.env, sp.col = 18, var.cols = 5:17, cor.method = "spearman")
 
stepByStep(data = rotif.env, sp.col = 18, var.cols = 5:17, Favourability = TRUE, cor.method = "pearson")
 
stepByStep(data = rotif.env, sp.col = 18, var.cols = 5:17, family = poisson)


Only forward selection is implemented for now. I may or may not eventually find the time to implement other selection methods.

REFERENCES

Barbosa A.M. (2015) fuzzySim: applying fuzzy logic to binary similarity indices in ecology. Methods in Ecology and Evolution, 6: 853-858

Muñoz A.R., Real R., Barbosa A.M. & Vargas J.M. (2005) Modelling the distribution of Bonelli’s Eagle in Spain: Implications for conservation planning. Diversity and Distributions 11: 477-486

Probability of occurrence from presence-background data in a data frame

In an interesting recent paper, Royle et al. (2012) presented a critical analysis of Maxent and proposed a method for obtaining probability of occurrence (rather than a vague suitability index) based on maximum likelihood from presence-background (rather than presence-absence) species distribution data. They provided an R package, maxlike, which can apply this method to a set of presence point coordinates and a raster stack of predictor variables. However, this implies not only that you need point coordinates of your species observations, but also that sampling units are square pixels, whereas we often need to model distribution data recorded on larger and more irregular units — such as provinces, river basins, or UTM cells, for example.

The maxlike.df function is a modification of maxlike that can be applied to a data frame instead of spatial objects. I added argument covariates (to use instead of rasters), which requires a matrix or data.frame of the variables (one in each column, with the same names used in argument formula), and argument presences (to use instead of points) for a vector/column with the species data (of the same size and in the same order as the covariates, with 1 for presence and 0 or NA for background or no records — mind that MaxLike, like MaxEnt, needs both presence and background data, so they’re not strictly “presence-only”). Everything else in maxlike.df works the same way as in maxlike (Chandler & Royle 2013), so please read the maxlike documentation for further information on how the method is implemented and interpreted. Note that they recommend that the variables be standardized (e.g. vars = scale(vars)) before modelling, and that you should select which variables to include in each model.

Note also that Royle et al.’s (2012) approach has been supported by Fitzpatrick et al. (2013), but criticized by Hastie & Fithian (2013).

UPDATE: the latest version of the maxlike package (0.1-7) now allows models to be calibrated with data frames, via arguments x (data.frame with environmental conditions at presence point locations) and z (data.frame with environmental conditions at background locations). The function below is therefore deprecated, and you should use the maxlike package instead.
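For example, something along these lines (a sketch based on the argument descriptions above; check the maxlike documentation for the exact interface), using the same hypothetical mydata table as in the examples below:

library(maxlike)
pres <- mydata$myspecies == 1  # rows with presence records
fit <- maxlike(formula = ~ var1 + var2 + var3 + var4,
               x = mydata[pres, 3:6],  # environmental conditions at presence localities
               z = mydata[ , 3:6])     # environmental conditions at background localities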

maxlike.df <- function (formula, rasters, points, presences, covariates, link = c("logit", "cloglog"), starts, hessian = TRUE, fixed, removeDuplicates = FALSE, savedata = FALSE, na.action = "na.omit", ...)
  {
  if (missing(rasters))  rasters <- covariates  # new line
  if (missing(points))  points <- presences  # new line
  if (identical(formula, ~1))
    stop("At least one continuous covariate must be specified in the formula")
  link <- match.arg(link)
  varnames <- all.vars(formula)
  call <- match.call()
  npts <- sum(points == 1)  # was nrow(points)
  cd.class <- class(rasters)[1]
  #if (cd.class != "rasterstack") stop("rasters must be a raster stack")
  #pt.class <- class(points)[1]
  #if (!pt.class %in% c("matrix", "data.frame")) stop("points must be a matrix or a data.frame")
  #if (ncol(points) != 2) stop("points must have 2 columns containing the x- and y- coordinates")
  pt.names <- colnames(points)
  #if (identical(cd.class, "rasterstack")) {
    cd.names <- names(rasters)
    npix <- nrow(rasters) # was  prod(dim(rasters)[1:2])
    id <- 1:nrow(rasters)  # new line
    cellID <- id[points == 1] # was cellFromXY(rasters, points)
    duplicates <- duplicated(cellID)
    if (removeDuplicates) {
      cellID <- unique(cellID)
      npts <- length(cellID)
      points.retained <- points[!duplicates]  # comma removed, as 'points' is a vector here
    }
    x <- as.data.frame(rasters[points == 1, ])  # was as.data.frame(matrix(extract(rasters, cellID), npts))
    z <- rasters  # was as.data.frame(matrix(getValues(rasters), npix))
    names(x) <- names(z) <- cd.names
  #}
  if (!all(varnames %in% cd.names)) stop("at least 1 covariate in the formula is not in names(rasters).")
  X.mf <- model.frame(formula, x, na.action = na.action)
  X.mf.a <- attributes(X.mf)
  pts.removed <- integer(0)
  points.retained <- points
  if ("na.action" %in% names(X.mf.a)) {
    pts.removed <- X.mf.a$na.action
    npts.removed <- length(pts.removed)
    if (npts.removed > 0) {
      warning(paste(npts.removed, "points removed due to missing values"))
      points.retained <- points.retained[-pts.removed]  # I removed the comma before ']', as here 'points' is a vector rather than a two-column matrix/dataframe
    }
  }
  X <- model.matrix(formula, X.mf)
  Z.mf <- model.frame(formula, z, na.action = na.action)
  Z.mf.a <- attributes(Z.mf)
  pix.removed <- integer(0)
  if ("na.action" %in% names(Z.mf.a)) {
    pix.removed <- Z.mf.a$na.action
    npix.removed <- length(pix.removed)
  }
  Z <- model.matrix(formula, Z.mf)
  npars <- ncol(X)
  parnames <- colnames(X)
  if (!"(Intercept)" %in% parnames) 
    stop("The intercept must be estimated or fixed")
  if (missing(starts)) {
    starts <- rep(0, npars)
    names(starts) <- parnames
  }
  else names(starts) <- parnames
  if (identical(link, "logit")) {
    nll <- function(pars) {
      psix <- plogis(drop(X %*% pars))
      psiz <- sum(plogis(drop(Z %*% pars)))
      -1 * sum(log(psix/psiz))
    }
  }
  else if (identical(link, "cloglog")) {
    nll <- function(pars) {
      psix <- 1 - exp(-exp(drop(X %*% pars)))
      psiz <- sum(1 - exp(-exp(drop(Z %*% pars))))
      -1 * sum(log(psix/psiz))
    }
  }
  else stop("link function should be either 'logit' or 'cloglog'")
  is.fixed <- rep(FALSE, npars)
  if (!missing(fixed)) {
    if (length(fixed) != length(starts)) 
      stop("fixed should be a vector with the same length as the number of parameters to be estimated")
    if (sum(is.double(fixed)) < 1) 
      stop("fixed must contain at least one real value")
    is.fixed <- !is.na(fixed)
    if (sum(!is.fixed) < 1) 
      stop("you cannot fix all parameters in the model")
    npars <- sum(!is.fixed)
    nll.fix <- function(p) {
      p[is.fixed] <- fixed[is.fixed]
      do.call("nll", list(pars = p))
    }
    fm <- optim(starts, nll.fix, hessian = hessian, ...)
    fm$par[is.fixed] <- fixed[is.fixed]
  }
  else {
    fm <- optim(starts, nll, hessian = hessian, ...)
  }
  not.fixed <- !is.fixed
  par <- fm$par
  if (hessian) {
    vcTry <- try(solve(fm$hessian[not.fixed, not.fixed]))
    if (inherits(vcTry, "matrix")) {  # robust to matrices also having class "array"
      vc <- matrix(0, length(par), length(par))
      vc[not.fixed, not.fixed] <- vcTry
      se <- sqrt(diag(vc))
    }
    else {
      vc <- matrix(NA, npars, npars)
      se <- rep(NA, npars)
    }
  }
  else {
    vc <- matrix(NA, npars, npars)
    se <- rep(NA, npars)
  }
  dimnames(vc) <- list(parnames, parnames)
  aic <- 2 * fm$value + 2 * npars
  fitted <- plogis(Z %*% par)
  fitted.values <- merge(data.frame(id = id), data.frame(id = rownames(fitted), fitted = fitted), all = TRUE)  # new line
  out <- list(Est = cbind(Est = par, SE = se), vcov = vc, AIC = aic, call = call, pts.removed = pts.removed, pix.removed = pix.removed, points.retained = points.retained, optim = fm, not.fixed = not.fixed, link = link, fitted.values = fitted.values$fitted)  # 'fitted.values' added
  if (inherits(rasters, c("matrix", "data.frame")))  savedata = TRUE  # new line, otherwise 'predict' would fail
  if (savedata)
    out$rasters <- rasters
  class(out) <- c("maxlikeFit", "list")
  return(out)
}

# some changes to predict.maxlikeFit were also required to accommodate maxlike.df:
predict.maxlikeFit <- function (object, ...) {
  e <- coef(object)
  rasters <- object$rasters
  if (is.null(rasters)) {
    rasters <- try(get(as.character(object$call$rasters)))
    if (identical(class(rasters)[1], "try-error")) 
      stop("could not find the raster data")
    warning("raster data were not saved with object, using the data found in the workspace instead.")
  }
  link <- object$link
  cd.names <- names(rasters)
  cd.class <- class(rasters)[1]  # new line
  if (cd.class == "RasterStack") {  # new line
    npix <- prod(dim(rasters)[1:2])
    z <- as.data.frame(matrix(getValues(rasters), npix))
  }  # new line
  else if (cd.class %in% c("matrix", "data.frame")) {  # new line
    npix <- nrow(rasters)  # new line
    z <- rasters  # new line
  }  else stop("'rasters' must be either a RasterStack, a matrix or a data.frame")  # new line
  names(z) <- cd.names
  formula <- object$call$formula
  varnames <- all.vars(formula)
  if (!all(varnames %in% cd.names)) 
    stop("at least 1 covariate in the formula is not in rasters.")
  Z.mf <- model.frame(formula, z, na.action = "na.pass")
  Z.terms <- attr(Z.mf, "terms")
  Z <- model.matrix(Z.terms, Z.mf)
  eta <- drop(Z %*% coef(object))
  if (identical(link, "logit")) 
    psi.hat <- plogis(eta)
  else if (identical(link, "cloglog")) 
    psi.hat <- 1 - exp(-exp(eta))
  else stop("link function should be either 'logit' or 'cloglog'")
  if (cd.class == "RasterStack") {  # new line
    psi.mat <- matrix(psi.hat, dim(rasters)[1], dim(rasters)[2], byrow = TRUE)
    psi.raster <- raster(psi.mat)
    extent(psi.raster) <- extent(rasters)
    psi.raster
  }  # new line
  else if (cd.class %in% c("matrix", "data.frame")) return(psi.hat)  # new line
}


Usage example for a data frame with the species presence data in column 2 and the variables (named var1, var2, var3 and var4) in columns 3 to 6:

myspecies.maxlike <- maxlike.df(formula = ~ var1 + var2 + var3 + var4, covariates = mydata[ , 3:6], presences = mydata[ , 2], method = "BFGS")

If you then want to explore the model with functions such as summary or predict, you need to install and load the maxlike package (which defines the appropriate class for this type of model). Even without it, the maxlike.df function alone already provides some useful information about the model:

# get the model predictions applied to the analysed data:
myspecies.maxlike$fitted.values

# get the coefficient estimates and their standard errors:
myspecies.maxlike$Est

# get the model's AIC:
myspecies.maxlike$AIC

You can then evaluate maxlike.df predictions with functions from the modEvA package, as in the following examples:

library(modEvA)

AUC(obs = mydata$myspecies, pred = myspecies.maxlike$fitted.values)
HLfit(obs = mydata$myspecies, pred = myspecies.maxlike$fitted.values)
plotGLM(obs = mydata$myspecies, pred = myspecies.maxlike$fitted.values)

References

Chandler, R.B. & Royle, J.A. (2013) Package ‘maxlike’: Model species distributions by estimating the probability of occurrence using presence-only data. Version 0.1-4, CRAN

Fitzpatrick, M.C., Gotelli, N.J. & Ellison, A.M. (2013) MaxEnt versus MaxLike: empirical comparisons with ant species distributions. Ecosphere 4, art55.

Hastie, T. & Fithian, W. (2013) Inference from presence-only data; the ongoing controversy. Ecography 36, 864-867.

Royle, J.A., Chandler, R.B., Yackulic, C. & Nichols, J.D. (2012) Likelihood analysis of species occurrence probability from presence-only data for modelling species distributions. Methods in Ecology and Evolution 3: 545–554

Distribution modelling based on presence points

The pres.point.mod function uses the dismo package to run a series of models (currently Bioclim, Domain and Maxent) for a set of presence point coordinates and a raster stack of predictor variables. Optionally, it will also apply these models to an extrapolation raster stack, and write prediction and extrapolation maps to disk. The pres.point.mod function is not included in a package, but it’s in the “Supporting Information” of Zanolla et al. (2018), so please cite this paper if you use the function.

Note that Bioclim and Domain use only the information on the presence points, while Maxent uses also a set of background (pseudo-absence) points that are randomly generated if not provided by the user. Please read Hijmans & Elith’s “Species distribution modeling with R” for more info on these modelling methods and their R implementation.

pres.point.mod <- function(obs.xy, var.stack, extrap.stack = NULL, mod.methods = c("bioclim", "domain", "maxent"), bg.points = NULL, result.rasters.folder = NULL, result.rasters.format = "GTiff", overwrite = FALSE, ...) {
  # version 1.5 (8 Oct 2013)
  # obs.xy: a matrix or data frame containing 2 columns, the first with the x and the second with the y coordinates of the species presence points to be modelled
  # var.stack: a raster stack of the predictor variables to use in modelling, in the same coordinate reference system as obs.xy
  # extrap.stack: optionally, a raster stack of variables (with the same names and coordinate reference system as var.stack) to which to extrapolate the models
  # mod.methods: modelling methods to use; currently "bioclim", "domain" and "maxent" are implemented
  # bg.points: an optional matrix/dataframe of x-y coordinates of the background points or pseudo-absences to be used by maxent (the other two methods use only the presence points). If NULL, maxent will randomly generate 10000 points within the var.stack
  # result.rasters.folder: optionally, the path to a folder (which will be created if not present) to save the rasters of model predictions to disk
  # result.rasters.format: format for the rasters to save in result.rasters.folder (type '?writeFormats' for options)
  # overwrite: logical, whether to overwrite raster files with the same name within the result.rasters.folder
  # ...: additional arguments for the 'maxent' function (see ?maxent for details); relevant arguments may be 'a' for user-defined background points, 'path' for the folder in which to save maxent output files, and 'removeDuplicates' to specify whether duplicate presence points within the same pixel should be ignored

  start.time <- proc.time()
  if (!is.element("dismo", installed.packages()[,1])) stop("You need to have the 'dismo' package installed to run 'pres.point.mod'.")
  require(dismo)
  if(!is.null(result.rasters.folder)) dir.create(result.rasters.folder, showWarnings = FALSE)
  jar.path <- paste(system.file(package = "dismo"), "/java/", sep = "")
  if ("maxent" %in% mod.methods & !file.exists(paste(jar.path, "maxent.jar", sep = "")))  stop("'maxent.jar' file not found in '", jar.path, "; please put (a copy of) it there, or run 'pres.point.mod' without 'maxent' in 'mod.methods'", sep = "")
  results <- vector("list")

  if("bioclim" %in% mod.methods) {
    message("Building Bioclim model...  >>  ", Sys.time())
    bioc.model.expr <- expression(bioclim(x = var.stack, p = obs.xy))
    bioc.model <- tryCatch(eval(bioc.model.expr), error = function(e) NULL)
    if(!is.null(bioc.model)) {
      results[["bioc.model"]] <- bioc.model
      message("Making Bioclim prediction raster...  >>  ", Sys.time())
      bioc.rast <- predict(bioc.model, var.stack)
      results[["bioc.rast"]] <- bioc.rast
      if(!is.null(result.rasters.folder)) {
        message("Saving in result.rasters.folder...  >>  ", Sys.time())
        writeRaster(x = bioc.rast, filename = paste(result.rasters.folder, "Bioclim", sep = "/"), format = result.rasters.format, overwrite = overwrite)
      }  # end if result.rasters.folder
      if(!is.null(extrap.stack)) {
        message("Making Bioclim extrapolated raster...  >>  ", Sys.time())
        bioc.extrap.rast <- predict(bioc.model, extrap.stack)
        results[["bioc.extrap.rast"]] <- bioc.extrap.rast
        if(!is.null(result.rasters.folder)) {
          message("Saving in result.rasters.folder...  >>  ", Sys.time())
          writeRaster(x = bioc.extrap.rast, filename = paste(result.rasters.folder, "Bioclim_extrapolated", sep = "/"), format = result.rasters.format, overwrite = overwrite)
        }  # end if result.rasters.folder
      }  # end if extrap.stack
    }  # end if !null bioc model
    else message("...not! Bioclim model could not be obtained")
  }  # end if bioclim

  if("domain" %in% mod.methods) {
    message("Building Domain model...  >>  ", Sys.time())
    domain.model.expr <- expression(domain(x = var.stack, p = obs.xy))
    domain.model <- tryCatch(eval(domain.model.expr), error = function(e) NULL)
    if(!is.null(domain.model)) {
      results[["domain.model"]] <- domain.model
      message("Making Domain prediction raster...  >>  ", Sys.time())
      domain.rast <- predict(domain.model, var.stack)
      results[["domain.rast"]] <- domain.rast
      if(!is.null(result.rasters.folder)) {
        message("Saving in result.rasters.folder...  >>  ", Sys.time())
        writeRaster(x = domain.rast, filename = paste(result.rasters.folder, "Domain", sep = "/"), format = result.rasters.format, overwrite = overwrite)
      }  # end if result.rasters.folder
      if(!is.null(extrap.stack)) {
        message("Making Domain extrapolated raster...  >>  ", Sys.time())
        domain.extrap.rast <- predict(domain.model, extrap.stack)
        results[["domain.extrap.rast"]] <- domain.extrap.rast
        if(!is.null(result.rasters.folder)) {
          message("Saving in result.rasters.folder...  >>  ", Sys.time())
          writeRaster(x = domain.extrap.rast, filename = paste(result.rasters.folder, "Domain_extrapolated", sep = "/"), format = result.rasters.format, overwrite = overwrite)
        }  # end if result.rasters.folder
      }  # end if extrap.stack
    }  # end if !null domain model
    else message("...not! Domain model could not be obtained")
  }  # end if domain

  if("maxent" %in% mod.methods) {
    message("Building Maxent model...  >>  ", Sys.time())
    maxent.model.expr <- expression(maxent(x = var.stack, p = obs.xy, a = bg.points, ...))
    maxent.model <- tryCatch(eval(maxent.model.expr), error = function(e) NULL)
    if(!is.null(maxent.model)) {
      results[["maxent.model"]] <- maxent.model
      message("Making Maxent prediction raster...  >>  ", Sys.time())
      maxent.rast <- predict(maxent.model, var.stack)
      results[["maxent.rast"]] <- maxent.rast
      if(!is.null(result.rasters.folder)) {
        message("Saving in result.rasters.folder...  >>  ", Sys.time())
        writeRaster(x = maxent.rast, filename = paste(result.rasters.folder, "Maxent", sep = "/"), format = result.rasters.format, overwrite = overwrite)
      }  # end if result.rasters.folder
      if(!is.null(extrap.stack)) {
        message("Making Maxent extrapolated raster...  >>  ", Sys.time())
        maxent.extrap.rast <- predict(maxent.model, extrap.stack)
        results[["maxent.extrap.rast"]] <- maxent.extrap.rast
        if(!is.null(result.rasters.folder)) {
          message("Saving in result.rasters.folder...  >>  ", Sys.time())
          writeRaster(x = maxent.extrap.rast, filename = paste(result.rasters.folder, "Maxent_extrapolated", sep = "/"), format = result.rasters.format, overwrite = overwrite)
        }  # end if result.rasters.folder
      }  # end if extrap.stack
    }  # end if !null maxent model
    else message("...not! Maxent model could not be obtained")
  }  # end if maxent

  end.time <- proc.time()
  duration <- (end.time - start.time)[3]
  if (duration < 60) {
    units <- " second(s)"
  } else if (duration < 3600) {
    duration <- duration / 60
    units <- " minute(s)"
  } else {
    duration <- duration / 3600
    units <- " hour(s)"
  }

  message("Finished!  >>  ", Sys.time(), " -- it took ", round(duration),  units)
  return(results)
}  # end pres.point.mod function


Please read the first lines of the function above for an explanation of what each parameter means.

Usage example:

mymodels <- pres.point.mod(obs.xy = mypoints, var.stack = predictor.stack, extrap.stack = var2050.stack, mod.methods = c("bioclim", "domain", "maxent"), bg.points = bg.points, result.rasters.folder = "myspecies_models", result.rasters.format = "GTiff", overwrite = FALSE, path = "myspecies_maxent_out")

pairs(mymodels$bioc.model)
response(mymodels$bioc.model)

plot(mymodels$bioc.rast)
plot(mymodels$domain.rast)
plot(mymodels$maxent.rast)

REFERENCES:

Zanolla M., Altamirano M., Carmona R., De La Rosa J., Souza-Egipsy V., Sherwood A., Tsiamis K., Barbosa A.M., Muñoz A.R. & Andreakis N. (2018) Assessing global range expansion in a cryptic species complex: insights from the red seaweed genus Asparagopsis (Florideophyceae). Journal of Phycology, 54: 12-24.

Trim off non-significant variables from a model

Stepwise variable selection is a common procedure for simplifying models. It maximizes predictive efficiency in an objective and reproducible way, and is useful when the individual importance of the predictors is not known a priori (Hosmer & Lemeshow, 2000). The step function in R performs such a procedure using an information criterion (AIC) to select the variables, but it often leaves variables that are not significant in the model. Such variables can be subsequently removed with a manual stepwise procedure (e.g. Crawley 2007, p. 442; Barbosa & Real 2010, 2012; Estrada & Arroyo 2012). The modelTrim function, now included in the fuzzySim package (Barbosa, 2015), performs this removal automatically until all remaining variables are significant. It can also be applied to a full model (i.e., without previous use of the step function), as it serves as a backward stepwise selection procedure based on the significance of the coefficients (if method = ‘summary’, the default) or on the significance of the variables themselves (if method = ‘anova’, better when there are categorical variables in the model).

modelTrim <-
function(model, method = "summary", alpha = 0.05) {
# version 1.9 (16 Nov 2018)

stopifnot(
class(model) %in% c("glm", "lm"),
alpha > 0,
alpha < 1
)

dat <- as.data.frame(model$model) # new
n.vars.start <- length(model$coefficients) - 1
names.vars.start <- names(model$coefficients)[-1]

if (method == "summary") {
p.values <- expression(summary(model)$coefficients[ , 4])
} else if (method == "anova") {
p.values <- expression(as.matrix(anova(model, test = "Chi"))[ , 5])
} else stop ("'method' must be either 'summary' or 'anova'")

while (max(eval(p.values)[-1]) > alpha) { # excludes p-value of intercept
exclude <- names(which.max(eval(p.values)[-1]))
model <- update(model, as.formula(paste("~.-", exclude)), data = dat)
if (length(model$coefficients) == 1) { # only intercept remains
message("No significant variables left in the model.")
break
} # end if length
} # end while

n.vars.trim <- length(model$coefficients) - 1
excluded.vars <- setdiff(names.vars.start, names(model$coefficients)[-1])
message(n.vars.start - n.vars.trim, " variable(s) excluded by 'modelTrim' function\n ", paste(excluded.vars, collapse = ", "), "\n\n")

return(model)
}

If you have a model object (resulting e.g. from the glm function) called, for example, mod.sp1, load the modelTrim function and then just type

mod.sp1 <- modelTrim(mod.sp1, method = "summary")

summary(mod.sp1)

Model trimming with this function is now also available as an option in the multGLM function.

References

Barbosa A.M. (2015) fuzzySim: applying fuzzy logic to binary similarity indices in ecology. Methods in Ecology and Evolution, 6: 853-858

Barbosa A.M. & Real R. (2010) Favourable areas for expansion and reintroduction of Iberian lynx accounting for distribution trends and genetic diversity of the European rabbit. Wildlife Biology in Practice 6: 34-47

Barbosa A.M. & Real R. (2012) Applying fuzzy logic to comparative distribution modelling: a case study with two sympatric amphibians. The Scientific World Journal, Article ID 428206

Crawley M.J. (2007) The R Book. John Wiley & Sons, Chichester (UK)

Estrada A. & Arroyo B. (2012) Occurrence vs abundance models: differences between species with varying aggregation patterns. Biological Conservation, 152: 37-45

Hosmer D. W. & Lemeshow S. (2000) Applied Logistic Regression (2nd ed). John Wiley and Sons, New York

GLMs with multiple species and options

The multGLM function, now included in the fuzzySim package (Barbosa, 2015), automatically calculates binomial GLMs for one or more species (or other binary variables) in a data frame. The function can optionally perform stepwise variable selection (and it does so by default) instead of forcing all variables into the models, starting from either the null model (the default, so selection starts forward) or from the full model (so selection starts backward) and using either AIC (Akaike’s Information Criterion) or BIC (Bayesian Information Criterion, also known as Schwarz criterion, SBC or SBIC) as the variable selection criterion. Alternatively or additionally, it can perform stepwise removal of non-significant variables from the models using the modelTrim function. There is also an optional preliminary selection of variables with a significant/informative bivariate relationship with the response, based on the false discovery rate (with different correction methods available), for which you need the FDR function (but note that some variables may be significant and informative in a multivariate model even if they would not have been selected by FDR). It can also do a preliminary selection of non-correlated variables, which requires the corSelect function, available in fuzzySim since version 1.5. If you want favourability (F) to be calculated (as it is by default), you need the Fav function as well. By default, all data are used in model training, but you can define an optional test.sample to be reserved for model testing afterwards.

multGLM <- function (data, sp.cols, var.cols, id.col = NULL,
family = "binomial", test.sample = 0, FDR = FALSE, correction = "fdr", corSelect = FALSE, cor.thresh = 0.8, step = TRUE, trace = 0, start = "null.model", direction = "both", select = "AIC", trim = TRUE, Y.prediction = FALSE, P.prediction = TRUE, Favourability = TRUE, group.preds = TRUE, 
TSA = FALSE, coord.cols = NULL, degree = 3, verbosity = 2, ...) 
{
    # version 5.1 (6 Jan 2020)
    start.time <- Sys.time()
    on.exit(timer(start.time))
    input.ncol <- ncol(data)
    input.names <- colnames(data)
    stopifnot(as.vector(na.omit(as.matrix(data[, sp.cols]))) %in% 
        c(0, 1), all(sp.cols %in% (1:input.ncol)) || all(sp.cols %in% 
        input.names), all(var.cols %in% (1:input.ncol)) || all(var.cols %in% 
        input.names), is.null(id.col) || (length(id.col) == 1 && 
        (id.col %in% (1:input.ncol) || id.col %in% input.names)), 
        is.null(coord.cols) || all(coord.cols %in% (1:input.ncol)) || 
            all(coord.cols %in% input.names), family == "binomial", 
        test.sample >= 0 | test.sample == "Huberty", length(test.sample) == 
            1 | (is.integer(test.sample) & test.sample > 0), 
        length(test.sample) < nrow(data), is.logical(FDR), is.logical(step), 
        start %in% c("null.model", "full.model"), direction %in% 
            c("both", "backward", "forward"), select %in% c("AIC", 
            "BIC"), is.logical(Y.prediction), is.logical(P.prediction), 
        is.logical(Favourability), is.logical(group.preds), is.logical(trim), 
        is.logical(TSA), !Favourability | exists("Fav"), !trim | 
            exists("modelTrim"))
    data$sample <- "train"
    n <- nrow(data)
    data.row <- 1:n
    test.sample.input <- test.sample
    if (length(test.sample) == 1) {
        if (test.sample == "Huberty") {
            if (!FDR & !step & !trim) {
                test.sample <- percentTestData(length(var.cols))/100
                n.test <- round(n * test.sample)
                if (verbosity > 0) 
                  message("Following Huberty's rule, ", test.sample * 
                    100, "% of observations\n          (", n.test, 
                    " out of ", n, ") set aside for model testing; ", 
                    n - n.test, " observations used for model training.")
            }
            else stop("Sorry, Huberty's rule cannot be used with 'FDR', 'step' or 'trim',\n                   as these make the number of variables vary.\n                   Set these 3 parameters to FALSE,\n                   or use a different 'test.sample' option.")
        }
        else if (test.sample == 0) {
            if (verbosity > 0) 
                message("All ", n, " observations used for model training;\n              none reserved for model testing.")
            n.test <- 0
        }
        else if (test.sample < 1) {
            n.test <- round(n * test.sample)
            if (verbosity > 0) 
                message(test.sample * 100, "% of observations (", 
                  n.test, " out of ", n, ") set aside for model testing; ", 
                  n - n.test, " observations used for model training.")
        }
        else if (test.sample >= 1) {
            n.test <- test.sample
            if (verbosity > 0) 
                message(n.test, " (out of ", n, ") observations set aside for model testing; ", 
                  n - n.test, " observations used for model training.")
        }
        test.sample <- sample(data.row, size = n.test, replace = FALSE)
    }
    else if (length(test.sample) > 1) {
        n.test <- length(test.sample)
        if (verbosity > 0) 
            message(n.test, " (out of ", n, ") observations set aside for model testing; ", 
                n - n.test, " observations used for model training.")
    }
    data$sample[data.row %in% test.sample] <- "test"
    train.data <- data[data$sample == "train", ]
    if (Favourability) {
        if (family != "binomial") {
            Favourability <- FALSE
            warning("Favourability is only applicable to binomial responses,\n               so it could not be calculated")
        }
    }
    if (!is.numeric(sp.cols)) 
        sp.cols <- which(colnames(data) %in% sp.cols)
    if (!is.numeric(var.cols)) 
        var.cols <- which(colnames(data) %in% var.cols)
    if (!is.null(coord.cols) && !is.numeric(coord.cols)) 
        coord.cols <- which(colnames(data) %in% coord.cols)
    if (!is.null(id.col) && !is.numeric(id.col)) 
        id.col <- which(colnames(data) %in% id.col)
    keeP <- P.prediction
    if (Favourability) 
        P.prediction <- TRUE
    n.models <- length(sp.cols)
    n.preds <- n.models * (Y.prediction + P.prediction + Favourability)
    models <- vector("list", n.models)
    predictions <- matrix(NA, nrow = nrow(data), ncol = n.preds)
    colnames(predictions) <- rep("", n.preds)
    model.count <- 0
    pred.count <- 0
    for (s in sp.cols) {
        model.count <- model.count + 1
        response <- colnames(train.data)[s]
        if (verbosity > 0) 
            cat("\n\n=> Building model ", model.count, " of ", 
                n.models, " (", response, ")...\n\n", sep = "")
        if (verbosity > 1) 
            cat(length(var.cols), "input predictor variable(s)\n\n")
        if (TSA) {
            if (verbosity > 1) 
                cat("...plus the spatial trend variable\n\n")
            tsa <- suppressMessages(multTSA(data = data, sp.cols = s, 
                coord.cols = coord.cols, degree = degree, type = "Y"))
            tsa_name <- paste("sptrend", response, sep = "_")
            data <- data.frame(data, tsa[, ncol(tsa)])
            names(data)[ncol(data)] <- tsa_name
            train.data <- data.frame(train.data, tsa[which(data$sample == 
                "train"), ncol(tsa)])
            names(train.data)[ncol(train.data)] <- tsa_name
            var.cols <- c(var.cols, ncol(train.data))
        }
        if (FDR) {
            fdr <- FDR(data = train.data, sp.cols = s, var.cols = var.cols, 
                correction = correction, verbose = FALSE)
            if (nrow(fdr$select) == 0) {
                message(paste0("No variables passed the FDR test (so no variables included in the model)\n for '", 
                  response, "'. Consider using 'FDR = FALSE' or choosing a less stringent 'correction' procedure."))
            }
            if (verbosity > 1) 
                cat(length(var.cols) - nrow(fdr$select), "variable(s) excluded by 'FDR' function\n", 
                  paste(row.names(fdr$exclude), collapse = ", "), 
                  "\n\n")
            sel.var.cols <- which(colnames(train.data) %in% rownames(fdr$select))
        }
        else sel.var.cols <- var.cols
        if (length(sel.var.cols) > 1 && corSelect == TRUE) {
            corselect <- suppressMessages(corSelect(data = train.data, 
                sp.cols = s, var.cols = sel.var.cols, cor.thresh = cor.thresh, 
                use = "pairwise.complete.obs"))
            corsel.var.cols <- corselect$selected.var.cols
            if (verbosity > 1) 
                cat(length(sel.var.cols) - length(corsel.var.cols), 
                  "variable(s) excluded by 'corSelect' function\n", 
                  corselect$excluded.vars, "\n\n")
            sel.var.cols <- corsel.var.cols
        }
        if (length(sel.var.cols) == 0) 
            model.vars <- 1
        else model.vars <- colnames(train.data)[sel.var.cols]
        model.formula <- with(train.data, as.formula(paste(response, 
            "~", paste(model.vars, collapse = "+"))))
        model.expr <- expression(glm(model.formula, family = binomial))
        if (step && length(sel.var.cols) > 0) {
            n.vars.start <- length(sel.var.cols)
            if (select == "AIC") 
                K <- 2
            else if (select == "BIC") 
                K <- log(n)
            if (start == "full.model") {
                model <- step(eval(model.expr), direction = direction, 
                  trace = trace, k = K)
            }
            else if (start == "null.model") {
                model.scope <- model.formula[-2]
                null.formula <- as.formula(paste(response, "~", 
                  1))
                model <- step(glm(null.formula, family = binomial, 
                  data = train.data), direction = direction, 
                  scope = model.scope, trace = trace, k = K)
            }
            else stop("'start' must be either 'full.model' or 'null.model'")
            n.vars.step <- length(model$coefficients) - 1
            excluded.vars <- setdiff(colnames(data[, sel.var.cols]), 
                names(model$coefficients)[-1])
            if (verbosity > 1) 
                cat(n.vars.start - n.vars.step, "variable(s) excluded by 'step' function\n", 
                  paste(excluded.vars, collapse = ", "), "\n\n")
        }
        else model <- eval(model.expr, envir = train.data)
        if (trim && length(sel.var.cols) > 0) {
            n.vars.start <- length(model$coefficients) - 1
            names.vars.start <- names(model$coefficients)[-1]
            model <- suppressMessages(modelTrim(model, ...))
            n.vars.trim <- length(model$coefficients) - 1
            excluded.vars <- setdiff(names.vars.start, names(model$coefficients)[-1])
            if (verbosity > 1) 
                cat(n.vars.start - n.vars.trim, "variable(s) excluded by 'modelTrim' function\n", 
                  paste(excluded.vars, collapse = ", "), "\n\n")
        }
        if (step || trim) {
            sel.var.names <- names(model$coefficients)[-1]
            if (verbosity > 1) 
                cat(length(sel.var.names), "variable(s) INCLUDED IN THE FINAL MODEL\n", 
                  paste(sel.var.names, collapse = ", "))
        }
        if (verbosity > 1) 
            cat("\n\n")
        models[[model.count]] <- model
        names(models)[[model.count]] <- response
        if (Y.prediction) {
            pred.count <- pred.count + 1
            colnames(predictions)[pred.count] <- paste(response, 
                "Y", sep = "_")
            predictions[, pred.count] <- predict(model, data)
        }
        if (P.prediction) {
            pred.count <- pred.count + 1
            colnames(predictions)[pred.count] <- paste(response, 
                "P", sep = "_")
            predictions[, pred.count] <- predict(model, data, 
                type = "response")
        }
        if (Favourability) {
            n1 <- sum(train.data[, s] == 1, na.rm = TRUE)
            n0 <- sum(train.data[, s] == 0, na.rm = TRUE)
            pred.count <- pred.count + 1
            predictions[, pred.count] <- Fav(n1n0 = c(n1, n0), 
                pred = predictions[, pred.count - 1])
            colnames(predictions)[pred.count] <- paste(response, 
                "F", sep = "_")
        }
        if (TSA) {
            train.data <- train.data[, -ncol(train.data)]
            data <- data[, -ncol(data)]
            var.cols <- var.cols[-length(var.cols)]
        }
    }
    if (P.prediction && !keeP) {
        n.char <- nchar(colnames(predictions))
        pred.suffix <- substr(colnames(predictions), n.char - 
            1, n.char)
        P.cols <- grep("_P", pred.suffix)
        predictions <- predictions[, -P.cols]
    }
    n.pred.types <- sum(Y.prediction, keeP, Favourability)
    if (n.pred.types == 0) {
        predictions <- data.frame()
    }
    else {
        predictions <- data.frame(data[, id.col], sample = data[, 
            "sample"], predictions)
        if (n.pred.types == 1 || length(sp.cols) == 1) 
            group.preds <- FALSE
        if (group.preds) {
            first.pred.col <- ifelse(is.null(id.col), 2, 3)
            pred1.cols <- seq(first.pred.col, ncol(predictions), 
                by = n.pred.types)
            pred2.cols <- seq(first.pred.col + 1, ncol(predictions), 
                by = n.pred.types)
            pred3.cols <- NULL
            if (n.pred.types == 3) {
                pred3.cols <- seq(first.pred.col + 2, ncol(predictions), 
                  by = n.pred.types)
            }
            predictions <- data.frame(data[, id.col], sample = data$sample, 
                predictions[, pred1.cols], predictions[, pred2.cols], 
                predictions[, pred3.cols])
        }
        if (!is.null(id.col)) {
            colnames(predictions)[1] <- input.names[id.col]
        }
    }
    if (test.sample.input == 0) {
        predictions <- predictions[, -match("sample", colnames(predictions))]
    }
    if (!is.null(id.col)) {
        colnames(predictions)[1] <- colnames(data)[id.col]
    }
    selected_variables <- lapply(models, function(x) names(x$model)[-1])
    message("Finished!")
    return(list(predictions = predictions, models = models, variables = selected_variables))
}
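Usage example (a sketch, assuming fuzzySim is loaded, as multGLM calls its FDR, corSelect, modelTrim and Fav functions, and using the rotif.env column layout from the stepByStep examples above):

library(fuzzySim)
data(rotif.env)

# stepwise binomial GLMs for two species, with preliminary FDR- and
# correlation-based variable selection, and favourability in the output:
mods <- multGLM(data = rotif.env, sp.cols = 18:19, var.cols = 5:17, id.col = 1,
                FDR = TRUE, corSelect = TRUE, step = TRUE, trim = TRUE,
                Favourability = TRUE)

head(mods$predictions)
mods$variables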

REFERENCES:

Barbosa A.M. (2015) fuzzySim: applying fuzzy logic to binary similarity indices in ecology. Methods in Ecology and Evolution, 6: 853-858

Overlap Analysis

Overlap Analysis is one of the simplest forms of modelling species’ distributions. It assesses the ranges of values of the given predictor variables at the sites where a species has been recorded present, and predicts where that species should be able to occur based on those presence data (e.g. Brito et al. 1999, Arntzen & Teixeira 2006).

OA, now included in the modEvA package (Barbosa et al. 2014), can also be useful when extrapolating models outside their original scope (geographical area, time period or spatial resolution), as it can identify which localities are within the model’s domain — i.e., within the analysed ranges of values of the variables, outside which the model may not be reliable (e.g. Barbosa et al. 2009). In this case, the sp.cols should contain not species’ presence/absence, but rather indicate (with value 1) all observations that have been included in the model. Cf. modEvA function MESS, which is more informative but also more computationally intensive.
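For this domain-checking use, a minimal sketch (mydata and trained.rows are hypothetical: a table of localities with the variables in columns 2 to 12, and the indices of the rows that entered the model):

mydata$modelled <- 0
mydata$modelled[trained.rows] <- 1  # hypothetical indices of the modelled observations
inside.domain <- OA(data = mydata, sp.cols = ncol(mydata), var.cols = 2:12)  # 1 = within the model's domain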

OA <-
function(data, sp.cols, var.cols) {
  # version 2.1 (13 May 2014)
 
  if (length(sp.cols) > 1) stop ("Sorry, OA is currently implemented for only one response variable at a time, so 'sp.cols' must indicate only one column")
 
  input.data <- data
 
  # remove rows with missing values in the response or predictor columns:
  mod.data <- data[ , c(sp.cols, var.cols)]
  data <- data[complete.cases(mod.data), ]
 
  predictors <- data[ , var.cols]
  nvar <- ncol(predictors)
 
  response <- data[ , sp.cols]
  predictors.presence <- predictors[response > 0, ]
  var.presmin <- var.presmax <- vector("numeric", nvar)
  predictors.overlap <- matrix(data = NA, nrow = nrow(predictors), ncol = nvar)
  for (v in 1:nvar) {
    var.presmin[v] <- min(predictors.presence[ , v])
    var.presmax[v] <- max(predictors.presence[ , v])
    predictors.overlap[ , v] <- ifelse((predictors[ , v] >= var.presmin[v] & predictors[ , v] <= var.presmax[v]), 1, 0)
  }  # end for v
  overlap.noNA <- as.integer(rowSums(predictors.overlap) == nvar)
  return(overlap.noNA)
}


NOTE that the input data format for this function changed in mid-May 2014, with former parameters response and predictors now replaced with data, sp.cols and var.cols (for coherence with multGLM and other functions in modEvA). Input data for the OA function are now a matrix or data frame containing the response variable (presences vs. absences of a species if we want to model its occurrence, or modelled vs. non-modelled sites if we want to know which non-modelled sites are within the modelled range) and the predictor variables to consider. The output is a binary vector with 1 where the values of all predictors lie within the ranges observed for the presence records, and 0 otherwise. For example, if you have a table called mydata with your species’ presence data in the first column and your predictor variables in columns 2 to 12, paste in R the OA function followed by this command:

OA(data = mydata, sp.cols = 1, var.cols = 2:12)

References

Arntzen J.W. & Teixeira J. (2006) History and new developments in the mapping and modelling of the distribution of the golden-striped salamander, Chioglossa lusitanica. Zeitschrift für Feldherpetologie, Supplement: 1-14.

Barbosa A.M., Brown J.A. & Real R. (2014) modEvA – an R package for model evaluation and analysis. R package, version 0.1.

Barbosa, A.M., Real, R. & Vargas, J.M. (2009) Transferability of environmental favourability models in geographic space: the case of the Iberian desman (Galemys pyrenaicus) in Portugal and Spain. Ecological Modelling 220: 747–754.

Brito J.C., Crespo E.G. & Paulo O.S. (1999) Modelling wildlife distributions: Logistic Multiple Regression vs Overlap Analysis. Ecography 22: 251-260.

Variation partitioning

The varPart function, now included in the modEvA package (Barbosa et al. 2014), performs variation partitioning (Borcard et al. 1992) among two factors (e.g. Ribas et al. 2006) or three factors (e.g. Real et al. 2003) for either multiple linear regression models (LM) or generalized linear models (GLM).

If you have linear models, input data for varPart are the coefficients of determination (R-squared values) of the linear regressions of the target variable on all the variables in the model, on the variables related to each particular factor, and (when there are 3 factors) on the variables related to each pair of factors. The outputs are the amounts of variance explained exclusively by each factor, the amounts explained exclusively by the overlapping effects of each pair of factors, and the amount explained by the overlap of the 3 factors if this is the case (e.g. Real et al. 2003). The amount of variation not explained by the complete model is also provided.

If you have generalized linear models (GLMs) such as logistic regression or the favourability function, you have no true R-squared values from these models; inputs can be squared coefficients of (Spearman’s) correlation between the model predictions given by each factor or pair of factors and the predictions of the complete model (e.g. Muñoz & Real 2006), or the R-squared values of the corresponding logit (y) functions (Real et al. 2013), or an adjusted R-squared (De Araújo et al. 2014). Consequently, output values are not the total amounts of variance (of the target variable) explained by factors and overlaps, but rather their proportional contribution to the total variation explained by the model. The amount of unexplained variation is not available for GLMs.
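As an illustration of preparing such GLM inputs (the pred.* objects are hypothetical vectors of predictions from models built with each factor's variables, each pair of factors, and the complete model):

# squared Spearman correlations of partial-model predictions with the complete model:
A <- cor(pred.A, pred.ABC, method = "spearman")^2
B <- cor(pred.B, pred.ABC, method = "spearman")^2
C <- cor(pred.C, pred.ABC, method = "spearman")^2
AB <- cor(pred.AB, pred.ABC, method = "spearman")^2
AC <- cor(pred.AC, pred.ABC, method = "spearman")^2
BC <- cor(pred.BC, pred.ABC, method = "spearman")^2
# the complete model correlates perfectly with itself, so ABC = 1:
varPart(A = A, B = B, C = C, AB = AB, AC = AC, BC = BC, ABC = 1)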

varPart <- function(A, B, C = NA, AB, AC = NA, BC = NA, ABC = NA,
         model.type = NULL, A.name = "Factor A", B.name = "Factor B", 
         C.name = "Factor C", plot = TRUE, plot.digits = 3, cex.names = 1.5, 
         cex.values = 1.2, main = "", cex.main = 2, plot.unexpl = TRUE) {
 
  # version 1.7 (17 Mar 2017)
 
  if (!is.null(model.type)) message ("NOTE: Argument 'model.type' is no longer used.")
 
  partials <- c(A, B, C, AB, BC, AC, ABC)
  if (all(is.finite(partials[c(1:2, 4)])) && all(is.na(partials[c(3, 5:7)])))
    twofactors <- TRUE
  else if (all(is.finite(partials)))
    twofactors <- FALSE
  else stop ("You must provide numeric values for either A, B and AB (for variation partitioning among two factors) or A, B, C, AB, BC, AC and ABC (for variation partitioning among three factors). See Details.")

  if (!all(na.omit(partials) >= 0 & na.omit(partials) <= 1)) stop ("Values must be between 0 and 1.")

  totalexpl <- ifelse(twofactors, AB, ABC)
  unexpl <- 1 - totalexpl

  if (twofactors) {
    Apure <- totalexpl - B
    Bpure <- totalexpl - A
    ABoverlap <- totalexpl - Apure - Bpure
    output.names <- c(paste("Pure", A.name), paste("Pure", B.name),
                      paste0("Pure ", A.name, "_", B.name, " overlap"),
                      "Unexplained")
    results <- data.frame(c(Apure, Bpure, ABoverlap, unexpl),
                          row.names = output.names)
                          row.names = output.names)
 
  } else { # end if 2 factors
 
    Apure <- totalexpl - BC
    Bpure <- totalexpl - AC
    Cpure <- totalexpl - AB
    ABoverlap <- totalexpl - Apure - Bpure - C
    BCoverlap <- totalexpl - Bpure - Cpure - A
    ACoverlap <- totalexpl - Apure - Cpure - B
    ABCoverlap <- totalexpl - Apure - Bpure - Cpure - ABoverlap - BCoverlap - ACoverlap
    output.names <- c(paste("Pure", A.name),
                      paste("Pure", B.name),
                      paste("Pure", C.name),
                      paste0("Pure ", A.name, "_", B.name, " overlap"),
                      paste0("Pure ", B.name, "_", C.name, " overlap"),
                      paste0("Pure ", A.name, "_", C.name, " overlap"),
                      paste0(A.name, "_", B.name, "_", C.name, " overlap"),
                      "Unexplained")
    results <- data.frame(c(Apure, Bpure, Cpure, ABoverlap, BCoverlap,
                            ACoverlap, ABCoverlap, unexpl),
                          row.names = output.names)
  }  # end else
 
  colnames(results) "Proportion"
 
  if (plot) {  # adapted from Daniel's http://stackoverflow.com/questions/1428946/venn-diagrams-with-r
 
    circle <- function(x, y, r) {
      ang <- seq(0, 2*pi, length = 100)
      xx <- x + r * cos(ang)
      yy <- y + r * sin(ang)
      polygon(xx, yy)
    }  # end circle function (by Daniel)
 
    Apure <- round(Apure, plot.digits)  # shorten values for plotting
    Bpure <- round(Bpure, plot.digits)
    ABoverlap <- round(ABoverlap, plot.digits)
    if(!twofactors) {
      Cpure <- round(Cpure, plot.digits)
      BCoverlap <- round(BCoverlap, plot.digits)
      ACoverlap <- round(ACoverlap, plot.digits)
      ABCoverlap <- round(ABCoverlap, plot.digits)
    }
    }
 
    if (twofactors) {
      plot(0, 0, ylim = c(-1, 10), xlim = c(-1, 7), type = "n", axes = FALSE,
           ylab = "", xlab = "", main = main, cex.main = cex.main)
      circle(3,3,3)
      circle(3,6,3)
      text(x = c(3, 3), y = c(9.5, -0.5), labels = c(A.name, B.name),
           cex = cex.names)
      text(x = c(3, 3, 3), y = c(7, 4.75, 2), c(Apure, ABoverlap, Bpure),
           cex = cex.values)
 
    } else { # end if 2 factors
 
      plot(0, 0, ylim = c(-1, 10), xlim = c(-1, 10), type = "n", axes = FALSE,
           ylab = "", xlab = "", main = main, cex.main = cex.main)
      circle(3, 6, 3)
      circle(6, 6, 3)
      circle(4.5, 3,  3)
      text(x = c(2.5, 6.5, 4.5), y = c(9.5, 9.5, -0.5),
           labels = c(A.name, B.name, C.name), cex = cex.names, adj = c(0.5, 0.5, 0))
      text(x = c(1.8, 7.2, 4.5, 4.5, 2.8, 6.2, 4.5), y = c(6.6, 6.6, 2, 7, 4, 4, 5), labels = c(Apure, Bpure, Cpure, ABoverlap, ACoverlap, BCoverlap, ABCoverlap), cex = cex.values)
    } # end if 2 factors else
 
    if (plot.unexpl)  {
      rect(-1, -1, 10, 10)
      text(x = -0.9, y = -0.2, label = paste0("Unexplained\n", round(unexpl, plot.digits)), adj = 0, cex = cex.values)
    }
 
  }  # end if plot
  results
}


You just need to copy the function above, paste it into R and press enter. An example of how to use it then is provided below. The results are returned numerically and also plotted in a diagram:

# if you have a linear model (LM), use (non-adjusted) R-squared values 
# for each factor and for their combinations as inputs:
 
varPart(A = 0.456, B = 0.315, C = 0.281, AB = 0.051, BC = 0.444, 
AC = 0.569, ABC = 0.624, A.name = "Spatial", B.name = "Human", 
C.name = "Environmental", main = "Small whale")
 
# if you have a generalized linear model (GLM),
# you can use squared correlation coefficients 
# of the predictions of each factor with those of the complete model:
 
varPart(A = (-0.005)^2, B = 0.698^2, C = 0.922^2, AB = 0.696^2, 
BC = 0.994^2, AC = 0.953^2, ABC = 1, A.name = "Topographic", 
B.name = "Climatic", C.name = "Geographic", main = "Big bird", 
plot.unexpl = FALSE)

[varPart_GLM plot]

You can change the number of digits in the plotted values with the argument plot.digits, and the character sizes of the plot's main title, circle names and values with the arguments cex.main, cex.names and cex.values. Note that these results derive from simple arithmetic operations on your input values, and they always sum to 1; if your input is incorrect, the results will be incorrect as well.
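
For example, a minimal two-factor call (the input values are made up, only to illustrate these plotting arguments):

varPart(A = 0.3, B = 0.2, AB = 0.4, A.name = "Climate", B.name = "Land use",
        main = "Example", plot.digits = 2, cex.main = 1.5, cex.names = 1.2,
        cex.values = 1)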

References

Barbosa A.M., Brown J.A., Jiménez-Valverde A. & Real R. (2014) modEvA – an R package for model evaluation and analysis. R package, version 0.1.

Borcard D., Legendre P. & Drapeau P. (1992) Partialling out the spatial component of ecological variation. Ecology 73: 1045-1055.

De Araújo C.B., Marcondes-Machado L.O. & Costa G.C. (2014) The importance of biotic interactions in species distribution models: a test of the Eltonian noise hypothesis using parrots. Journal of Biogeography 41: 513-523 (DOI: 10.1111/jbi.12234).

Muñoz A.-R. & Real R. (2006) Assessing the potential range expansion of the exotic monk parakeet in Spain. Diversity and Distributions 12: 656-665.

Real R., Barbosa A.M., Porras D., Kin M.S., Márquez A.L., Guerrero J.C., Palomo L.J., Justo E.R. & Vargas J.M. (2003) Relative importance of environment, human activity and spatial situation in determining the distribution of terrestrial mammal diversity in Argentina. Journal of Biogeography 30: 939-947.

Real R., Romero D., Olivero J., Estrada A. & Márquez A.L. (2013) Estimating how inflated or obscured effects of climate affect forecasted species distribution. PLoS ONE 8: e53646 (DOI: 10.1371/journal.pone.0053646).

Ribas A., Barbosa A.M., Casanova J.C., Real R., Feliu C. & Vargas J.M. (2006) Geographical patterns of the species richness of helminth parasites of moles (Talpa spp.) in Spain: separating the effect of sampling effort from those of other conditioning factors. Vie et Milieu 56: 1-8.

Favourability: a GLM for prevalence-independent modelling of binary responses

Logistic regression (Generalised Linear Model with binomial error distribution and a logit link) is widely used for modelling species’ potential distributions using presence/absence data and a set of categorical or continuous predictor variables. However, this GLM incorporates the prevalence (relative proportion of presences and absences) of the species in the training sample, which affects the probability values produced.

Barbosa (2006) and Real, Barbosa & Vargas (2006) proposed an environmental favourability function which is based on logistic regression but cancels out uneven proportions of presences and absences in the modelled data. Favourability thus assesses the extent to which the environmental conditions change the probability of occurrence of a species with respect to its overall prevalence in the study area. Model predictions can, therefore, be directly compared between species with different prevalences. The favourability function has been implemented in SAM software and is now also included in the fuzzySim R package (Barbosa, 2015).
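
In the RBV formulation (as computed by the Fav function below), if P is the probability predicted by the logistic model and n1 and n0 are the numbers of presences and absences in the modelled sample, favourability is:

F = (P / (1 - P)) / ((n1 / n0) + (P / (1 - P)))

With a balanced sample (n1 = n0), F equals P; otherwise, the effect of sample prevalence is cancelled out.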

Using simulated data, Albert & Thuiller (2008) proposed a modification to the favourability function, but it requires knowing the true prevalence of the species (not just the prevalence in the studied sample), which is rarely possible in real-world modelling. Besides, this suggestion was based on the misunderstanding that the favourability function was a way to obtain the probability of occurrence when prevalence differs from 50%, which is incorrect (see Acevedo & Real 2012).

To get environmental favourability in R with either the Real, Barbosa & Vargas ("RBV") or the Albert & Thuiller ("AT") method, you just need to fit a probabilistic model (e.g. logistic regression) to your data and then use the Fav function given below. Input for this function is either a model object produced by the glm function, or the presences-absences (1-0) of your species together with the corresponding predicted probability values, obtained e.g. with predict(mymodel, mydata, type = "response"). Instead of the presences-absences, you can alternatively provide the sample prevalence or the numbers of presences and absences. If you want to use the "AT" method, you also need to provide the true (absolute) prevalence of your species.

Fav <- function(obs = NULL, pred = NULL, n1n0 = NULL, model = NULL, sample.preval = NULL, method = "RBV", true.preval = NULL) {
  # version 1.2 (25 Mar 2014)
  # obs: a vector of 1-0 values of a modelled binary variable
  # pred: a vector of predicted probability values given e.g. by logistic regression (note: non-probabilistic predicted values are not suitable for the Fav function!)
  # n1n0: alternatively to obs, the total number of ones and zeros, in this order -- e.g. n1n0 = c(214, 317); ignored if the 'obs' vector is provided
  # sample.preval: alternatively to obs or n1n0, the prevalence (proportion of positive cases) of the modelled binary variable in the modelled data
  # model: alternatively to all of the above, a model object of class "glm" (if provided, will override any values provided in the arguments described above)
  # method: either "RBV" for the original Real, Barbosa & Vargas (2006) procedure, or "AT" for the modification proposed by Albert & Thuiller (2008) (but see Acevedo & Real 2012, Naturwissenschaften 99: 515-522)
  # true.preval: the true prevalence (as opposed to sample prevalence); necessary if you want to use the AT method
 
  if (!is.null(model)) {
    if (!is.null(pred)) message("Argument 'pred' ignored in favour of 'model'.")
    if (!is.null(obs)) message("Argument 'obs' ignored in favour of 'model'.")
    if (!is.null(n1n0)) message("Argument 'n1n0' ignored in favour of 'model'.")
    if (!is.null(sample.preval)) message("Argument 'sample.preval' ignored in favour of 'model'.")
    obs <- model$y
    pred <- model$fitted.values
  }  # end if model
 
  if(!is.null(obs) & !is.null(pred) & length(obs) != length(pred)) {
    stop("'obs' and 'pred' must have the same length (and be in the same order).")
  }
 
  if (!is.null(obs)) {
    n1 <- sum(obs == 1, na.rm = TRUE)
    n0 <- sum(obs == 0, na.rm = TRUE)
  }  # end if obs
 
  else if (!is.null(n1n0)) {
    if(!is.null(model)) message("Argument 'n1n0' ignored in favour of 'model'")
    else if (!is.null(obs)) message("Argument 'n1n0' ignored in favour of 'obs'")
    n1 <- n1n0[1]
    n0 <- n1n0[2]
    #if(n1 + n0 != length(pred)) stop("n1+n0 must equal the length of 'pred'.")
  }  # end if n1n0
 
  else if (!is.null(sample.preval)) {
    if(!is.null(model)) message("Argument 'sample.preval' ignored in favour of 'model'")
    else if (!is.null(obs)) message("Argument 'sample.preval' ignored in favour of 'obs'")
    else if (!is.null(n1n0)) message("Argument 'sample.preval' ignored in favour of 'n1n0'")
    n1 <- sample.preval * 100
    n0 <- 100 - n1
  }  # end if sample.preval
 
  else stop("You need to provide either 'model', or 'obs' plus either one of 'pred', 'n1n0' or 'sample.preval'.")
 
  if(method == "RBV") {  # Real, Barbosa & Vargas 2006
    fav <- (pred / (1 - pred)) / ((n1 / n0) + (pred / (1 - pred)))
  }
 
  else if(method == "AT") {  # Albert & Thuiller 2008; but see Acevedo & Real 2012
    sample.preval <- n1 / (n1 + n0)
    fav <- (pred / (1 - pred)) / ((sample.preval / true.preval) + (pred / (1 - pred)))
  }
 
  else stop("method must be either 'RBV' or 'AT'")
 
  return(fav)
} # end Fav function


Usage examples:

Fav(model = mymodel)

Fav(obs = mydata$myspecies, pred = mydata$myspecies.P)

Fav(pred = mydata$myspecies.P, sample.preval = 0.42)

Fav(pred = mydata$myspecies.P, n1n0 = c(824, 1430))
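
As a quick sanity check (with made-up values): in a balanced sample, favourability equals the predicted probability, since prevalence then has no effect:

Fav(pred = c(0.2, 0.5, 0.9), n1n0 = c(100, 100))
# [1] 0.2 0.5 0.9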

References:

Acevedo P. & Real R. (2012) Favourability: concept, distinctive characteristics and potential usefulness. Naturwissenschaften 99: 515-522

Albert C.H. & Thuiller W. (2008) Favourability functions versus probability of presence: advantages and misusesEcography 31: 417-422.

Barbosa A.M. (2006) Modelación de relaciones biogeográficas entre predadores, presas y parásitos: implicaciones para la conservación de mamíferos en la Península Ibérica. PhD Thesis, University of Málaga (Spain).

Barbosa A.M. (2015) fuzzySim: applying fuzzy logic to binary similarity indices in ecology. Methods in Ecology and Evolution, 6: 853-858

Real R., Barbosa A.M. & Vargas J.M. (2006) Obtaining environmental favourability functions from logistic regressionEnvironmental and Ecological Statistics 13: 237-245.

False Discovery Rate (FDR)

It is common in ecology to search for statistical relationships between species' occurrence and a set of predictor variables. However, when a large number of variables is analysed (compared to the number of observations), false findings may arise from repeated testing. García (2003) recommended controlling the false discovery rate (FDR; Benjamini & Hochberg 1995) in ecological studies, and Benjamini & Yekutieli (2001) adapted the procedure to allow for dependency among test statistics. The p.adjust function in R performs these and other corrections to the significance (p) values of variables under repeated testing. The FDR function, now included in the fuzzySim package (Barbosa, 2015), performs bivariate regressions (either linear or binary logistic) of a set of predictor variables on one response variable, or uses already-obtained p-values for a set of predictor variables. It then adjusts the p-values with p.adjust and shows which variables should be retained for, or excluded from, further multivariate analysis according to their corrected p-values (see, for example, Barbosa, Real & Vargas 2009). Besides the p-values, this function also reports the individual AIC and BIC of each variable.
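
The correction itself relies on R's built-in p.adjust function; a minimal illustration with made-up p-values:

p <- c(0.001, 0.01, 0.02, 0.04, 0.2)
p.adjust(p, method = "BH")  # Benjamini & Hochberg adjusted p-values
p.adjust(p, method = "BY")  # Benjamini & Yekutieli, allows dependent tests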

The FDR function uses the Benjamini & Hochberg ("BH") correction by default, but check the documentation of the p.adjust function for other available methods, namely "BY", which allows for non-independent data (Benjamini & Yekutieli, 2001) and is the default e.g. in function multGLM. Input data may be a matrix or data frame containing the response variable (for example, the presence-absence or abundance of a species) and the predictor variables. Alternatively, if you have already performed the bivariate regressions and have a set of variables with corresponding p-values which you want to correct with FDR, get a data frame with your variables' names in the first column and their p-values in the second column, and supply it as the pvalues argument to the FDR function (no need to provide data, sp.cols or var.cols in this case). Note that the input data format for this function changed in mid May 2014, with former parameters response and predictors replaced with data, sp.cols and var.cols (for coherence and compatibility with multGLM). Note also that, as of late October 2015, argument model.type is deprecated, as this information is included in argument family; e.g., if you want linear models (LM), you can set family = "gaussian". You can let the default family = "auto" figure out the family automatically.

FDR <- function(data = NULL, sp.cols = NULL, var.cols = NULL, pvalues = NULL, 
    model.type = NULL, family = "auto", correction = "fdr", 
    q = 0.05, verbose = TRUE, simplif = FALSE) {
    # version 3.6 (26 Apr 2016)
    if (length(sp.cols) > 1) 
        stop("Sorry, FDR is currently implemented for only one response variable at a time, so 'sp.cols' must indicate only one column")
    if (!is.null(model.type)) 
        warning("Argument 'model.type' is deprecated and now ignored, as this info is included in 'family' (e.g. 'gaussian' for LM, 'binomial' or 'poisson' for GLM).")
    model.type <- "GLM"
    n.init <- nrow(data)
    data <- data[is.finite(data[, sp.cols]), ]
    na.loss <- n.init - nrow(data)
    if (na.loss > 0) 
        message(na.loss, " cases excluded due to missing or non-finite values.")
    if (family == "auto") {
        if (all(data[, sp.cols] %in% c(0, 1))) 
            family <- "binomial"
        else if (all(data[, sp.cols] >= 0 && data[, sp.cols]%%1 == 
            0)) 
            family <- "poisson"
        else family <- "gaussian"
        if (verbose) 
            message("\nUsing generalized linear models of family '", 
                family, "'.\n")
    }
    if (!(correction %in% p.adjust.methods)) 
        stop("Invalid correction method.\nType 'p.adjust.methods' for available options.")
    response <- data[, sp.cols]
    predictors <- data[, var.cols]
    if (!is.null(pvalues)) {
        coeffs <- aic <- bic <- FALSE
        p.bivar <- pvalues[, 2]
        names(p.bivar) <- pvalues[, 1]
    }
    else {
        coeffs <- aic <- bic <- TRUE
        if (is.null(ncol(predictors))) 
            stop("You need more than one predictor to calculate the FDR.")
        p.bivar <- coef.bivar <- aic.bivar <- bic.bivar <- vector("numeric", 
            length = ncol(predictors))
        for (i in 1:length(p.bivar)) {
            model <- glm(response ~ predictors[, i], family = family)
            p.bivar[i] <- anova(model, test = "Chi")[, "Pr(>Chi)"][2]
            coef.bivar[i] <- model[["coefficients"]][2]
            aic.bivar[i] <- extractAIC(model, k = 2)[2]
            bic.bivar[i] <- extractAIC(model, k = log(nobs(model)))[2]
            if (is.na(p.bivar[i])) 
                message("A p-value could not be calculated for var.col number ", 
                  i)
            if (is.na(aic.bivar[i])) 
                message("AIC could not be calculated for var.col number ", 
                  i)
            if (is.na(bic.bivar[i])) 
                message("BIC could not be calculated for var.col number ", 
                  i)
        }
        rm(i)
    }
    if (coeffs) {
        results <- data.frame(cbind(coef.bivar, aic.bivar, bic.bivar, 
            p.bivar), row.names = names(predictors))
        names(results) <- c("bivariate.coeff", "AIC", "BIC", 
            "p.value")
        results <- results[order(results[, "p.value"]), ]
        results[, "p.adjusted"] <- p.adjust(results[, "p.value"], 
            method = correction)
    }
    else {  # AIC and BIC are not computed when 'pvalues' are supplied directly
        results <- data.frame(p.value = p.bivar, 
            row.names = pvalues[, 1])
        results <- results[order(results[, "p.value"]), ]
        results[, "p.adjusted"] <- p.adjust(results[, "p.value"], 
            method = correction)
    }
    p.adjusted <- NULL
    if (simplif) 
        return(results)
    exclude <- subset(results, p.adjusted > q)
    select <- subset(results, p.adjusted <= q)
    if (verbose) {
        message("Bivariate p-values adjusted with '", correction, 
            "' correction;\n", nrow(exclude), " variable(s) excluded, ", 
            nrow(select), " selected (with q = ", q, ")\n")
    }
    list(exclude = exclude, select = select)
}


Usage example:

FDR(data = mydata, sp.cols = 2, var.cols = 5:17)

FDR(data = mydata, sp.cols = 2, var.cols = 5:17, correction = "BY")

(install fuzzySim and see the help file ?FDR for reproducible examples with sample data)
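
If you already have bivariate p-values (the values below are hypothetical, for illustration only), you can supply them directly through the pvalues argument instead of data, sp.cols and var.cols:

my.pvals <- data.frame(var = c("altitude", "slope", "rainfall"),
                       p = c(0.003, 0.02, 0.5))
FDR(pvalues = my.pvals)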

References:

Barbosa A.M. (2015) fuzzySim: applying fuzzy logic to binary similarity indices in ecology. Methods in Ecology and Evolution, 6: 853-858

Barbosa A.M., Real R. & Vargas J.M. (2009) Transferability of environmental favourability models in geographic space: The case of the Iberian desman (Galemys pyrenaicus) in Portugal and Spain. Ecological Modelling 220: 747-754

Benjamini Y. & Hochberg Y. (1995) Controlling the false discovery rate: a practical and powerful approach to multiple testing. Journal of the Royal Statistical Society, Series B 57: 289-300

Benjamini Y. & Yekutieli D. (2001) The control of the false discovery rate in multiple testing under dependency. Annals of Statistics 29: 1165-1188.

García L.V. (2003) Controlling the false discovery rate in ecological research. Trends in Ecology and Evolution 18: 553-554