Select among correlated variables based on a given criterion

Correlations among variables are problematic in multivariate models, as they inflate the variance of coefficients and thus may bias the interpretation of the effects of those variables on the response. One of the strategies to circumvent this problem is to eliminate a priori one from each pair of correlated variables, but it is not always straightforward to pick the right variable among these. The corSelect function selects among correlated variables based either on their (stepwise) variance inflation factor (VIF); or on their relationship with the response variable, by building a bivariate model of each individual variable against the response and choosing, among each of two correlated variables, the one with the best p-value, AIC or BIC.

This function is now included in fuzzySim and depends on other functions in the package, such as FDR and multicol. It is also included as a new option within function multGLM in this package, so that multiple models can be built based on uncorrelated variables chosen appropriately for each species.

corSelect <- function(data, sp.cols = NULL, var.cols, cor.thresh = 0.8, select = "p.value", ...) {

# version 1.6 (6 Jul 2017)

if (length(sp.cols) > 1) stop ("Sorry, 'corSelect' is currently implemented for only one 'sp.col' at a time.")

univar.criteria <- c("VIF")
 bivar.criteria <- c("p.value", "AIC", "BIC")

if (!(select %in% c(univar.criteria, bivar.criteria))) stop ("Invalid 'select' criterion.")

if (!is.null(sp.cols) & select %in% bivar.criteria) { <- nrow(data)
 data <- data[is.finite(data[ , sp.cols]), ]
 n.out <- nrow(data)
 if (n.out < warning ( - n.out, " observations removed due to missing data in 'sp.cols'; ", n.out, " observations actually evaluated.")

cor.mat <- cor(data[ , var.cols], ...)
 cor.mat[upper.tri(cor.mat, diag = TRUE)] <- NA
 high.cor.mat <- bivar.mat <- numeric(0)

if (max(abs(cor.mat), na.rm = TRUE) >= cor.thresh) {
 high.cor.rowcol <- as.matrix(which(abs(cor.mat) >= cor.thresh, arr.ind = TRUE))
 high.cor.inds <- sort(unique(as.vector(high.cor.rowcol)))
 high.cor.mat <- data.frame(var1 = rownames(cor.mat)[high.cor.rowcol[ , "row"]], var2 = colnames(cor.mat)[high.cor.rowcol[ , "col"]])
 high.cor.mat$corr <- NULL
 for (r in 1:nrow(high.cor.mat)) high.cor.mat$corr[r] <- cor.mat[high.cor.rowcol[ ,"row"][r], high.cor.rowcol[ ,"col"][r]]
 high.cor.mat <- high.cor.mat[order(abs(high.cor.mat$corr), decreasing = TRUE), ]

if (is.null(sp.cols) & select %in% bivar.criteria) {
 message("Bivariate relationships not assessable without a response variable ('sp.cols'). Returning high pairwise correlations among 'var.cols'.")
 return (high.cor.mat)

high.cor.vars <- unique(rownames(cor.mat[high.cor.inds, high.cor.inds]))

if (select %in% bivar.criteria) {
 bivar.mat <- FDR(data = data, sp.cols = sp.cols, var.cols = match(high.cor.vars, colnames(data)), simplif = TRUE)[ , c("p.value", "AIC", "BIC")]
 if (all.equal(order(bivar.mat[ , c("p.value")]), order(bivar.mat[ , c("AIC")]), order(bivar.mat[ , c("BIC")]))) message("Results identical using whether p-value, AIC or BIC to select variables.\n")
 else message("Results NOT identical using whether p-value, AIC or BIC to select variables.\n")
 } # end if select in bivar

data.remaining <- data[ , var.cols]
 while (max(abs(cor.mat), na.rm = TRUE) >= cor.thresh) {
 max.cor.ind <- which(abs(cor.mat) == max(abs(cor.mat), na.rm = TRUE), arr.ind = TRUE)
 v1 <- rownames(cor.mat)[max.cor.ind[1]]
 v2 <- colnames(cor.mat)[max.cor.ind[2]]
 if (select %in% univar.criteria) {
 vif <- multicol(data.remaining)
 targets <- vif[c(v1, v2), "VIF", drop = FALSE]
 } # end if univar
 else if (select %in% bivar.criteria) {
 targets <- bivar.mat[c(v1, v2), select, drop = FALSE]
 } # end if bivar

exclude <- which.max(as.matrix(targets)) <- rownames(targets)[exclude]
 excl.ind <- match(, colnames(cor.mat))
 data.remaining <- data.remaining[ , -excl.ind, drop = FALSE]
 cor.mat <- cor.mat[-excl.ind, -excl.ind, drop = FALSE]
 if (all( cor.mat <- numeric(0)
 if (length(cor.mat) == 0) break
 } # end while max

} # end if max > thresh

selected.vars <- colnames(cor.mat)
 selected.var.cols <- match(colnames(cor.mat), colnames(data))
 excluded.vars <- colnames(data)[var.cols][!(colnames(data)[var.cols] %in% colnames(cor.mat))]

vif2 <- multicol(data[ , selected.var.cols])

list(high.correlations = high.cor.mat,
 bivariate.significance = bivar.mat,
 excluded.vars = excluded.vars,
 selected.vars = selected.vars,
 selected.var.cols = selected.var.cols,
 strongest.remaining.corr = cor.mat[which.max(abs(cor.mat))],
 remaining.multicollinearity = vif2
} # end corSelect function


You can install and load fuzzySim (>=version 1.5) and then check help(corSelect) for reproducible usage examples.


Plot column(s) of a polygon vector map

NOTE: I recently had a tutorial on the cartography R package, which makes mapping columns of a data frame much less troublesome. You may want to look at that instead.

If you have a polygon vector map in R and want to quickly map the values in one or more columns of its attribute table, you can use the plotMapColumns function. There’s an option to rasterize the map before plotting, which may be considerably faster and is TRUE by default, but you’ll need to use an appropriate raster.upscale value. This is the number by which the range of coordinates should be divided to get the number of pixels for the maps to be plotted; it’s advised to first check your range(coordinates(map)) and see for yourself which raster.upscale divisor will make reasonably computable raster maps – e.g., for geographical lat-lon, an upscale factor of 1 will usually work (you’ll have at most 360 x 180 pixels; actually you may want to lower raster.upscale to 0.5 or 0.3 if you need more detailed maps); but for a UTM projection (whose coordinates can be much larger values) you may need an upscale.factor of 10000 to get a reasonably computable number of pixels.

plotMapColumns <- function(map, # SpatialPolygons object
                           columns, # index(es) of column(s) of map@data containing the values to plot (there will be one output map per column)
                           rasterize = TRUE, # number by which the difference between maximum and minimum coordinates should be divided to get the number of pixels (if rasterize = TRUE); it's advised to first calculate min and max coordinates and see for yourself which divisor will make reasonably computable raster maps (e.g., for geographical lat-lon an upscale factor of 1 may work, but for a UTM projection you may need an upscale of factor 10,000!)
                           raster.upscale = 1, 
                           ...) # additional arguments for (sp)plot function

  stopifnot(raster.upscale > 0 | is.null(raster.upscale),
            require(raster) | rasterize == FALSE
  if (!all(columns %in% 1:length(names(map)))) stop ("index out of bounds; 'columns' must exist in map@data.")
  if (rasterize) {
    xmin <- min(coordinates(map)[,1])
    xmax <- max(coordinates(map)[,1])
    ymin <- min(coordinates(map)[,2])
    ymax <- max(coordinates(map)[,2])
    wdth <- round(xmax - xmin)
    hght <- round(ymax - ymin)
    #if (raster.upscale == "auto") {
      #max.length <- max(wdth, hght)
      #if (max.length > 500) raster.upscale <-
    #if (!is.null(max.rast.dim)) {
    #  rast.dim <- wdth * hght
    wdth <- wdth / raster.upscale
    hght <- hght / raster.upscale
    message("plotting map(s) with ", wdth, "x", hght, " pixels; consider rising 'raster.upscale' if this is taking too long, or lowering it if the resulting maps are too coarse.")
    rast <- raster(nrows = hght, ncols = wdth, xmn = xmin, xmx = xmax, ymn = ymin, ymx = ymax)
  }  # end if rasterize I
  #if (centroids) {
  #  attr.table <- map@data
  #  map <- SpatialPointsDataFrame(coordinates(map))
  #  map@data <- attr.table
  #  rast <- raster(map)
  #} else {
  n.cols <- length(columns)
  col.count <- 0
  for (i in columns) {
    col.count <- col.count + 1
    message("Plotting column ", col.count, " of ", n.cols, "...")
    if (rasterize) {
      map.rast <- rasterize(x = map, y = rast, field = names(map)[i], fun = 'last')
      plot(map.rast, main = names(map)[i], ...)
    }  # end if rasterize II
    else {
      print(spplot(map, zcol = names(map)[i], main = names(map)[i], ...))
    }  # end else
  }  # end for i

[presented with Pretty R]

Usage example:

# download, unzip and import a map of countries:
download.file("", destfile = "")
countries <- rgdal::readOGR(dsn = ".", layer= "countries")
# see the data in the attributes table:
# use plotMapColumns with and without rasterizing:
plotMapColumns(countries, columns = 17:18, rasterize = TRUE, raster.upscale = 1)
plotMapColumns(countries, columns = 18, rasterize = FALSE)  # slower

You can add arguments for the (sp)plot function, to get e.g. different colour schemes. The plotMapColumns function is not (yet) included in a package.

Analyse and compare stepwise model predictions

This function builds a generalized linear model with forward stepwise inclusion of variables, using AIC as the selection criterion, and gives the values predicted at each step, as well as their correlation with the final model predictions. This can be useful to see if a model with just the first few variables could already provide predictions very close to the final ones (see e.g. Fig. 3 in Muñoz et al., 2005). This function is now included in the fuzzySim package (Barbosa, 2015; version 1.3 just released). The correlation coefficient (cor.method) is “pearson” by default, but can be set to “spearman” or “kendall” as well.

stepByStep <- function(data, # name of dataset
                       sp.col, # index of species (target variable) column
                       var.cols, # indices of variables columns
                       family = binomial(link = "logit"),
                       Favourability = FALSE, # used only if family=binomial(logit)
                       trace = 0, # argument for 'step'
                       cor.method = "pearson") {
# version 1.3 (22 Jul 2015)
n.init <- nrow(data)
data <- na.omit(data)
na.loss <- n.init - nrow(data)
if (na.loss > 0) message(na.loss, " cases excluded due to missing values.")
response <- colnames(data)[sp.col]
null.model.formula <- as.formula(paste(response, "~", 1))
scope.formula <- as.formula(paste("~", paste(colnames(data)[var.cols], collapse = "+")))
mod <- step(glm(null.model.formula, family = family, data = data), scope = scope.formula, direction = "forward", trace = trace) <- mod$fitted.values
if (!all(c("binomial", "logit") %in% mod$family)) Favourability <- FALSE
if (Favourability) <- Fav(model = mod)
model.vars.split <- sapply(mod$anova[ , 1], strsplit, split = " ")
model.vars <- lapply(model.vars.split, `[`, 2)
model.vars <- as.character(model.vars)[-1]
n.steps <- length(model.vars)
preds <- favs <- = nrow(data), ncol = n.steps))
cor.P <- cor.F <- vector("numeric", n.steps)
names(model.vars) <- names(preds) <- names(favs) <- names(cor.P) <- names(cor.F) <- paste0("step", 1:n.steps)
for (s in 1:n.steps) {
step.vars <- model.vars[1:s]
mod.step <- glm(as.formula(paste(response, "~", paste(step.vars, collapse = "+"))), family = family, data = data)
if (Favourability) {
favs[ , s] <- Fav(model = mod.step)
cor.F[s] <- cor(favs[ , s],, method = cor.method)
else {
preds[ , s] <- mod.step$fitted.values
cor.P[s] <- cor(preds[ , s],, method = cor.method)
}; rm(s)
if (Favourability) result <- list(predictions = favs, correlations = cor.F, variables = model.vars, model = mod)
else result <- list(predictions = preds, correlations = cor.P, variables = model.vars, model = mod)

Usage example:

stepByStep(data = rotif.env, sp.col = 18, var.cols = 5:17, cor.method = "spearman")
stepByStep(data = rotif.env, sp.col = 18, var.cols = 5:17, Favourability = TRUE, cor.method = "pearson")
stepByStep(data = rotif.env, sp.col = 18, var.cols = 5:17, family = poisson)

[presented with Pretty R]

Only forward selection is implemented for now. I may or may not eventually find the time to implement other selection methods.


Barbosa A.M. (2015) fuzzySim: applying fuzzy logic to binary similarity indices in ecology. Methods in Ecology and Evolution, 6: 853-858

Muñoz, A.R., Real R., BARBOSA A.M. & Vargas J.M. (2005) Modelling the distribution of Bonelli’s Eagle in Spain: Implications for conservation planning. Diversity and Distributions 11: 477-486

New publications on modTools functions

Two articles have recently been published based on modTools material: a short one in the Journal of Brief Ideas featuring function standard01, and another in Methods in Ecology and Evolution featuring package fuzzySim (wich now includes modelling functions such as multGLM, FDR, multicol, etc. — see previous post). Please cite the articles when you use the functions!

Barbosa, A.M. (2015) Re-scaling of model evaluation measures to allow direct comparison of their values. Journal of Brief Ideas, 18 Feb 2015, DOI: 10.5281/zenodo.15487

Barbosa A.M. (2015) fuzzySim: applying fuzzy logic to binary similarity indices in ecology. Methods in Ecology and Evolution, 6: 853-858

Several modEvA functions moved to fuzzySim

Just a heads-up to let you know that the modelling functions formerly included in the modEvA package have been moved to package fuzzySim for operative and editorial reasons. These functions are: multGLM, modelTrim, Fav, getPreds, FDR, percentTestData, integerCols, multConvert and multicol. The rotif.env sample dataset has also moved to fuzzySim. Package modEvA now has dataset rotif.mods and keeps only functions strictly related to model evaluation and analysis, and not to model building.

The new modEvA version (currently 0.9) also includes some bug fixes, including one that made the packaged varPart function calculate the ABCoverlap incorrectly – thanks to Jurica Levatic for spotting this.

Calculate zonal statistics from rasters in multiple zip files

This is a wrapper for the zonalFromZip function published in the previous post, for when you have multiple zip files with multiple raster files each (as in the WorldClim paleo-climate database), and you want to extract zonal statistics for them all automatically. To use it, you’ll need to have the zonalFromZip function loaded, as well as the raster R package.

zonalsFromZips <- function(zip.files, zones.rast, rast.file.ext = ".tif", aux.file.ext = NULL, verbosity = 1, ...) {
  results <- vector("list", length(zip.files))
  names(results) <- basename(tools::file_path_sans_ext(zip.files))
  for (f in 1:length(zip.files)) {
    message("\nUNZIPPING FILE ", f, " OF ", length(zip.files), " (", basename(zip.files[f]), ")...")
    results[[f]] <- zonalFromZip(zip.file = zip.files[f], zones.rast = zones.rast, rast.file.ext = rast.file.ext, aux.file.ext = aux.file.ext, verbosity = verbosity, ...)
  }; rm(f)
  message("\nFINISHED ALL!")
}  # end zonalsFromZips function


The result is a list of dataframes, each containing the zonal stats for one of the .zip files of rasters. Usage example:

LGM.zonals <- zonalsFromZips(zip.files = list.files("/home/joe/LGM", full.names = TRUE), zones.rast = utm10.raster)

[presented with Pretty R]


Calculate zonal statistics from rasters in a zip file

Imagine you have a zipped folder with a bunch of raster maps containing variables (e.g. the ones you can download from WorldClim or from CliMond), and you need to calculate zonal statistics from each of these rasters. The zonaFromZip function, provided below, automates this process without the need to unzip the folders. It extracts one raster at a time from the .zip, imports it to R, calculates zonal statistics for your zones raster map (the ‘mean’ function is used by default, but you can provide any other argument accepted by the zonal function of the R raster package), and then deletes the unzipped file before unzipping the next one, therefore requiring minimal disk space.

zonalFromZip <- function (zip.file, zones.rast, rast.file.ext = ".tif", aux.file.ext = NULL, delete.unzipped = TRUE, verbosity = 2, ...)
  # zip.file: path to the zip containing the raster maps to calculate zonal stats from
  # zones.rast: raster map (in your R workspace) containing the spatial units to calculate zonal stats to
  # rast.file.ext: file extension for the raster maps on disk
  # aux.file.ext: file extension for the auxiliary files (e.g. ".hdr" for .bil raster files, or ".rdc" for Idrisi .rst files)
  # ...: arguments to pass to the 'zonal' function from the 'raster' package
  rast.files <- unzip(zip.file, list = TRUE) $ Name
  var.names <- unique(tools::file_path_sans_ext(rast.files))
  n.var <- length(var.names)
  zonal.stats <- vector("list", length(var.names))
  names(zonal.stats) <- var.names
  for (i in 1:n.var) {
    if (verbosity >= 1) message("Getting variable ", i, " of ", n.var)
    if (verbosity >= 2) message("  - unzipping file...")
    unzip(zip.file, files = paste0(var.names[i], rast.file.ext))
    if (!is.null(aux.file.ext)) unzip(zip.file, files = paste0(var.names[i], aux.file.ext))
    var.rast <- raster(paste0(var.names[i], rast.file.ext))
    if (!compareRaster(var.rast, zones.rast, stopiffalse = FALSE)) {
      if (verbosity >= 2) message("  - cropping to zones raster...")
      var.rast <- crop(var.rast, zones.rast)
    if (verbosity >= 2) message("  - calculating zonal stats...")
    zonal.stats[[i]] <- raster::zonal(var.rast, zones.rast, ...)
    if (verbosity >= 2) message("  - deleting unzipped file...")
    if (delete.unzipped) {
      unlink(list.files()[grep(pattern = paste0(var.names[i], rast.file.ext), x = list.files())])
      if (!is.null(aux.file.ext)) unlink(list.files()[grep(pattern = paste0(var.names[i], aux.file.ext), x = list.files())])
  if (verbosity >= 1) message("Converting results to data frame...")
  zonal.stats <-
  zonal.stats <- subset(zonal.stats, select = c(1, seq(2, ncol(zonal.stats), 2)))
  colnames(zonal.stats)[1] <- "zone"
  colnames(zonal.stats)[-1] <- var.names
  if (verbosity >= 1) message("Finished!")


Mind that you need the raster R package installed for this, and a raster map of the spatial units (zones) to which you want to extract the raster variables.Usage examples:

LGM.CCSM4.utm10 <- zonalFromZip(zip.file = "LGM/", zones.rast = utm10.rst, rast.file.ext = ".tif", aux.file.ext = NULL)
WClim.utm10 <- zonalFromZip(zip.file = "", zones.rast = utm10.rst, rast.file.ext = ".bil", aux.file.ext = ".hdr")

Example for several .zip files within a folder at once (DEPRECATED – see next post’s zonalsFromZips function instead):

for (f in list.files("LGM")) {  # "LGM" is the folder containing the zip files to extract zonal stats from
  name <- paste("LGM", tools::file_path_sans_ext(f), "utm10", sep = ".")
  zonstat <- zonalFromZip(zip.file = paste("LGM", f, sep = "/"), zones.rast = u10.rst, raster.ext = ".tif", fun = "mean")
  assign(name, zonstat)

[presented with Pretty R]