\documentclass[english]{beamer} \usepackage{natbib} \usepackage[T1]{fontenc} \usepackage[latin1]{inputenc} \usepackage{amsmath} %\usepackage{multicolumn} \usepackage{color} \usepackage{amssymb} \usepackage{graphicx} \renewcommand{\emph}[1]{{\color{red} {\textbf{#1}}}} \bibliographystyle{notitle} \usetheme{Warsaw} \setbeamercovered{transparent} \usepackage{babel} \begin{document} \makeatletter \def\newblock{\beamer@newblock} \makeatother \SweaveOpts{height=7,width=10} \setkeys{Gin}{width=\textwidth} <>= ##library(deSolve) library(reshape) library(lattice) library(lme4) library(plotrix) #source("virulence_funs.R") @ <>= x <- read.csv("culcitalogreg.csv") x$ttt <- as.factor(x$ttt) ## treatment should be a factor cmat <- matrix(c(0,1,1,1,0,1,-1,0,0,1,1,-2),ncol=3) colnames(cmat) <- c("symb","crab.vs.shr","addsymb") ## cmat <- sweep(cmat,2,sqrt(colSums(cmat^2)),"/") ## normalize if desired: unnecessary contrasts(x$ttt) <- cmat ## contrasts: control vs symbiont (ttt1), ## crab vs shrimp (ttt2), ## 1 vs 2 symbionts (ttt3) x$block <- as.factor(x$block) ## not really necessary but logical x <- x[,1:4] ## don't really need the rest set.seed(1001) ## will be doing some simulations later @ <>= library(lattice) library(reshape) levels(x$ttt) <- c("none","shrimp","crabs","both") m <- melt(x[,1:3],id.vars=1:2) m2 <- recast(m,ttt~variable,fun.aggregate=mean) m3 <-recast(m,ttt+block~variable,fun.aggregate=sum) p <- with(m3,table(predation,ttt)) library(plotrix) @ %% NC State University Center for turfgrass environmental %% research & education, www.turfgrass.ncsu.edu \title[GLMM for ecologists]{Generalized linear mixed models for ecologists and evolutionary biologists} \author{Ben~Bolker, University of Florida} \institute{Harvard Forest} \date{27 February 2009} % \pgfdeclareimage[height=0.5cm]{uflogo}{letterhdwm} % \logo{\pgfuseimage{uflogo}} \AtBeginSubsection[]{ \frame{ \frametitle{Outline} \tableofcontents[currentsection,currentsubsection] } } \begin{frame} \titlepage \end{frame} % \beamerdefaultoverlayspecification{<+->} \begin{frame} \frametitle{Outline} \tableofcontents{} \end{frame} \section{Precursors} \subsection{Generalized linear models} \begin{frame} \frametitle{Generalized linear models (GLMs)} \begin{itemize} \item non-normal data, (some) nonlinear relationships \pause \item presence/absence, alive/dead (binomial); count data (Poisson, negative binomial) \pause \item nonlinearity via \emph{link} function $L$: response is nonlinear, but $L(\mbox{response})$ is linear (e.g. log/exponential, logit/logistic) \pause \item modeling via \emph{linear predictor} is very flexible: $L(\mbox{response}) = a + b_i + c x \ldots$ \pause \item stable, robust, efficient: typical applications are \emph{logistic regression} (binomial/logit link), \emph{Poisson regression} (Poisson/log link) \end{itemize} \end{frame} \begin{frame} \frametitle{Coral symbiont example} <>= op <- par(las=1,cex=1.5,bty="l") bb <- barplot(p,ylab="Number of blocks",xlab="Symbionts", main="# predation events") bloc <- apply(p,2,cumsum) midb <- rbind(bloc[1,]/2, (bloc[1,]+bloc[2,])/2, (bloc[2,]+bloc[3,])/2) text(bb[col(p)][p>0],midb[p>0],c(0,1,2)[row(p)][p>0]) par(op) @ \end{frame} \begin{frame} \frametitle{{\it Arabidopsis} example} <>= panel.stripplot2 <- function (x, y, jitter.data = FALSE, factor = 0.5, amount = NULL, horizontal = TRUE, groups = NULL, ...) { if (!any(is.finite(x) & is.finite(y))) return() panel.sizeplot(x = x, y = y, jitter.x = jitter.data && !horizontal, jitter.y = jitter.data && horizontal, factor = factor, amount = amount, groups = groups, horizontal = horizontal, ...) } load("Banta.RData") trellis.par.set(list(fontsize=list(text=20))) print(stripplot(jltf ~ amd|nutrient, data=within(dat.tf,jltf <-jitter(log(total.fruits+1), amount=0.05)), strip=strip.custom(strip.names=c(TRUE,TRUE)), groups=gen, type=c('p','a'), ylab="Log(1+fruit set)", main="panel: nutrient, color: genotype")) @ \end{frame} \subsection{Mixed models (LMMs)} \begin{frame} \frametitle{Random effects (RE)} \begin{itemize} \item examples: experimental or observational ``blocks'' (temporal, spatial); species or genera; individuals; genotypes \pause \item inference on \emph{population} of units rather than individual units \pause \item (?) units randomly selected from all possible units \pause \item (?) reasonably large number of units \end{itemize} \end{frame} \begin{frame} \frametitle{Mixed models: classical approach} \begin{itemize} \item Partition sums of squares, calculate null expectations if fixed effect is 0 (all coefficients $\beta_i=0$) or RE variance=0 \pause \item{Figure out numerator (model) \& denominator (residual) sums of squares and degrees of freedom \begin{itemize} \item \emph{Model SSQ, df}: variability explained by the ``effect'' (difference between model with and without the effect) and number of parameters used \item \emph{Residual SSQ, df}: variability caused by finite sample size (number of observations minus number ``used up'' by the model) \end{itemize}} \end{itemize} \end{frame} \begin{frame} \frametitle{Classical LMM cont.} \begin{itemize} \item Robust, practical \item{OK if \begin{itemize} \item data are normally distributed \item design is (nearly) \emph{balanced} \item design not too complicated (single RE, or nested REs) \\ (\emph{crossed} REs: e.g. year effects that apply across all spatial blocks) \end{itemize}} \end{itemize} \end{frame} \begin{frame} \frametitle{Mixed models: modern approach} \begin{itemize} \item Construct a \emph{likelihood} for the data ($\mbox{Prob}(\mbox{observing data}|\mbox{parameters})$) --- in mixed models, requires integrating over possible values of REs \pause \item e.g.: \begin{itemize} \item likelihood of $i$\textsuperscript{th} obs. in block $j$ is $L_{\mbox{\small Normal}}(x_{ij}|\theta_i,\sigma^2_w)$ \item likelihood of a particular block mean $\theta_j$ is $L_{\mbox{\small Normal}}(\theta_j|0,\sigma^2_b)$ \item overall likelihood is $\int L(x_{ij}|\theta_j,\sigma^2_w) L(\theta_j|0,\sigma^2_b) \, d\theta_j$ \end{itemize} \pause \item Figure out how to do the integral \end{itemize} \end{frame} \begin{frame} \frametitle{Shrinkage} <>= z<- subset(dat.tf,amd=="clipped" & nutrient=="1") m1 <- glm(total.fruits~gen-1,data=z,family="poisson") m2 <- glmer(total.fruits~1+(1|gen),data=z,family="poisson") tt <- table(z$gen) rr <- unlist(ranef(m2)$gen)[order(coef(m1))]+fixef(m2) m1s <- sort(coef(m1)) m1s[1:2] <- rep(-5,2) op <- par(las=1,cex=1.5,bty="l") plot(m1s,xlab="Genotype",ylab="Mean(log) fruit set", axes=FALSE,xlim=c(-0.5,25),main="Arabidopsis block estimates") axis(side=1) axis(side=2,at=c(-5,-3,0,3), labels=c(-15,-3,0,3)) ## ylim=c(-3,5)) gsd <- attr(VarCorr(m2)$gen,"stddev") gm <- fixef(m2) nseq <- seq(-3,6,length=50) nv <- dnorm(nseq,mean=gm,sd=gsd) polygon(c(rep(0,50),nv*10),c(rev(nseq),nseq),col="gray",xpd=NA) points(rr,pch=16,col=2) text(seq_along(rr),rr,tt[order(coef(m1))],pos=3,xpd=NA,cex=0.6) box() axis.break(axis=2,breakpos=-4) par(op) @ \end{frame} \begin{frame} \frametitle{RE examples} \begin{itemize} \item Coral symbionts: simple experimental blocks, RE affects intercept (overall probability of predation in block) \item {\it Arabidopsis}: region (3 levels, treated as fixed) / population / genotype: affects intercept (overall fruit set) as well as treatment effects (nutrients, herbivory, interaction) \end{itemize} \end{frame} \section{GLMMs} \subsection{Estimation} \begin{frame} \frametitle{Penalized quasi-likelihood (PQL)} \begin{itemize} \item alternate steps of estimating GLM given known block variances; estimate LMMs given GLM fit \pause \item flexible (allows spatial, temporal correlations, crossed REs, etc.) \pause \item but: ``quasi'' likelihood only (inference problems?) \pause \item \emph{biased} for small unit samples (e.g. counts $<5$, binomial with min(success,failure)$<5$) (!!) \citep{breslow_whither_2004} \pause \item nevertheless, \emph{widely} used: SAS {\tt PROC GLIMMIX}, R {\tt glmmPQL}: ``95\% of analyses of binary responses (n = 205), 92\% of Poisson responses with means less than 5 (n = 48) and 89\% of binomial responses with fewer than 5 successes per group (n = 38)'' \end{itemize} \end{frame} \begin{frame} \frametitle{Better methods} \begin{itemize} \item{\emph{Laplace approximation} \begin{itemize} \item approximate integral ($\int L(\mbox{data}|\mbox{block}) L(\mbox{block}|\mbox{block variance})$) by Taylor expansion \item considerably more accurate than PQL \item reasonably fast and flexible \end{itemize}} \pause \item{\emph{adaptive Gauss-Hermite quadrature} ([A]G[H]Q) \begin{itemize} \item compute additional terms in the integral \item most accurate \item slowest, hence not flexible (2--3 RE at most, maybe only 1) \end{itemize}} \end{itemize} \pause Becoming available: R {\tt lme4}, SAS {\tt PROC NLMIXED}, {\tt PROC GLIMMIX} (v. 9.2), Genstat {\tt GLMM} \end{frame} <>= mod1=glm(predation~ttt,binomial,data=x) library(lme4) mod2 <- glmer(predation~ttt+(1|block),family=binomial,data=x) ## AGQ mod2B <- glmer(predation~ttt+(1|block),family=binomial, nAGQ=8,data=x) library(glmmML) ## Laplace (default) mod3=glmmML(predation~ttt,family=binomial,data=x,cluster=block) ## AGQ (=GHQ) with 8 points mod3B=glmmML(predation~ttt,family=binomial,data=x,cluster=block, method="ghq") library(glmmADMB) mod4=glmm.admb(predation~ttt,random=~1,group="block", data=x,family="binomial",link="logit") detach("package:lme4") library(MASS) library(nlme) mod5=glmmPQL(predation~ttt,random=~1|block,family=binomial,data=x) pqlfix <- fixef(mod5) pqlran <- as.numeric(VarCorr(mod5)["(Intercept)","StdDev"]) detach("package:nlme") @ <>= library(lme4) fixefmat <- cbind(rbind(glm=coef(mod1), glmmPQL=pqlfix, glmer.Laplace=fixef(mod2), glmmML.Laplace=coef(mod3), glmmADMB.Laplace=mod4$b, glmer.AGQ=fixef(mod2B), glmmML.AGQ=coef(mod3B))) sdvec <- c(0,pqlran, attr(VarCorr(mod2)[[1]],"stddev"), mod3$sigma, NA, attr(VarCorr(mod2B)[[1]],"stddev"), mod3B$sigma) compmat <- round(cbind(fixefmat,sdran=sdvec),3) colnames(compmat) <- c("intercept","symbiont","crab.vs.shrimp","twosymb","sdran") @ \begin{frame} \frametitle{Comparison of coral symbiont results} <>= ## trellis.par.set(list(fontsize=list(text=14))) m <- melt(compmat) m$X1 <- factor(m$X1,levels=rownames(compmat)) m$X2 <- factor(m$X2,levels=colnames(compmat)) print(xyplot(X1~value|X2, data=m,pch=16, ## scales=list(x=list(relation="free")), scales=list(y=list(alternating=FALSE)), xlab="",ylab="")) @ \end{frame} \subsection{Inference} \begin{frame} \frametitle{General issues: testing RE significance} \begin{itemize} \item{Counting ``model'' df for REs \begin{itemize} \item how many parameters does a RE require? 1 (variance)? Or somewhere between 1 and $n$ (shrinkage estimator)? Hard to compute, and depends on the \emph{level of focus} \citep{vaida_conditional_2005} \end{itemize}} \pause \item{Boundary effects for RE testing \begin{itemize} \item most derivations of null distributions depend on the null-hypothesis value of the parameter being \emph{within} its feasible range (i.e. not on the boundary) \citep{molenberghs_likelihood_2007} \item REs may count for $<1$ df (typically $\approx$ 0.5) \item if ignored, tests are (slightly) conservative \end{itemize}} \end{itemize} \end{frame} \begin{frame} \frametitle{General issues: finite-sample issues (!)} How far are we from ``asymptopia''? \begin{itemize} \item{Many standard procedures are \emph{asymptotic} \begin{itemize} \item Likelihood Ratio Test \item AIC \end{itemize}} \pause \item{``Sample size'' may refer the number of RE \emph{units} --- often far more restricted than total number of data points, so finite-sample problems are more pressing} \pause \item{Degree of freedom counting is hard for complex designs: \emph{Kenward-Roger correction}} \end{itemize} \end{frame} \begin{frame} \frametitle{Specific procedures} \begin{itemize} \item \emph{Likelihood Ratio Test}: \\ unreliable for fixed effects except for large data sets (= large \# of RE units!) \pause \item{\emph{Wald} ($Z$, $\chi^2$, $t$ or $F$) tests \begin{itemize} \item crude approximation \item asymptotic (for non-overdispersed data?) or \ldots \item \ldots how do we count residual df? \item don't know if null distributions are correct \end{itemize}} \pause \item{AIC \begin{itemize} \item asymptotic \item could use AIC$_c$, but ? need residual df \end{itemize}} \end{itemize} \end{frame} \begin{frame} \frametitle{{\it Arabidopsis} results (sort of)} \begin{itemize} \item RE model selection by (Q)AIC: intercept difference among populations, genotypes vary in all parameters (1+nutrient+clipping+ nutrient $\times$ clipping) \item fixed effects: nutrient, others weak \end{itemize} \end{frame} \begin{frame} \frametitle{{\it Arabidopsis} genotype effects} <>= mq6 <- lmer(total.fruits ~ nutrient*amd + rack + status + (1|popu) + (amd*nutrient|gen), data=dat.tf, family="quasipoisson") mq6b <- update(mq6, .~. - amd - nutrient:amd) re.gen <- ranef(mq6b)[["gen"]] names(re.gen) <- c("Control", "Clipping", "Nutrients", "Clip+Nutr.") regs <- dat.tf[match(rownames(re.gen), dat.tf[["gen"]]), "reg"] nice.cols <- hcl(c(60,140, 220), c=50,l=67, alpha=.65)[as.numeric(regs)] symbs <- c(19, 17,15)[as.numeric(regs)] print( splom(re.gen , col=nice.cols, pch=symbs, cex=1.2) ) @ \end{frame} \begin{frame} \frametitle{Where are we?} \includegraphics{decision_tree2} \end{frame} \begin{frame} \frametitle{Now what?} \begin{itemize} \item MCMC (finicky, slow, dangerous, we have to ``go Bayesian'': specialized procedures for GLMMs, or WinBUGS translators? ({\tt glmmBUGS}, {\tt MCMCglmm}) \item quasi-Bayes {\tt mcmcsamp} in {\tt lme4} (unfinished!) \pause \item{parametric bootstrapping: \begin{itemize} \item fit null model to data \item simulate ``data'' from null model \item fit null and working model, compute likelihood diff. \item repeat to estimate null distribution \end{itemize}} \end{itemize} More info: \emph{\url{glmm.wikidot.com}} \end{frame} \begin{frame} \frametitle{Acknowledgements} \begin{itemize} \item Data: Josh Banta and Massimo Pigliucci ({\it Arabidopsis}); Adrian Stier and Sea McKeon (coral symbionts) \item Co-authors: Mollie Brooks, Connie Clark, Shane Geange, John Poulsen, Hank Stevens, Jada White \end{itemize} \end{frame} \renewcommand{\emph}[1]{{\em #1}} \begin{frame} \frametitle{References} \tiny \bibliography{glmm} \end{frame} \end{document}