## Poisson Regression: Horseshoe crab example ## Data set from Agresti, 2007. # Data: Study of nesting horseshoe crabs (J. Brockman, Ethology, 1996). # Each female crab in the study had a male crab attached to her in her nest. # The study investigated factors that affect whether the female crab had # any other males, called satellites, residing nearby her. # Explanatory variables thought possibly to affect this included the # female crab's color, spine condition, weight, and carapace width. # The response outcome for each female crab is her number of satellites. # Variable descriptions: # color: 1 - light medium, 2 - medium, 3 - dark medium, 4 - dark # spine: 1 - both good, 2 - one worn or broken, 3 - both worn or broken # width: carapace width in cm # satell: number of satellites # weight: weight in kg crab <- read.table("http://www.math.montana.edu/shancock/data/horseshoe.txt",header=T) crab$colorF <- factor(crab$color, levels=c(1,2,3,4), labels=c("LM","M","DM","D")) crab$spineF <- factor(crab$spine, levels=c(1,2,3), labels=c("Good","Soso","Poor")) # Summary statistics for each variable: summary(crab) # Plot all variables against one another plot(crab) boxplot(satell ~ color, data=crab, xlab="Color", ylab="No. of Satellites") boxplot(satell ~ spine, data=crab, xlab="Spine Condition", ylab="No. of Satellites") # Best predictors of satell? ## Poisson regression with one predictor crab.glm <- glm(satell ~ width, family = poisson(link = "log"), data = crab) # Note that the log link is the default for poisson, # so we could have used # family = poisson summary(crab.glm) # How well does the model fit? # Add fitted model to plot. plot(satell~width, xlab="Width (cm)", ylab="No. of Satellites", data=crab) co <- crab.glm$coefficients curve(exp(co[1]+co[2]*x),add=T) ## Check for overdispersion: # Create 8 width categories used in book # 1 = <23.25 # 2 = 23.25-24.25 # 3 = 24.25-25.25 # 4 = 25.25-26.25 # 5 = 26.25-27.25 # 6 = 27.25-28.25 # 7 = 28.25-29.25 # 8 = >29.25 - Note: Book error = 30.25 crab$width.grp <- cut(crab$width, breaks = c(0, seq(23.25,29.25,by=1), 35), labels=c(1:8)) head(cbind(crab$width, crab$width.grp)) # cut function creates factor (R treats as nominal): is.factor(crab$width.grp) # numeric vs. factor --> side-by-side boxplots for default plot: plot(satell ~ width.grp, data=crab) xtabs(~ satell+width.grp, data=crab) # Look at fitted model over the means for each group. means <- tapply(crab$satell, crab$width.grp, mean) plot(seq(22.75,29.75,by=1),means,xlab="Width",ylab="Mean No. of Satellites") curve(exp(co[1]+co[2]*x),add=T) ## Check for overdispersion means vars <- tapply(crab$satell, crab$width.grp, var) vars # The variances in each group are much larger than the means for each group, # indicating overdispersion is present. # --> May indicate need to include other predictors to reduce unexplained variability. ## Poisson regression with two predictors # Treating color as categorical crab.glm2 <- glm(satell ~ width*colorF, data=crab, family=poisson) # Note that width*colorF is equivalent to # 1 + width + colorF + width:colorF summary(crab.glm2) # How to interpret coefficient estimates? ## Plot # of satellites vs. width by color plot(satell ~ width, data=crab, xlab="Width (cm)", ylab="No. of Satellites", pch=color, col=(color+1)) legend("topright",c("LM","M","DM","D"),pch=c(1,2,3,4),col=c(2,3,4,5)) # Add fitted models by color co <- crab.glm2$coef # LM curve(exp(co[1]+co[2]*x), col=2, add=TRUE) # M curve(exp(co[1]+co[3] + (co[2]+co[6])*x), col=3, add=TRUE) # DM curve(exp(co[1]+co[4] + (co[2]+co[7])*x), col=4, add=TRUE) # D curve(exp(co[1]+co[5] + (co[2]+co[8])*x), col=5, add=TRUE) ### Additive model? (no interaction) crab.glm2a <- glm(satell ~ width + colorF, data=crab, family=poisson) co <- crab.glm2a$coef # LM curve(exp(co[1]+co[2]*x), col=2, add=TRUE, lty=2) # M curve(exp(co[1]+co[3] + (co[2])*x), col=3, add=TRUE, lty=2) # DM curve(exp(co[1]+co[4] + (co[2])*x), col=4, add=TRUE, lty=2) # D curve(exp(co[1]+co[5] + (co[2])*x), col=5, add=TRUE, lty=2) ## Poisson regression with two predictors # Treating color as quantitative (is this valid?) crab.glm3 <- glm(satell ~ width*color, data=crab, family=poisson) summary(crab.glm3) # Note significant interaction term. ## Add fitted model where color is quantitative to plot: co <- crab.glm3$coef # LM curve(exp(co[1] + co[2]*x + co[3]*1 + co[4]*x*1), col=2, add=TRUE, lty=3, lwd=2) # M curve(exp(co[1] + co[2]*x + co[3]*2 + co[4]*x*2), col=3, add=TRUE, lty=3, lwd=2) # DM curve(exp(co[1] + co[2]*x + co[3]*3 + co[4]*x*3), col=4, add=TRUE, lty=3, lwd=2) # D curve(exp(co[1] + co[2]*x + co[3]*4 + co[4]*x*4), col=5, add=TRUE, lty=3, lwd=2) # Effect of a one cm increase in width for each color: for(i in 1:4){ print(exp(crab.glm3$coef[2]+crab.glm3$coef[4]*i)) } # Visualize fitted model: plot(satell ~ width, pch=color, col=(color+1), cex=2, data=crab) co2 <- crab.glm2$coef for(i in 1:4){ curve(exp(co2[1]+co2[2]*x+co2[3]*i+co2[4]*x*i),add=T,col=i+1,lty=i+1) } legend("topleft",c("Light Med","Med","Dark Med","Dark"),col=2:5,lty=2:5)