## Poisson Regression: Horseshoe crab example

## Data set from Agresti, 2007.

# Data: Study of nesting horseshoe crabs (J. Brockman, Ethology, 1996).
# Each female crab in the study had a male crab attached to her in her nest.
# The study investigated factors that affect whether the female crab had
# any other males, called satellites, residing nearby her.
# Explanatory variables thought possibly to affect this included the
# female crab's color, spine condition, weight, and carapace width.
# The response outcome for each female crab is her number of satellites.

# Variable descriptions:
# color: 1 - light medium, 2 - medium, 3 - dark medium, 4 - dark
# spine: 1 - both good, 2 - one worn or broken, 3 - both worn or broken
# width: carapace width in cm
# satell: number of satellites
# weight: weight in kg

crab <- read.table("http://www.math.montana.edu/shancock/data/horseshoe.txt",header=T)
crab$colorF <- factor(crab$color, levels=c(1,2,3,4), labels=c("LM","M","DM","D"))
crab$spineF <- factor(crab$spine, levels=c(1,2,3), labels=c("Good","Soso","Poor"))


# Summary statistics for each variable:
summary(crab)

# Plot all variables against one another
plot(crab)

boxplot(satell ~ color, data=crab, xlab="Color", ylab="No. of Satellites")
boxplot(satell ~ spine, data=crab, xlab="Spine Condition", ylab="No. of Satellites")
# Best predictors of satell?


## Poisson regression with one predictor
crab.glm <- glm(satell ~ width, family = poisson(link = "log"), data = crab)
# Note that the log link is the default for poisson, 
# so we could have used
# family = poisson
summary(crab.glm)

# How well does the model fit?
# Add fitted model to plot.
plot(satell~width, xlab="Width (cm)", ylab="No. of Satellites", data=crab)
co <- crab.glm$coefficients
curve(exp(co[1]+co[2]*x),add=T)

## Check for overdispersion:
# Create 8 width categories used in book
# 1 = <23.25
# 2 = 23.25-24.25
# 3 = 24.25-25.25
# 4 = 25.25-26.25
# 5 = 26.25-27.25
# 6 = 27.25-28.25
# 7 = 28.25-29.25
# 8 = >29.25 - Note: Book error = 30.25
crab$width.grp <- cut(crab$width, breaks = c(0, seq(23.25,29.25,by=1), 35), labels=c(1:8))
head(cbind(crab$width, crab$width.grp))

# cut function creates factor (R treats as nominal):
is.factor(crab$width.grp)
# numeric vs. factor --> side-by-side boxplots for default plot:
plot(satell ~ width.grp, data=crab)
xtabs(~ satell+width.grp, data=crab)

# Look at fitted model over the means for each group.
means <- tapply(crab$satell, crab$width.grp, mean)
plot(seq(22.75,29.75,by=1),means,xlab="Width",ylab="Mean No. of Satellites")
curve(exp(co[1]+co[2]*x),add=T)

## Check for overdispersion
means
vars <- tapply(crab$satell, crab$width.grp, var)
vars
# The variances in each group are much larger than the means for each group,
# indicating overdispersion is present.
# --> May indicate need to include other predictors to reduce unexplained variability.


## Poisson regression with two predictors
# Treating color as categorical
crab.glm2 <- glm(satell ~ width*colorF, data=crab, family=poisson)
# Note that width*colorF is equivalent to
# 1 + width + colorF + width:colorF
summary(crab.glm2)
# How to interpret coefficient estimates?

## Plot # of satellites vs. width by color
plot(satell ~ width, data=crab, xlab="Width (cm)", ylab="No. of Satellites", pch=color, col=(color+1))
legend("topright",c("LM","M","DM","D"),pch=c(1,2,3,4),col=c(2,3,4,5))
# Add fitted models by color
co <- crab.glm2$coef
# LM
curve(exp(co[1]+co[2]*x), col=2, add=TRUE)
# M
curve(exp(co[1]+co[3] + (co[2]+co[6])*x), col=3, add=TRUE)
# DM
curve(exp(co[1]+co[4] + (co[2]+co[7])*x), col=4, add=TRUE)
# D
curve(exp(co[1]+co[5] + (co[2]+co[8])*x), col=5, add=TRUE)

### Additive model? (no interaction)
crab.glm2a <- glm(satell ~ width + colorF, data=crab, family=poisson)
co <- crab.glm2a$coef
# LM
curve(exp(co[1]+co[2]*x), col=2, add=TRUE, lty=2)
# M
curve(exp(co[1]+co[3] + (co[2])*x), col=3, add=TRUE, lty=2)
# DM
curve(exp(co[1]+co[4] + (co[2])*x), col=4, add=TRUE, lty=2)
# D
curve(exp(co[1]+co[5] + (co[2])*x), col=5, add=TRUE, lty=2)


## Poisson regression with two predictors
# Treating color as quantitative (is this valid?)
crab.glm3 <- glm(satell ~ width*color, data=crab, family=poisson)
summary(crab.glm3)
# Note significant interaction term.

## Add fitted model where color is quantitative to plot:
co <- crab.glm3$coef
# LM
curve(exp(co[1] + co[2]*x + co[3]*1 + co[4]*x*1), col=2, add=TRUE, lty=3, lwd=2)
# M
curve(exp(co[1] + co[2]*x + co[3]*2 + co[4]*x*2), col=3, add=TRUE, lty=3, lwd=2)
# DM
curve(exp(co[1] + co[2]*x + co[3]*3 + co[4]*x*3), col=4, add=TRUE, lty=3, lwd=2)
# D
curve(exp(co[1] + co[2]*x + co[3]*4 + co[4]*x*4), col=5, add=TRUE, lty=3, lwd=2)


# Effect of a one cm increase in width for each color:
for(i in 1:4){
	print(exp(crab.glm3$coef[2]+crab.glm3$coef[4]*i))
}

# Visualize fitted model:
plot(satell ~ width, pch=color, col=(color+1), cex=2, data=crab)
co2 <- crab.glm2$coef
for(i in 1:4){
	curve(exp(co2[1]+co2[2]*x+co2[3]*i+co2[4]*x*i),add=T,col=i+1,lty=i+1)
}
legend("topleft",c("Light Med","Med","Dark Med","Dark"),col=2:5,lty=2:5)