# Data from Collett, 1991:
# Measure proportion of seeds of two species of Orobanche
# that germinate in bean and cucumber root extracts (hosts).

# Built-in dataset in the dispmod library
# You may need to first install this package - uncomment
# the following line:
# install.packages("dispmod", install.dependencies = TRUE)
library(dispmod)
data(orobanche)
?orobanche  # Data description
View(orobanche)

# Both predictors are categorical -> visualize?
orobanche$prop <- orobanche$germinated/orobanche$seeds
boxplot(prop ~ host + variety, data = orobanche)
library(yarrr)
pirateplot(prop ~ host + variety, inf.method="ci", theme=2, point.o = 1,
           data=orobanche,
           ylab="Proportion Germinated")
pirateplot(prop ~ variety + host, inf.method="ci", theme=2, point.o = 1,
           data=orobanche,
           ylab="Proportion Germinated")

total_germinated <- with(orobanche,
                         tapply(germinated, list(host,variety), sum)
)
total_seeds <- with(orobanche,
                         tapply(seeds, list(host,variety), sum)
)
props <- total_germinated/total_seeds
barplot(props, beside = TRUE, legend.text = TRUE,
        xlab = "Variety", ylab = "Proportion Germinated")
barplot(t(props), beside = TRUE, legend.text = TRUE,
        xlab = "Host", ylab = "Proportion Germinated")

# Note that we have binomial data, so we can fit
# the model either as proportions or binomial counts:
mod1 <- glm(germinated/seeds ~ host*variety, family=binomial,
            weights=seeds, data=orobanche)
# equivalently,
mod1 <- glm(cbind(germinated, seeds-germinated) ~ host*variety,
            family=binomial, data=orobanche)

summary(mod1)
# Lack of fit, but not clear how to amend systematic component:
pchisq(33.278, 17, lower.tail=FALSE)
# No extreme residuals:
residuals(mod1)
plot(mod1,1)

# Calculate Pearson's estimate of phi
presid <- residuals(mod1, type="pearson")
pChi2 <- sum(presid^2)
phihat <- pChi2/17
# Estimate of overdispersion parameter:
phihat
# Variance is almost twice as large than what we'd expect if
# the data were truely binomial.

# Re-fit the model accounting for overdispersion (quasi-likelihood):
mod2 <- glm(cbind(germinated, seeds-germinated) ~ host*variety,
            family=quasibinomial, data=orobanche)
summary(mod2)
# Note that dispersion parameter taken to be 1.862.
# Compare standard errors: SEs in quasi-binomial model =
# sqrt(1.862)*SEs in binomial model.
# Same residual deviance - why?
# Looks like interaction may not be significant.

# F-test:
mod2.red <- update(mod2, .~.-host:variety)
summary(mod2.red) 
anova(mod2.red, mod2, test = "F")

# F-statistic = 
# (difference in deviance)/(estimated dispersion parameter from full model)
F <- 6.4081/phihat
F
pf(F, 1, 17, lower.tail=FALSE)

# Compare full model to model with just host:
mod2.red2 <- update(mod2, .~.-variety-host:variety)
summary(mod2.red2)
anova(mod2.red2,mod2,test="F")

# --> Model with host alone is sufficient