# Data from Collett, 1991: # Measure proportion of seeds of two species of Orobanche # that germinate in bean and cucumber root extracts (hosts). # Built-in dataset in the dispmod library # You may need to first install this package - uncomment # the following line: # install.packages("dispmod", install.dependencies = TRUE) library(dispmod) data(orobanche) ?orobanche # Data description View(orobanche) # Both predictors are categorical -> visualize? orobanche$prop <- orobanche$germinated/orobanche$seeds boxplot(prop ~ host + variety, data = orobanche) library(yarrr) pirateplot(prop ~ host + variety, inf.method="ci", theme=2, point.o = 1, data=orobanche, ylab="Proportion Germinated") pirateplot(prop ~ variety + host, inf.method="ci", theme=2, point.o = 1, data=orobanche, ylab="Proportion Germinated") total_germinated <- with(orobanche, tapply(germinated, list(host,variety), sum) ) total_seeds <- with(orobanche, tapply(seeds, list(host,variety), sum) ) props <- total_germinated/total_seeds barplot(props, beside = TRUE, legend.text = TRUE, xlab = "Variety", ylab = "Proportion Germinated") barplot(t(props), beside = TRUE, legend.text = TRUE, xlab = "Host", ylab = "Proportion Germinated") # Note that we have binomial data, so we can fit # the model either as proportions or binomial counts: mod1 <- glm(germinated/seeds ~ host*variety, family=binomial, weights=seeds, data=orobanche) # equivalently, mod1 <- glm(cbind(germinated, seeds-germinated) ~ host*variety, family=binomial, data=orobanche) summary(mod1) # Lack of fit, but not clear how to amend systematic component: pchisq(33.278, 17, lower.tail=FALSE) # No extreme residuals: residuals(mod1) plot(mod1,1) # Calculate Pearson's estimate of phi presid <- residuals(mod1, type="pearson") pChi2 <- sum(presid^2) phihat <- pChi2/17 # Estimate of overdispersion parameter: phihat # Variance is almost twice as large than what we'd expect if # the data were truely binomial. # Re-fit the model accounting for overdispersion (quasi-likelihood): mod2 <- glm(cbind(germinated, seeds-germinated) ~ host*variety, family=quasibinomial, data=orobanche) summary(mod2) # Note that dispersion parameter taken to be 1.862. # Compare standard errors: SEs in quasi-binomial model = # sqrt(1.862)*SEs in binomial model. # Same residual deviance - why? # Looks like interaction may not be significant. # F-test: mod2.red <- update(mod2, .~.-host:variety) summary(mod2.red) anova(mod2.red, mod2, test = "F") # F-statistic = # (difference in deviance)/(estimated dispersion parameter from full model) F <- 6.4081/phihat F pf(F, 1, 17, lower.tail=FALSE) # Compare full model to model with just host: mod2.red2 <- update(mod2, .~.-variety-host:variety) summary(mod2.red2) anova(mod2.red2,mod2,test="F") # --> Model with host alone is sufficient