### Multinomial Logistic Regression # Load required libraries library(yarrr) # pirateplot library(nnet) # multinom source("http://www.math.montana.edu/shancock/courses/stat439/R/summ.mfit.R") # summ.mfit ### Data: Greene (1995) ## Travel Choices: # Measure travelers' decisions regarding travel mode # between Sydney and Melbourne, Australia. # Modes of travel (choices): # mode = 0 --> air # mode = 1 --> train # mode = 2 --> bus # mode = 3 --> car # (Note that this is a nominal response variable - # the numbers have no quantitative meaning) ## Explanatory variables of interest: # hinc = household income ($k) # psize = number of individuals traveling in a party # Can you think of other unmeasured confounders? ## Read in data and attach: travel <- read.table("http://www.math.montana.edu/shancock/data/TravelChoices.txt",header=T) travel$mode.fac <- factor(travel$mode,levels=c(0:3),labels=c("Air","Train","Bus","Car")) head(travel) dim(travel) # Sample size = 210 ## #### Some descriptive statistics: ## # What proportion are in each travel category? with(travel, cbind(Frequency = table(mode.fac), Proportion = table(mode.fac)/sum(table(mode.fac))) ) barplot(table(travel$mode.fac), ylab = "Frequency", xlab = "Mode of Travel") # --> travel by train most common (30%), then # car (28%) # air (27.6%) # bus (14.2%) # Travel mode vs. party size? my.tab <- with(travel, table(mode.fac,psize)) my.tab pirateplot(psize ~ mode.fac, data = travel, avg.line.fun = median, inf.method = "iqr") barplot(prop.table(my.tab, 2), legend = TRUE, xlab = "Party Size") mosaicplot(psize ~ mode.fac, data = travel) # As psize increases, probability of traveling by air decreases, and probability of traveling by car increases. # Note sparse data for large party sizes (only one observation with psize = 6) # Travel mode vs. hinc? pirateplot(hinc ~ mode.fac, ylab="Household Income ($1000)", xlab = "Mode of Travel", data = travel, inf.method = "iqr", avg.line.fun = median) lapply( split(travel$hinc,travel$mode.fac), FUN=summary) # Median household income among travelers choosing air or car appears # to be higher than those traveling by train or bus. ## #### Fit multinomial logistic regression model ## # Use multinom function in nnet library mfit <- multinom(mode.fac ~ hinc + psize, data=travel) summary(mfit) # Not too helpful summ.mfit(mfit) # rrr = "relative risk ratio" = odds of level ___ to level ___ # Interpretations of rrr for psize in each equation? # Plot fitted probabilities vs. hinc for given party size: co <- coef(mfit) # Note is a 3x3 matrix is.matrix(co) dim(co) # Function to calculate probabilities my.prob <- function(c, ps, x, mode){ # c = 3x3 matrix of coefficients # ps = party size # x = hinc # mode = "Air","Train","Bus", or "Car" denom <- 1+exp(c[1,1]+c[1,2]*x+c[1,3]*ps) + exp(c[2,1]+c[2,2]*x+c[2,3]*ps) + exp(c[3,1]+c[3,2]*x+c[3,3]*ps) if(mode == "Air"){ rslt <- 1/denom } if(mode == "Train"){ rslt <- exp(c[1,1]+c[1,2]*x+c[1,3]*ps)/denom } if(mode == "Bus"){ rslt <- exp(c[2,1]+c[2,2]*x+c[2,3]*ps)/denom } if(mode == "Car"){ rslt <- exp(c[3,1]+c[3,2]*x+c[3,3]*ps)/denom } return(rslt) } # For party size = 2 curve(my.prob(co, 2, x, "Air"), from=0, to=80, xlab="Household Income ($k)", ylab="Fitted Probabilities", main="Fitted Probabilities of Travel Type for Parties of Size 2", col="red", lty=1) curve(my.prob(co, 2, x, "Train"), add=T, col="blue",lty=2) curve(my.prob(co, 2, x, "Bus"), add=T, col="darkgreen",lty=3) curve(my.prob(co, 2, x, "Car"), add=T, col="purple",lty=4) legend("topleft",c("Air","Train","Bus","Car"), col=c("red","blue","darkgreen","purple"), lty=c(1,2,3,4)) # Any interaction between psize and hinc on mode? mfit.int <- multinom( mode.fac ~ hinc * psize, data=travel) summ.mfit(mfit.int) anova(mfit,mfit.int) # Conclusion? ### Fitting separate logistic models to each (less efficient): # Compare to multinomial coefs. # Train vs. Air: mod.TA <- glm( (mode==1) ~ hinc+psize, data=travel, family=binomial, subset = (mode==0 | mode==1)) # The logical argument "mode==0 | mode==1" tells R to only take observations # from the data set that have mode 0 OR (|) mode 1. # Bus vs. Air: mod.BA <- glm( (mode==2) ~ hinc+psize, data=travel, family=binomial, subset = (mode==0 | mode==2)) # Car vs. Air: mod.CA <- glm( (mode==3) ~ hinc+psize, data=travel, family=binomial, subset = (mode==0 | mode==3)) mod.TA mod.BA mod.CA ## Only the intercept term for Bus vs. Air seems to differ from ## the multinomial fit.