Interest lies in explaining price: distinguishing models according to price; depreciation with age and mileage; possibly varying depreciation rates across models of car; collinearity between age and mileage; possible vendor differences in asking prices; outliers and influential cases; and predicting prices.
Download data from here
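Data columns (as used by the script below): column 2 = price; column 3 = model code (0 = 500, 1 = 450, ..., 4 = 200); column 4 = age in units of 6 months; column 5 = mileage in thousands; column 6 = vendor (four different vendors).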
# Mercedes asking-price analysis ----
#
# Regresses log(price) on mileage for five Mercedes models, first with a
# separate line per model, then with a common mileage slope and a
# model-specific intercept (model treated as a factor), and finally plots
# the fitted lines with approximate 95% pointwise prediction bands.
#
# Expects a whitespace-delimited file named "mercedes" in the working
# directory, readable by read.table() with its default settings.

# Read in the Mercedes data (columns are addressed by position below)
merc <- read.table("mercedes")
stopifnot(ncol(merc) >= 6)

# Sort in order of mileage so that lines() draws left to right
i <- order(merc[, 5])
merc <- merc[i, ]

y <- log(merc[, 2])   # Why *log* price? (original note: "NB: # not $" --
                      # presumably prices are in pounds, not dollars; confirm)
mod <- merc[, 3]      # Model: 0=500, 1=450, ..., 4=200
age <- merc[, 4]      # Age in units of 6-mo
mile <- merc[, 5]     # Mileage in thousands
vend <- merc[, 6]     # Four different vendors
Mod <- as.factor(mod) # Treat as category, not number

# Separate simple regressions, one per model ----
plot(mile, y, xlab = "Miles", ylab = "Log Price", type = "n")
for (j in 0:4) {
  ok <- mod == j # logical mask selecting the cars of model j
  points(mile[ok], y[ok], pch = as.character(j), col = j + 1)
  fit <- lm(y[ok] ~ mile[ok])
  abline(fit, col = j + 1)
}

# Common slope, model-specific intercepts (Mod as a factor) ----
fit <- lm(y ~ Mod + mile, x = TRUE)
# Explicit print() so the output also appears when this file is source()d
print(names(fit))
print(fit)
print(summary(fit))

plot(mile, y, xlab = "Miles", ylab = "Log Price", type = "n")
for (j in 0:4) {
  ok <- mod == j
  points(mile[ok], y[ok], pch = as.character(j), col = j + 1)
  lines(mile[ok], fit$fitted.values[ok], col = j + 1)
}

# Now predictions ----
# Use the predict() generic; se.fit = TRUE also returns residual.scale
pre <- predict(fit, se.fit = TRUE)

plot(mile, y, xlab = "Miles", ylab = "Log Price", type = "n")
for (j in 0:4) {
  ok <- mod == j
  points(mile[ok], y[ok], pch = as.character(j), col = j + 1)
  lines(mile[ok], pre$fit[ok], col = j + 1)
}

# ... and again with pointwise 95% intervals ----
# Mean and S.E. of the predictive distributions.
# The residual standard error is taken from the fit rather than typed in by
# hand (the original hard-coded 0.136, rounded from the printed
# $residual.scale of 0.1360028). Squared, it estimates the observation
# variance at a new point.
ss <- pre$residual.scale^2
# For honest prediction we also have to add the uncertainty about the
# fitted line at that point.
vy <- sqrt(ss + pre$se.fit^2)

# One panel per model; remember the previous layout so it can be restored
old_par <- par(mfrow = c(3, 2))
for (j in 0:4) {
  plot(mile, y, xlab = "Miles", ylab = "Log Price", type = "n")
  ok <- mod == j
  points(mile[ok], y[ok], pch = as.character(j), col = j + 1)
  lines(mile[ok], pre$fit[ok], col = j + 1)
  # +/- 2 predictive standard deviations: approximate 95% band
  lines(mile[ok], pre$fit[ok] + 2 * vy[ok], col = j + 1, lty = j + 1)
  lines(mile[ok], pre$fit[ok] - 2 * vy[ok], col = j + 1, lty = j + 1)
}
par(old_par)

# You may want to play around with the actual price and see why we prefer
# log(price) here.