# Interest lies in explaining price: distinguishing models according to price; depreciation with age and mileage; possibly varying depreciation rates across models of car; collinearity between age and mileage; possible vendor differences in asking prices; outliers and influential cases; and predicting prices.
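# Download data from here (the link was not preserved; the script reads a file named "mercedes" from the working directory).
# Data columns (as indexed by the code below): 2 = price, 3 = model code, 4 = age (6-month units), 5 = mileage (thousands), 6 = vendor.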
# Read in the Mercedes asking-price data and extract the working variables.
merc <- read.table("mercedes")
i <- order(merc[[5]])  # row order that sorts the cars by mileage (column 5)
merc <- merc[i, ]      # sorted rows, so lines() drawn against mileage run left to right
y <- log(merc[[2]])    # response is *log* price (why log? see the closing note)
mod <- merc[[3]]       # model code: 0=500, 1=450, ..., 4=200
age <- merc[[4]]       # age in units of 6 months
mile <- merc[[5]]      # mileage in thousands
vend <- merc[[6]]      # vendor (four different vendors)
Mod <- factor(mod)     # treat the model code as a category, not a number
# Scatter of log price against mileage, one plotting symbol and colour per
# model, with a separate least-squares line fitted to each model's cars.
plot(mile, y, type = "n", xlab = "Miles", ylab = "Log Price")
for (j in 0:4) {
  ok <- mod == j  # logical mask selecting the cars of model j
  shade <- j + 1  # colour index; shifted by one because colour 0 is the background
  points(mile[ok], y[ok], pch = as.character(j), col = shade)
  fit <- lm(y ~ mile, subset = ok)  # simple regression within this model only
  abline(fit, col = shade)
}
# Fit a single model with Mod as a factor: one common mileage slope plus a
# separate intercept shift for each car model (under R's default treatment
# contrasts the shifts are relative to the first level of Mod).
# x = TRUE stores the design matrix in the returned object (fit$x).
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
fit <- lm(y ~ Mod + mile, x = TRUE)
names(fit)    # components stored in the fitted object
print(fit)    # call and coefficient estimates
summary(fit)  # coefficient table, residual standard error, R-squared
# Redraw the scatter and overlay the common-slope fit: the fitted values for
# each model are joined up as a line in that model's colour.
plot(mile, y, type = "n", xlab = "Miles", ylab = "Log Price")
fitted_y <- fitted(fit)  # fitted log prices, one per car, in data order
for (j in 0:4) {
  ok <- mod == j  # cars of model j
  points(mile[ok], y[ok], pch = as.character(j), col = j + 1)
  lines(mile[ok], fitted_y[ok], col = j + 1)
}
# Now predictions: in-sample predictions from the common-slope model.
# With se.fit = TRUE, predict() returns a list holding $fit (predicted means),
# $se.fit (their standard errors), $df and $residual.scale.
# The generic predict() dispatches to predict.lm for an lm object, so the
# method need not be called by name; TRUE is spelled out because T can be
# reassigned.
pre <- predict(fit, se.fit = TRUE)
plot(mile, y, xlab = "Miles", ylab = "Log Price", type = "n")
for (j in 0:4) {
  ok <- mod == j  # cars of model j
  points(mile[ok], y[ok], pch = paste(j), col = j + 1)
  lines(mile[ok], pre$fit[ok], col = j + 1)
}
# ... and again with pointwise (approximately) 95% prediction intervals.
#
# Compute the mean and standard deviation of the predictive distributions.
# The predictive variance at a point has two parts:
#   * the residual variance, i.e. the noise in a new observation, and
#   * the squared standard error of the fitted line at that point.
# pre$residual.scale is the residual standard *deviation* (0.1360028 for these
# data, per the original note), so it is squared to get the variance. It is
# read from the prediction object rather than typed in by hand, so it stays
# correct if the data or the model change.
ss <- pre$residual.scale^2
# For honest prediction we also add the uncertainty about the line at the
# point. Despite its name, vy is a standard deviation, not a variance.
vy <- sqrt(ss + pre$se.fit^2)

# Plot one panel per model on a 3 x 2 grid. The previous graphics layout is
# saved and restored afterwards so that later plots are not squeezed into the
# grid.
old_par <- par(mfrow = c(3, 2))
for (j in 0:4) {
  plot(mile, y, xlab = "Miles", ylab = "Log Price", type = "n")
  ok <- mod == j  # cars of model j
  points(mile[ok], y[ok], pch = paste(j), col = j + 1)
  lines(mile[ok], pre$fit[ok], col = j + 1)
  # Fit +/- 2 predictive SDs: the usual normal approximation to a 95% band.
  lines(mile[ok], pre$fit[ok] + 2 * vy[ok], col = j + 1, lty = j + 1)
  lines(mile[ok], pre$fit[ok] - 2 * vy[ok], col = j + 1, lty = j + 1)
}
par(old_par)
# You may want to play around with the actual price (rather than its log) and
# see why we prefer log(price) here.