##
## a model for fuel efficiency
##
## using mtcars (a built-in dataset from base R's datasets package)
## Philippe De Brouwer
## 2017-07-01
##
library(MASS)   # NOTE(review): mtcars is in base R (datasets), not MASS; MASS appears unused here
library(tree)   # provides tree() for the regression tree below
# NOTE(review): attach() is discouraged — it puts the columns of mtcars on the
# search path, and several calls further down rely on that side effect.
attach(mtcars)
# understand the data:
?mtcars         # opens the help page (useful in interactive sessions only)
plot(mtcars)    # scatter-plot matrix of all column pairs
# --> mpg seems to have a correlation with: cyl, disp, hp, drat, wt, qsec, vs, am, gear (though not linear) and carb
# so that is about everything.
# also note that some relations are not necessarily linear.
# Let's be bold
# Fit a regression tree of mpg on all other columns to get a first
# impression of which predictors matter most.
tr <- tree(mpg ~ ., data = mtcars)
print(tr)   # same output as auto-printing `tr` at top level
plot(tr)
text(tr)
# this shows us that we might want to look into wt, cyl and hp
# this makes sense because drat is related to wt, qsec to hp, etc.
# so let's try a linear model first
# Fit mpg on the three predictors suggested by the tree.
# Pass data = mtcars explicitly instead of relying on the attach() side effect.
m.l.1 <- lm(mpg ~ wt + cyl + hp, data = mtcars)
summary(m.l.1)
# hp is not significant at the 5% level, so drop it
m.l.2 <- lm(mpg ~ wt + cyl, data = mtcars)
summary(m.l.2)
# all coefficients significant and R-squared is 0.8302
#
# note that summary(lm(mpg ~ ., data = mtcars)) has a higher R-squared (0.869),
# but it is over-fitted (only the weight coefficient is significantly
# different from zero)
#
# test now if the model is complete by inspecting the residuals
x <- residuals(m.l.2)
qqnorm(x, col = "red"); qqline(x, col = "blue")
# not bad, though for very high mpg (i.e. economic cars) we seem to
# under-estimate their potential
sum(x^2)   # residual sum of squares
#[1] 191.172
# However, this dependence might also become obvious in a Poisson regression
# Let's try
# Start from a wide Poisson model and prune it down.
m.p.1 <- glm(mpg ~ wt + cyl + hp + disp,
             family = "poisson", data = mtcars)
summary(m.p.1)
# Only wt comes out clearly significant; drop the weakest predictors
# one by one (first cyl, then disp), keeping wt and hp.
m.p.2 <- glm(mpg ~ wt + hp, family = "poisson", data = mtcars)
summary(m.p.2)
x <- resid(m.p.2)
qqnorm(x, col = "red")
qqline(x, col = "blue")
exp(sum(x^2))
#[1] 911.3686
# Remember the nice non-linear dependence of mpg on displacement:
with(mtcars, plot(disp, mpg))
# so, let's try a non-linear model; pass data = mtcars explicitly
# instead of relying on the attach() side effect
m.nl.1 <- nls(mpg ~ a + c * cyl + w * wt + hp2 * hp^2 + hp1 * hp +
                d2 * disp^2 + d1 * disp,
              data = mtcars,
              start = list(a = 0, c = 0.5, w = -3.5, hp2 = 0, hp1 = 0,
                           d2 = 1, d1 = 1))
summary(m.nl.1)
# leave out the non-significant variable cyl
m.nl.2 <- nls(mpg ~ a + w * wt + hp2 * hp^2 + hp1 * hp +
                d2 * disp^2 + d1 * disp,
              data = mtcars,
              start = list(a = 0, w = -3.5, hp2 = 0, hp1 = 0, d2 = 1, d1 = 1))
summary(m.nl.2)
# hp^2 has less than 95% probability of being really different from zero,
# so leave it out too
m.nl.3 <- nls(mpg ~ a + w * wt + hp1 * hp + d2 * disp^2 + d1 * disp,
              data = mtcars,
              start = list(a = 0, w = -3.5, hp1 = 0, d2 = 1, d1 = 1))
summary(m.nl.3)
x <- residuals(m.nl.3)
qqnorm(x, col = "red"); qqline(x, col = "blue")
# bar 2 outliers (the best and the worst of the pack), this seems better
sum(x^2)   # residual sum of squares
#[1] 138.3146
# this explains the variance better than m.l.2, and the quadratic dependence
# on disp is what we would have expected from the data.