## A model for fuel efficiency
## using mtcars (shipped with base R in the `datasets` package)
## Philippe De Brouwer
## 2017-07-01

library(MASS)   # NOTE(review): MASS is not used below; mtcars comes from
                # base R's `datasets` package, not MASS. Kept for
                # compatibility with the original script.
library(tree)

# Understand the data (run `?mtcars` interactively for the codebook).
plot(mtcars)
# --> mpg seems to have a correlation with: cyl, disp, hp, drat, wt, qsec,
#     vs, am, gear (though not linear) and carb -- so that is about
#     everything. Also note that some relations are not necessarily linear.

# Let's be bold: fit a regression tree on all predictors.
tr <- tree(mpg ~ ., data = mtcars)
tr
plot(tr); text(tr)
# This shows us that we might want to look into wt, cyl and hp.
# This makes sense because drat is related to wt, qsec to hp, etc.

# So let's try a linear model first.
# (data = mtcars replaces the original attach(mtcars), which pollutes the
# search path and is a well-known R anti-pattern.)
m.l.1 <- lm(mpg ~ wt + cyl + hp, data = mtcars)
summary(m.l.1)

# So ... hp is not a good idea; drop it.
m.l.2 <- lm(mpg ~ wt + cyl, data = mtcars)
summary(m.l.2)
# All coefficients significant and R-squared is 0.8302.
#
# Note that summary(lm(mpg ~ ., data = mtcars)) has a higher R-squared
# (0.869), but is over-fitted (only weight is significantly different
# from zero).

# Test now if the model is complete: inspect the residuals.
x <- residuals(m.l.2)
qqnorm(x, col = "red"); qqline(x, col = "blue")
# Not bad, though for the very high mpg (so economic cars) we seem to
# under-estimate their potential.
sum(x^2)
#[1] 191.172

# However, this dependence might also become obvious in a Poisson
# regression. Let's try.
m.p.1 <- glm(mpg ~ wt + cyl + hp + disp, data = mtcars, family = "poisson")
summary(m.p.1)
# ... in the end only wt is relevant ... not good --> leave out the
# highest p-value (cyl), then disp.
m.p.2 <- glm(mpg ~ wt + hp, data = mtcars, family = "poisson")
summary(m.p.2)
x <- residuals(m.p.2)
qqnorm(x, col = "red"); qqline(x, col = "blue")
exp(sum(x^2))
#[1] 911.3686

# Remember the nice non-linear dependence.
plot(mpg ~ disp, data = mtcars)

# So, let's try a non-linear model.
m.nl.1 <- nls(mpg ~ a + c * cyl + w * wt + hp2 * hp^2 + hp1 * hp +
                d2 * disp^2 + d1 * disp,
              data = mtcars,
              start = list(a = 0, c = 0.5, w = -3.5, hp2 = 0, hp1 = 0,
                           d2 = 1, d1 = 1))
summary(m.nl.1)

# Leave out the non-significant variable cyl.
m.nl.2 <- nls(mpg ~ a + w * wt + hp2 * hp^2 + hp1 * hp +
                d2 * disp^2 + d1 * disp,
              data = mtcars,
              start = list(a = 0, w = -3.5, hp2 = 0, hp1 = 0,
                           d2 = 1, d1 = 1))
summary(m.nl.2)

# hp^2 has less than 95% probability to be really different from zero;
# let's leave it out as well.
m.nl.3 <- nls(mpg ~ a + w * wt + hp1 * hp + d2 * disp^2 + d1 * disp,
              data = mtcars,
              start = list(a = 0, w = -3.5, hp1 = 0, d2 = 1, d1 = 1))
summary(m.nl.3)
x <- residuals(m.nl.3)
qqnorm(x, col = "red"); qqline(x, col = "blue")
# Bar 2 outliers (the best and the worst of the pack) this seems better.
sum(x^2)
#[1] 138.3146
# This explains the variance better than m.l.2, and the quadratic
# dependence on disp is what we would have expected from the data.