Bonus code
Bonus code 
The following code was added thanks to questions from course participants of past sessions. They might be useful for you too.
Linear Regression using model selection
Linear regression
library(MASS)
data(birthwt)
summary(birthwt)
help(birthwt)
colnames(birthwt)
colnames(birthwt) <- c("birthwt.below.2500", "mother.age","mother.weight", "race",
"smoking.status", "nb.previous.prem.labor", "hypertension",
"uterine.irrit","nb.physician.visits", "birthwt.grams")
str(birthwt)
summary(birthwt)
birthwt$race <- as.factor(birthwt$race)
str(birthwt)
summary(birthwt)
Model selection
library(leaps)
best_subset <- regsubsets(birthwt.grams ~ . - birthwt.below.2500, data = birthwt, nvmax = 8)
results <- summary(best_subset)
# Adjusted R-squared
plot(results$adjr2, xlab = "Number of Variables", ylab = "Adjusted R-squared", type = "l")
# Residual sum of squares for each model
plot(results$rss, xlab = "Number of Variables", ylab = "RSS", type = "l")
# R-squared
plot(results$rsq, xlab = "Number of Variables", ylab = "R-squared", type = "l")
which.max(results$adjr2)
Model selection using the validation set approach
set.seed(1)
train <- sample(c(TRUE, FALSE), size = nrow(birthwt), rep = TRUE)
test <- (!train)
best_subset_train <- regsubsets(birthwt.grams ~ . - birthwt.below.2500, data = birthwt[train ,], nvmax = 8)
test_mat <- model.matrix(birthwt.grams ~ . - birthwt.below.2500, data = birthwt[test,])
val_errors <- rep(NA , 8)
for(i in 1:8){
coefi = coef(best_subset_train, id = i)
pred = test_mat[,names(coefi)]%*%coefi
val_errors[i] = mean((birthwt$birthwt.grams[test] - pred)^2)
}
which.min(val_errors)