Below are the solutions to these exercises on Regression Modeling with the Boston Housing dataset.
# Exercise 1 ----

# Packages used throughout the exercises (order preserved: dplyr et al.
# must mask base functions as in the original script).
library(mlbench)    # provides the BostonHousing dataset
library(dplyr)      # data manipulation verbs
library(ggplot2)    # plotting
library(reshape2)   # melt() for wide-to-long reshaping

# Attach the Boston Housing data and take a working copy.
data("BostonHousing")
housing <- BostonHousing

# Inspect the structure: expected 506 obs. of 14 variables,
# all numeric except `chas`, which is a two-level factor.
str(housing)
## 'data.frame': 506 obs. of 14 variables: ## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ... ## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ... ## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ... ## $ chas : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ... ## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ... ## $ rm : num 6.58 6.42 7.18 7 7.15 ... ## $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ... ## $ dis : num 4.09 4.97 4.97 6.06 6.06 ... ## $ rad : num 1 2 2 3 3 3 5 5 5 5 ... ## $ tax : num 296 242 242 222 222 222 311 311 311 311 ... ## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ... ## $ b : num 397 397 393 395 397 ... ## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ... ## $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
# Exercise 2 ----

# Density plot of the response variable: median house value in $1000s.
ggplot(housing, aes(x = medv)) +
  stat_density() +
  labs(
    x = "Median Value ($1000s)",
    y = "Density",
    title = "Density Plot of Median Value House Price in Boston"
  ) +
  theme_minimal()

# Five-number summary (plus mean) of the response variable.
summary(housing[["medv"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max. ## 5.00 17.02 21.20 22.53 25.00 50.00
# Exercise 3 ----

# Scatterplots of six candidate predictors against medv, one facet per
# predictor, each with a smoothed trend line.
housing %>%
  # select() takes bare column names directly; no c() wrapper needed.
  select(crim, rm, age, rad, tax, lstat, medv) %>%
  # Reshape wide -> long, keeping medv as the id column. (The original
  # call had a stray empty positional argument: melt(, id.vars = ...).)
  melt(id.vars = "medv") %>%
  ggplot(aes(x = value, y = medv, colour = variable)) +
  geom_point(alpha = 0.7) +
  # Set colour as a literal parameter, OUTSIDE aes(). The original
  # mapped aes(colour = "black"), which maps the constant string "black"
  # as an aesthetic level — producing a spurious legend entry and a
  # default-coloured line — instead of drawing the smoother in black.
  stat_smooth(colour = "black") +
  facet_wrap(~ variable, scales = "free", ncol = 2) +
  labs(x = "Variable Value", y = "Median House Price ($1000s)") +
  theme_minimal()

# Exercise 4 ----

library(caret)  # unquoted name is the conventional library() usage

# Reproducible 75/25 train/test split, stratified on the response.
set.seed(123)
to_train <- createDataPartition(y = housing$medv, p = 0.75, list = FALSE)
train <- housing[to_train, ]
test <- housing[-to_train, ]

# Exercise 5 ----

# Baseline linear model: medv on four predictors.
first_lm <- lm(medv ~ crim + rm + tax + lstat, data = train)

# Exercise 6 ----

# In-sample R-squared of the baseline model.
lm1_rsqu <- summary(first_lm)$r.squared
# paste0() is the idiomatic replacement for paste(..., sep = "").
print(paste0(
  "First linear model has an r-squared value of ",
  round(lm1_rsqu, 3)
))
## [1] "First linear model has an r-squared value of 0.672"
# plot(first_lm)  # diagnostic plots for the baseline model (optional)

# Exercise 7 ----

# Refit with a log-transformed response to reduce the right skew of medv
# seen in the Exercise 2 density plot.
second_lm <- lm(log(medv) ~ crim + rm + tax + lstat, data = train)

# Exercise 8 ----

# In-sample R-squared of the log-response model.
lm2_rsqu <- summary(second_lm)$r.squared
# paste0() is the idiomatic replacement for paste(..., sep = "").
print(paste0(
  "Our second linear model has an r-squared value of ",
  round(lm2_rsqu, 3)
))
## [1] "Our second linear model has an r-squared value of 0.735"
# plot(second_lm)  # diagnostic plots for the log model (optional)

# Sanity check: OLS residuals average to (numerically) zero by
# construction; expect a value on the order of machine epsilon.
mean(residuals(second_lm))
## [1] -4.838857e-18
# Exercise 9 ----

# Predict on the held-out set. The model was fit on log(medv), so
# exp() back-transforms predictions to the original dollar scale.
predicted <- predict(second_lm, newdata = test)
results <- data.frame(
  predicted = exp(predicted),
  original  = test$medv
)

# Exercise 10 ----

# Predicted vs. observed values with a smoothed trend; points near the
# diagonal indicate good out-of-sample fit.
ggplot(results, aes(x = predicted, y = original)) +
  geom_point() +
  stat_smooth() +
  labs(
    x = "Predicted Values",
    y = "Original Values",
    title = "Predicted vs. Original Values"
  ) +
  theme_minimal()

Leave a Reply