Below are the solutions to these exercises on Logistic regression in R.
#################### # # # Exercise 1 # # # #################### library(MASS) library(ggplot2) train <- rbind(Pima.tr, Pima.tr2) test <- Pima.te train$type <- as.integer(train$type) - 1L test$type <- as.integer(test$type) - 1L # Missing values? sapply(train, function(x) sum(is.na(x)))
## npreg glu bp skin bmi ped age type ## 0 0 13 98 3 0 0 0
#################### # # # Exercise 2 # # # #################### pairs(subset(train, select = -c(type)), col = as.factor(train$type))
ggplot(train, aes(x = age, y = type)) + geom_jitter(width = 0.5, height = 0.07, alpha = .2) + geom_smooth(method = "glm", se = FALSE, method.args = list(family = "binomial")) + labs(y = expression(hat(P)(Diabetic)))
#################### # # # Exercise 3 # # # #################### lg1 <- glm(type ~ age + bmi, data = train, family = binomial) summary(lg1)$coefficients[, c(1, 4)]
## Estimate Pr(>|z|) ## (Intercept) -5.79828987 3.019829e-17 ## age 0.05383783 2.628863e-09 ## bmi 0.10256607 2.972812e-09
#################### # # # Exercise 4 # # # #################### predict(lg1, type = "response", newdata = data.frame(bmi = c(32, 22), age = 35))
## 1 2 ## 0.3470908 0.1600962
# Or manually # Define logistic function lgs_fun <- function(par, x) { 1 / (1 + exp(-x %*% par)) # x %*% par is linear algebra equivalent of b_0 + b_1*age + b2*bmi } lgs_fun(lg1$coefficients, c(1, 35, 32))
## [,1] ## [1,] 0.3470908
lgs_fun(lg1$coefficients, c(1, 35, 22))
## [,1] ## [1,] 0.1600962
#################### # # # Exercise 5 # # # ################### # Literally lgs_fun(lg1$coefficients, c(1, 55, 37)) / (1 - lgs_fun(lg1$coefficients, c(1, 55, 37)))
## [,1] ## [1,] 2.605789
# or simply exp(c(1, 55, 37) %*% lg1$coefficients)
## [,1] ## [1,] 2.605789
# Or from R exp(predict(lg1, response = "link", newdata = data.frame(age = 55, bmi = 37)))
## 1 ## 2.605789
#################### # # # Exercise 6 # # # #################### # Remember data is not complete ... we have missing values for bmi... cm1 <- table(train$type[!is.na(train$bmi)], predict(lg1, type = "response") >= 0.5) cm1
## ## FALSE TRUE ## 0 276 48 ## 1 114 59
# Prediction accuracy sum(diag(cm1)) / sum(cm1) # 67 percent
## [1] 0.6740443
#################### # # # Exercise 7 # # # #################### # Check test set for missing values sapply(test, function(x) sum(is.na(x))) # complete
## npreg glu bp skin bmi ped age type ## 0 0 0 0 0 0 0 0
# Get predictions test_pred <- predict(lg1, type = "response", newdata = test) # Or manually test_predm <- lgs_fun(lg1$coefficients, as.matrix(cbind(1, test[, c("age", "bmi")]))) cm1_test <- table(test$type, test_pred >= 0.5) cm1_test
## ## FALSE TRUE ## 0 195 28 ## 1 67 42
# Prediction accuracy sum(diag(cm1_test)) / sum(cm1_test) # 71.4 percent
## [1] 0.7138554
#################### # # # Exercise 8 # # # #################### library(ROCR) # first install.packages("ROCR")
## Warning: package 'ROCR' was built under R version 3.4.2
# ROC curve predROCR <- prediction(test_predm, test$type) perfROCR <- performance(predROCR, "tpr", "fpr") plot(perfROCR, colorize = TRUE)
# Compute AUC performance(predROCR, "auc")@y.values
## [[1]] ## [1] 0.7558522
#################### # # # Exercise 9 # # # #################### lg2 <- glm(type ~ age + bmi + npreg, data = train, x = TRUE, family = binomial) test_pred <- predict(lg2, type = "response", newdata = test) predROCR <- prediction(test_pred, test$type) perfROCR <- performance(predROCR, "tpr", "fpr") plot(perfROCR, colorize = TRUE)
# Compute AUC performance(predROCR, "auc")@y.values
## [[1]] ## [1] 0.7606451
#################### # # # Exercise 10 # # # #################### ex10data <- data.frame(age = 35, bmi = c(25, 35), npreg = 2) diff(predict(lg2, type = "response", newdata = ex10data)) # 20 percentage points
## 2 ## 0.2017908
diff(predict(lg2, type = "response", newdata = ex10data)) / predict(lg2, type = "response", newdata = ex10data)[1] # 113 percent increase!
## 2 ## 1.138777
# Turns out the formula for marginal effect is not complicated # Although it involves some calculus and algebra # P(y = 1)(1 - P(y = 1)) * ß_bmi # Marginal effect at the age = 35, bmi = 25, npreg = 2 p <- lgs_fun(lg2$coefficients, c(1, 35, 25, 2)) p*(1 - p)*lg2$coefficients[3] # 0.01518653
## [,1] ## [1,] 0.01518653
Leave a Reply