# Chapter 8 Logistic Regression

Full book chapter still delayed! Keeping up with writing every week is getting tough. Below are the notes from the video.

# load packages
library(tibble)
library(mlbench)

# single feature variable ######################################################

sim_logistic_data = function(sample_size = 25, beta_0 = -2, beta_1 = 3, factor = TRUE) {
x = rnorm(n = sample_size)
eta = beta_0 + beta_1 * x
p = 1 / (1 + exp(-eta))
y = rbinom(n = sample_size, size = 1, prob = p)
if (factor) {
y = factor(y)
}
tibble::tibble(x, y)
}

# simulate data for logistic regression
set.seed(3)
sim_data_factor = sim_logistic_data()
sim_data_factor
levels(sim_data_factor$y) # simulate data for linear regression set.seed(3) sim_data_numeric = sim_logistic_data(factor = FALSE) sim_data_numeric # initial plot plot(y ~ x, data = sim_data_numeric, pch = 19, ylab = "Estimated Probability", main = "Ordinary vs Logistic Regression", ylim = c(-0.2, 1.2), cex = 1.5) grid() abline(h = 0, lty = 3) abline(h = 1, lty = 3) # E[Y | X = x] = 1 * P(Y = 1 | X = x) + 0 * P(Y = 0 | X = x) = P(Y = 1 | X = x) # ordinary linear regression fit_lm = lm(y ~ x, data = sim_data_numeric) fit_lm = glm(y ~ x, data = sim_data_numeric) # logistic regression fit_glm = glm(y ~ x, data = sim_data_numeric, family = binomial) fit_glm = glm(y ~ x, data = sim_data_numeric, family = binomial(link = "logit")) # plot results plot(y ~ x, data = sim_data_numeric, pch = 19, ylab = "Estimated Probability", main = "Ordinary vs Logistic Regression", ylim = c(-0.2, 1.2), cex = 1.5) grid() abline(h = 0, lty = 3) abline(h = 1, lty = 3) abline(fit_lm, col = "darkorange") curve(predict(fit_glm, data.frame(x), type = "response"), add = TRUE, col = "dodgerblue", lty = 2) legend("topleft", c("Ordinary", "Logistic", "Data"), lty = c(1, 2, 0), pch = c(NA, NA, 20), lwd = 2, col = c("darkorange", "dodgerblue", "black")) abline(h = 0.5, lty = 2) # abline(v = -coef(fit_glm) / coef(fit_glm), col = "dodgerblue", lty = 2) # abline(v = (0.5 - coef(fit_lm)) / coef(fit_lm), col = "darkorange") # two feature variables, blobs ################################################# # simulate data set.seed(42) blob_trn = as_tibble(mlbench.2dnormals(n = 100)) blob_tst = as_tibble(mlbench.2dnormals(n = 1000)) # check data blob_trn levels(blob_trn$classes)

# check balance
table(blob_trn$classes) # initial plot plot(x.2 ~ x.1, data = blob_trn, col = blob_trn$classes, pch = 19)
grid()

# points where we will predict
xy_vals = expand.grid(
x.1 = seq(from = -3.5, to = 3.5, by = 0.05),
x.2 = seq(from = -3.5, to = 3.5, by = 0.05)
)

mod = glm(classes ~ ., data = blob_trn, family = binomial)
pred_xy = ifelse(predict(mod, xy_vals, type = "response") > 0.5, "lightpink", "lightgrey")
pred_xy = ifelse(predict(mod, xy_vals) > 0, "lightpink", "lightgrey")

# check predictions on plot
plot(x.2 ~ x.1, data = xy_vals, col = pred_xy,
xlim = c(-3, 3), ylim = c(-3, 3), pch = 15)
points(x.2 ~ x.1, data = blob_trn, col = blob_trn$classes, pch = 19) # 0 = beta_0 + beta_1 x.1 + beta_2 x.2 # x.2 = -(beta_0 + beta_1 x.1) / beta_2 # add analytic decision boundary plot(x.2 ~ x.1, data = xy_vals, col = pred_xy, xlim = c(-3, 3), ylim = c(-3, 3), pch = 15) points(x.2 ~ x.1, data = blob_trn, col = blob_trn$classes, pch = 19)
abline(
a = -coef(mod) / coef(mod),
b = -coef(mod) / coef(mod),
lwd = 5, col = "white")

# check performance, miclassification
pred = factor(ifelse(predict(mod, blob_tst) > 0.0, "2", "1"))
mean(blob_tst$classes != pred) ## two variables, circle ####################################################### # simulate data set.seed(42) circle_trn = as_tibble(mlbench.circle(n = 250)) circle_tst = as_tibble(mlbench.circle(n = 1000)) # check data circle_trn # check balance table(circle_trn$classes)

# initial plot
plot(x.2 ~ x.1, data = circle_trn, col = circle_trn$classes, pch = 19) grid() # points where we will predict xy_vals = expand.grid( x.1 = seq(from = -1.1, to = 1.1, by = 0.01), x.2 = seq(from = -1.1, to = 1.1, by = 0.01) ) head(xy_vals) # fit model, bad mod_bad = glm(classes ~ ., data = circle_trn, family = binomial) pred_bad_xy = ifelse(predict(mod_bad, xy_vals, type = "response") > 0.5, "lightpink", "lightgrey") # check predictions on plot plot(x.2 ~ x.1, data = xy_vals, col = pred_bad_xy, xlim = c(-1, 1), ylim = c(-1, 1), pch = 15) points(x.2 ~ x.1, data = circle_trn, col = circle_trn$classes, pch = 19)

# check performance, accuracy
mean(circle_tst$classes == pred_bad) # fit model, good mod_good = glm(classes ~ poly(x.1, 2) + poly(x.2, 2) + x.1:x.2, data = circle_trn, family = binomial) pred_good_xy = ifelse(predict(mod_good, xy_vals, type = "response") > 0.5, "lightpink", "lightgrey") # check predictions on plot plot(x.2 ~ x.1, data = xy_vals, col = pred_good_xy, xlim = c(-1, 1), ylim = c(-1, 1), pch = 15) points(x.2 ~ x.1, data = circle_trn, col = circle_trn$classes, pch = 19)
mean(circle_tst\$classes == pred_good)