Chapter 8 Logistic Regression
Full book chapter still delayed! Keeping up with writing every week is getting tough. Below are the notes from the video.
# load packages
library(tibble)
library(mlbench)
# single feature variable ######################################################
sim_logistic_data = function(sample_size = 25, beta_0 = -2, beta_1 = 3, factor = TRUE) {
  x = rnorm(n = sample_size)                       # single feature
  eta = beta_0 + beta_1 * x                        # linear predictor (log-odds)
  p = 1 / (1 + exp(-eta))                          # inverse logit, P(Y = 1 | X = x)
  y = rbinom(n = sample_size, size = 1, prob = p)  # binary response
  if (factor) {
    y = factor(y)                                  # coerce to factor for classification
  }
  tibble::tibble(x, y)
}
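# as a quick check, the inverse logit written out above matches R's built-in
# plogis(), the logistic CDF
eta_check = c(-2, 0, 2)
all.equal(1 / (1 + exp(-eta_check)), plogis(eta_check))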
# simulate data for logistic regression
set.seed(3)
sim_data_factor = sim_logistic_data()
sim_data_factor
levels(sim_data_factor$y)
# simulate data for linear regression
set.seed(3)
sim_data_numeric = sim_logistic_data(factor = FALSE)
sim_data_numeric
# initial plot
plot(y ~ x, data = sim_data_numeric,
     pch = 19, ylab = "Estimated Probability",
     main = "Ordinary vs Logistic Regression",
     ylim = c(-0.2, 1.2), cex = 1.5)
grid()
abline(h = 0, lty = 3)
abline(h = 1, lty = 3)
# E[Y | X = x] = 1 * P(Y = 1 | X = x) + 0 * P(Y = 0 | X = x) = P(Y = 1 | X = x)
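# a quick numeric sketch of the identity above, using the same true
# coefficients as sim_logistic_data() (beta_0 = -2, beta_1 = 3) at a fixed x
x_fixed = 0.5
p_fixed = 1 / (1 + exp(-(-2 + 3 * x_fixed)))
mean(rbinom(n = 1e5, size = 1, prob = p_fixed))  # sample mean of many 0/1 draws
p_fixed                                          # should be close to the mean above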
# ordinary linear regression
fit_lm = lm(y ~ x, data = sim_data_numeric)
# glm() with the default gaussian family fits the same model as lm()
fit_lm = glm(y ~ x, data = sim_data_numeric)
# logistic regression
fit_glm = glm(y ~ x, data = sim_data_numeric, family = binomial)
# binomial() uses the logit link by default, so these two calls are equivalent
fit_glm = glm(y ~ x, data = sim_data_numeric, family = binomial(link = "logit"))
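# the estimated coefficients should be near the true beta_0 = -2, beta_1 = 3,
# though with only 25 observations expect noticeable sampling noise
coef(fit_glm)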
# plot results
plot(y ~ x, data = sim_data_numeric,
     pch = 19, ylab = "Estimated Probability",
     main = "Ordinary vs Logistic Regression",
     ylim = c(-0.2, 1.2), cex = 1.5)
grid()
abline(h = 0, lty = 3)
abline(h = 1, lty = 3)
abline(fit_lm, col = "darkorange")
curve(predict(fit_glm, data.frame(x), type = "response"),
      add = TRUE, col = "dodgerblue", lty = 2)
legend("topleft", c("Ordinary", "Logistic", "Data"), lty = c(1, 2, 0),
       pch = c(NA, NA, 19), lwd = 2, col = c("darkorange", "dodgerblue", "black"))
abline(h = 0.5, lty = 2)
# abline(v = -coef(fit_glm)[1] / coef(fit_glm)[2], col = "dodgerblue", lty = 2)
# abline(v = (0.5 - coef(fit_lm)[1]) / coef(fit_lm)[2], col = "darkorange")
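# sanity check for the commented boundary above: at x = -beta_0_hat / beta_1_hat
# the fitted logistic model should return an estimated probability of 0.5
boundary = -coef(fit_glm)[1] / coef(fit_glm)[2]
predict(fit_glm, data.frame(x = boundary), type = "response")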
# two feature variables, blobs #################################################
# simulate data
set.seed(42)
blob_trn = as_tibble(mlbench.2dnormals(n = 100))
blob_tst = as_tibble(mlbench.2dnormals(n = 1000))
# check data
blob_trn
levels(blob_trn$classes)
# check balance
table(blob_trn$classes)
# initial plot
plot(x.2 ~ x.1, data = blob_trn, col = blob_trn$classes, pch = 19)
grid()
# points where we will predict
xy_vals = expand.grid(
  x.1 = seq(from = -3.5, to = 3.5, by = 0.05),
  x.2 = seq(from = -3.5, to = 3.5, by = 0.05)
)
head(xy_vals)
# fit model; a linear boundary is appropriate for this data
mod = glm(classes ~ ., data = blob_trn, family = binomial)
pred_xy = ifelse(predict(mod, xy_vals, type = "response") > 0.5, "lightpink", "lightgrey")
pred_xy = ifelse(predict(mod, xy_vals) > 0, "lightpink", "lightgrey")
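# the two prediction rules above are equivalent: probability > 0.5 on the
# response scale is the same event as log-odds > 0 on the (default) link scale
all(
  (predict(mod, xy_vals, type = "response") > 0.5) ==
  (predict(mod, xy_vals) > 0)
)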
# check predictions on plot
plot(x.2 ~ x.1, data = xy_vals, col = pred_xy,
     xlim = c(-3, 3), ylim = c(-3, 3), pch = 15)
points(x.2 ~ x.1, data = blob_trn, col = blob_trn$classes, pch = 19)
# 0 = beta_0 + beta_1 x.1 + beta_2 x.2
# x.2 = -(beta_0 + beta_1 x.1) / beta_2
# add analytic decision boundary
plot(x.2 ~ x.1, data = xy_vals, col = pred_xy,
     xlim = c(-3, 3), ylim = c(-3, 3), pch = 15)
points(x.2 ~ x.1, data = blob_trn, col = blob_trn$classes, pch = 19)
abline(
  a = -coef(mod)[1] / coef(mod)[3],  # intercept: -beta_0 / beta_2
  b = -coef(mod)[2] / coef(mod)[3],  # slope: -beta_1 / beta_2
  lwd = 5, col = "white")
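# one way to verify the analytic boundary: points on the line should get an
# estimated probability of (essentially) 0.5
x1_check = c(-2, 0, 2)
x2_check = -(coef(mod)[1] + coef(mod)[2] * x1_check) / coef(mod)[3]
predict(mod, data.frame(x.1 = x1_check, x.2 = x2_check), type = "response")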
# check performance, misclassification
pred = factor(ifelse(predict(mod, blob_tst) > 0.0, "2", "1"))
mean(blob_tst$classes != pred)
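# a confusion matrix gives a fuller picture than the misclassification rate alone
table(predicted = pred, actual = blob_tst$classes)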
# two feature variables, circle ################################################
# simulate data
set.seed(42)
circle_trn = as_tibble(mlbench.circle(n = 250))
circle_tst = as_tibble(mlbench.circle(n = 1000))
# check data
circle_trn
# check balance
table(circle_trn$classes)
# initial plot
plot(x.2 ~ x.1, data = circle_trn, col = circle_trn$classes, pch = 19)
grid()
# points where we will predict
xy_vals = expand.grid(
  x.1 = seq(from = -1.1, to = 1.1, by = 0.01),
  x.2 = seq(from = -1.1, to = 1.1, by = 0.01)
)
head(xy_vals)
# fit model, bad
mod_bad = glm(classes ~ ., data = circle_trn, family = binomial)
pred_bad_xy = ifelse(predict(mod_bad, xy_vals, type = "response") > 0.5, "lightpink", "lightgrey")
# check predictions on plot
plot(x.2 ~ x.1, data = xy_vals, col = pred_bad_xy,
     xlim = c(-1, 1), ylim = c(-1, 1), pch = 15)
points(x.2 ~ x.1, data = circle_trn, col = circle_trn$classes, pch = 19)
# check performance, accuracy
pred_bad = factor(ifelse(predict(mod_bad, circle_tst) > 0.0, "2", "1"))
mean(circle_tst$classes == pred_bad)
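# baseline for the accuracy above: always predict the majority training class
mean(circle_tst$classes == names(which.max(table(circle_trn$classes))))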
# fit model, good
mod_good = glm(classes ~ poly(x.1, 2) + poly(x.2, 2) + x.1:x.2, data = circle_trn, family = binomial)
pred_good_xy = ifelse(predict(mod_good, xy_vals, type = "response") > 0.5, "lightpink", "lightgrey")
# check predictions on plot
plot(x.2 ~ x.1, data = xy_vals, col = pred_good_xy,
     xlim = c(-1, 1), ylim = c(-1, 1), pch = 15)
points(x.2 ~ x.1, data = circle_trn, col = circle_trn$classes, pch = 19)
# check performance, accuracy
pred_good = factor(ifelse(predict(mod_good, circle_tst) > 0, "2", "1"))
mean(circle_tst$classes == pred_good)
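# inspect the fitted coefficients; the degree-2 terms are what let the decision
# boundary curve (note that poly() uses orthogonal polynomials by default)
summary(mod_good)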