Data Science Specialization Course
Videos: search for Brian Caffo on YouTube
Code: https://github.com/DataScienceSpecialization/courses
Notes: http://datasciencespecialization.github.io/
ISLR Videos and Notes: https://www.dataschool.io/15-hours-of-expert-machine-learning-videos/
library(UsingR)
library(ggplot2)
# generate some fake data from a known linear model
set.seed(1234)
beta <- 2        # true slope
intercept <- 10  # true intercept
n <- 50          # number of observations
m <- 10          # noise mean
s <- 10          # noise standard deviation
noise <- rnorm(n, mean = m, sd = s)
observed <- runif(n, min = 1, max = 100)
outcome <- intercept + beta * observed + noise
fake <- data.frame(observed, outcome)
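# a quick sanity check on the simulated data frame
str(fake)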
# plot the data
plot(fake$observed, fake$outcome,
     xlab = "observed (units)",
     ylab = "outcome (units)",
     bg = "lightblue",
     col = "black", cex = 1.1, pch = 21, frame = FALSE)
# calculate the coefficients of the linear model
fit <- lm(outcome ~ observed, data = fake)
# plot the regression line and the predicted points
abline(fit, lwd = 2)
points(fake$observed, predict(fit), pch = 19, col = "red")
# examine the coefficients and the generated model
fit
##
## Call:
## lm(formula = outcome ~ observed, data = fake)
##
## Coefficients:
## (Intercept)     observed
##      14.529        2.018
coef(fit)
## (Intercept)    observed
##   14.528576    2.017873
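# as a check, the least squares estimates can be reproduced by hand using the
# identities slope = cor(y, x) * sd(y) / sd(x) and intercept = mean(y) - slope * mean(x)
beta1 <- cor(fake$outcome, fake$observed) * sd(fake$outcome) / sd(fake$observed)
beta0 <- mean(fake$outcome) - beta1 * mean(fake$observed)
c(beta0, beta1)  # should match coef(fit) above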
summary(fit)
##
## Call:
## lm(formula = outcome ~ observed, data = fake)
##
## Residuals:
##     Min      1Q  Median      3Q     Max
## -18.365  -5.018  -1.018   4.001  28.201
##
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  14.5286     2.5597   5.676 7.81e-07 ***
## observed      2.0179     0.0423  47.703  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.926 on 48 degrees of freedom
## Multiple R-squared:  0.9793, Adjusted R-squared:  0.9789
## F-statistic:  2276 on 1 and 48 DF,  p-value: < 2.2e-16
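# the key summary quantities can be recomputed directly as a cross-check
res <- resid(fit)
rse <- sqrt(sum(res^2) / fit$df.residual)  # residual standard error (df = n - 2)
r2 <- 1 - sum(res^2) / sum((fake$outcome - mean(fake$outcome))^2)  # R-squared
c(rse, r2)  # should match ~8.926 and ~0.9793 in the summary above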
# plot(fit) produces the following six diagnostic plots:
# 1) a plot of residuals against fitted values (residuals should be uncorrelated with the fitted values),
# 2) a Scale-Location plot of sqrt(| residuals |) against fitted values,
# 3) a Normal Q-Q plot (residuals (error terms) are assumed to follow a normal distribution for many tests),
# 4) a plot of Cook's distances versus row labels,
# 5) a plot of residuals against leverages, and
# 6) a plot of Cook's distances against leverage/(1-leverage).
# see methods(plot) then ?plot.lm for more
plot(fit, which = c(1:6))
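# the quantities behind plots 4-6 are also available directly
head(cooks.distance(fit))  # influence of each observation on the fitted coefficients
head(hatvalues(fit))       # leverage of each observation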
# the I() function lets you transform a variable in-line within the lm() call:
# fit2 <- lm(outcome ~ I(observed - mean(observed)), data = fake)
# equivalently, center the predictor explicitly:
observed.c <- observed - mean(observed)
fit2 <- lm(outcome ~ observed.c, data = fake)
coef(fit2)
## (Intercept) observed.c
## 120.756980 2.017873
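# with a mean-centered predictor the intercept equals the mean outcome
mean(fake$outcome)  # should match the intercept of fit2 (~120.757)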
# plot the centered data
plot(observed.c, fake$outcome,
     xlab = "observed, centered (units)",
     ylab = "outcome (units)",
     bg = "lightblue",
     col = "black", cex = 1.1, pch = 21, frame = FALSE)
# plot the regression line and the predicted points
abline(fit2, lwd = 2)
points(observed.c, predict(fit2), pch = 19, col = "red")
# examine the coefficients and the generated model
fit2
##
## Call:
## lm(formula = outcome ~ observed.c, data = fake)
##
## Coefficients:
## (Intercept) observed.c
## 120.757 2.018
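# centering shifts only the intercept; the slope estimate is unchanged
all.equal(unname(coef(fit)[2]), unname(coef(fit2)[2]))  # should be TRUE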
# new predictor values at which we want predictions from the fitted model
x <- c(10, 50, 90)
# newdata must be a data frame with a column named after the predictor variable
predict(fit, newdata = data.frame(observed = x))
## 1 2 3
## 34.7073 115.4222 196.1371
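# predict() can also return interval estimates around the point predictions
predict(fit, newdata = data.frame(observed = x), interval = "confidence")  # CI for the mean response
predict(fit, newdata = data.frame(observed = x), interval = "prediction")  # wider interval for a new observation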
ggplot2 cheatsheet: https://www.rstudio.com/wp-content/uploads/2015/03/ggplot2-cheatsheet.pdf
g <- ggplot(fake, aes(x=observed, y=outcome))
g <- g + xlab("observed (units)")
g <- g + ylab("outcome (units)")
g <- g + geom_point(size = 2, colour = "black", alpha = 0.4)
g <- g + geom_point(size = 1, colour = "blue", alpha = 0.2)
g <- g + geom_smooth(method="lm", colour="black")
g
Interactive version via plotly: https://plot.ly/ggplot2/geom_abline/
library(plotly)
p <- ggplotly(g)
p
# extract the residuals (two equivalent ways)
e <- resid(fit)
# or
fit$residuals
## 1 2 3 4 5 6
## -6.6798441 7.2279911 15.8020713 -18.3647334 9.5081587 9.9378445
## 7 8 9 10 11 12
## -0.5682171 -0.2427242 -0.9616028 -3.5152021 -0.5805001 -4.7086154
## 13 14 15 16 17 18
## -3.9904686 5.8825933 14.6598285 2.7350679 -1.3301476 -4.1523455
## 19 20 21 22 23 24
## -3.1366371 28.2013977 5.4775004 -1.0740435 -0.7117815 8.3820214
## 25 26 27 28 29 30
## -2.3438252 -9.5300548 10.7560202 -5.6734718 4.4228302 -4.4693968
## 31 32 33 34 35 36
## 14.7739582 -0.4243626 -1.8663316 -0.3075739 -12.4551867 -7.0503589
## 37 38 39 40 41 42
## -17.9537706 -9.0140349 1.3927993 -0.7433233 19.0589422 -6.9733317
## 43 44 45 46 47 48
## -3.6740678 1.7955705 -5.1215069 -5.3418585 -6.9318270 -8.0677362
## 49 50
## -1.5201453 -0.5355678
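# two identities of least squares residuals worth verifying numerically
sum(e)                  # ~0: residuals sum to zero when the model has an intercept
sum(e * fake$observed)  # ~0: residuals are orthogonal to the predictor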