Jared P. Lander
July 8th, 2013
x <- 1:10
x
## [1] 1 2 3 4 5 6 7 8 9 10
y <- seq(from = 1, to = 20, by = 2)
y
## [1] 1 3 5 7 9 11 13 15 17 19
x * 2
## [1] 2 4 6 8 10 12 14 16 18 20
x * y
## [1] 1 6 15 28 45 66 91 120 153 190
x %*% y
## [,1]
## [1,] 715
sum(x * y)
## [1] 715
A <- matrix(1:15, ncol = 3, byrow = TRUE)
A
## [,1] [,2] [,3]
## [1,] 1 2 3
## [2,] 4 5 6
## [3,] 7 8 9
## [4,] 10 11 12
## [5,] 13 14 15
B <- A
B
## [,1] [,2] [,3]
## [1,] 1 2 3
## [2,] 4 5 6
## [3,] 7 8 9
## [4,] 10 11 12
## [5,] 13 14 15
A + 1
## [,1] [,2] [,3]
## [1,] 2 3 4
## [2,] 5 6 7
## [3,] 8 9 10
## [4,] 11 12 13
## [5,] 14 15 16
A + B
## [,1] [,2] [,3]
## [1,] 2 4 6
## [2,] 8 10 12
## [3,] 14 16 18
## [4,] 20 22 24
## [5,] 26 28 30
A * 2
## [,1] [,2] [,3]
## [1,] 2 4 6
## [2,] 8 10 12
## [3,] 14 16 18
## [4,] 20 22 24
## [5,] 26 28 30
A * B
## [,1] [,2] [,3]
## [1,] 1 4 9
## [2,] 16 25 36
## [3,] 49 64 81
## [4,] 100 121 144
## [5,] 169 196 225
A %*% t(B)
## [,1] [,2] [,3] [,4] [,5]
## [1,] 14 32 50 68 86
## [2,] 32 77 122 167 212
## [3,] 50 122 194 266 338
## [4,] 68 167 266 365 464
## [5,] 86 212 338 464 590
require(ggplot2)
head(diamonds)
## carat cut color clarity depth table price x y z
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.29 Premium I VS2 62.4 58 334 4.20 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
ggplot(diamonds, aes(x = carat)) + geom_histogram()
ggplot(diamonds, aes(x = price)) + geom_density(fill = "grey", color = "grey")
ggplot(diamonds, aes(x = carat, y = price, color = color)) + geom_point()
mod1 <- lm(log(price) ~ log(carat) + cut + color + clarity, data = diamonds)
summary(mod1)
##
## Call:
## lm(formula = log(price) ~ log(carat) + cut + color + clarity,
## data = diamonds)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.0111 -0.0864 -0.0002 0.0834 1.9478
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.45703 0.00117 7242.23 < 2e-16 ***
## log(carat) 1.88372 0.00113 1668.75 < 2e-16 ***
## cut.L 0.12071 0.00235 51.28 < 2e-16 ***
## cut.Q -0.03511 0.00207 -16.95 < 2e-16 ***
## cut.C 0.01348 0.00180 7.49 6.8e-14 ***
## cut^4 -0.00156 0.00144 -1.08 0.278
## color.L -0.43958 0.00203 -216.83 < 2e-16 ***
## color.Q -0.09562 0.00186 -51.34 < 2e-16 ***
## color.C -0.01478 0.00174 -8.48 < 2e-16 ***
## color^4 0.01185 0.00160 7.40 1.3e-13 ***
## color^5 -0.00220 0.00151 -1.46 0.146
## color^6 0.00239 0.00138 1.74 0.082 .
## clarity.L 0.91683 0.00358 256.27 < 2e-16 ***
## clarity.Q -0.24304 0.00333 -72.98 < 2e-16 ***
## clarity.C 0.13240 0.00285 46.39 < 2e-16 ***
## clarity^4 -0.06610 0.00228 -28.96 < 2e-16 ***
## clarity^5 0.02742 0.00186 14.71 < 2e-16 ***
## clarity^6 -0.00182 0.00162 -1.12 0.263
## clarity^7 0.03353 0.00143 23.41 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.134 on 53921 degrees of freedom
## Multiple R-squared: 0.983, Adjusted R-squared: 0.983
## F-statistic: 1.69e+05 on 18 and 53921 DF, p-value: <2e-16
require(coefplot)
coefplot(mod1, intercept = FALSE)
glmnet
require(ElemStatLearn)
dim(spam)
## [1] 4601 58
require(useful)
topright(spam)
## A.54 A.55 A.56 A.57 spam
## 1 0.000 3.756 61 278 spam
## 2 0.048 5.114 101 1028 spam
## 3 0.010 9.821 485 2259 spam
## 4 0.000 3.537 40 191 spam
## 5 0.000 3.537 40 191 spam
bottomright(spam)
## A.54 A.55 A.56 A.57 spam
## 4597 0 1.142 3 88 email
## 4598 0 1.555 4 14 email
## 4599 0 1.404 6 118 email
## 4600 0 1.147 5 78 email
## 4601 0 1.250 5 40 email
spamX <- build.x(spam ~ . - 1, data = spam)
spamY <- build.y(spam ~ . - 1, data = spam)
class(spamX)
## [1] "matrix"
class(spamY)
## [1] "factor"
corner(spamX)
## A.1 A.2 A.3 A.4 A.5
## 1 0.00 0.64 0.64 0 0.32
## 2 0.21 0.28 0.50 0 0.14
## 3 0.06 0.00 0.71 0 1.23
## 4 0.00 0.00 0.00 0 0.63
## 5 0.00 0.00 0.00 0 0.63
head(spamY)
## [1] spam spam spam spam spam spam
## Levels: email spam
require(glmnet)
mod2 <- glmnet(x = spamX, y = spamY, family = "binomial")
class(mod2$beta)
## [1] "dgCMatrix"
## attr(,"package")
## [1] "Matrix"
dim(mod2$beta)
## [1] 57 91
corner(as.matrix(mod2$beta), c = 10)
## s0 s1 s2 s3 s4 s5 s6 s7 s8 s9
## A.1 0 0 0 0 0 0 0 0 0.00000 0.00000
## A.2 0 0 0 0 0 0 0 0 0.00000 0.00000
## A.3 0 0 0 0 0 0 0 0 0.00000 0.00000
## A.4 0 0 0 0 0 0 0 0 0.00000 0.00000
## A.5 0 0 0 0 0 0 0 0 0.01465 0.04532
plot(mod2)
Choose Optimal
mod3 <- cv.glmnet(x = spamX, y = spamY, family = "binomial", nfold = 5)
plot(mod3)
# Minimum Error
mod3$lambda.min
## [1] 0.0004428
mod3$lambda.1se
## [1] 0.001788
Choose best through another layer of cross validation
Owner of JP Lander Consulting
Adjunct Professor at Columbia University
Organizer of New York Open Statistical Programming Meetup
Author of R for Everyone (September 2013)