Intro to R and Regularization

Jared P. Lander

July 8th, 2013

A Little About

John Chambers

John Chambers * Bell Labs * 1976

New Zealand

Robert Gentleman & Ross Ihaka * University of Auckland, New Zealand * 1993

R

R Community

For Statisticians

Vectors

# Build an integer vector 1 through 10 with the colon operator.
x <- 1:10
# Printing a vector shows the index of the first element on each line, e.g. [1].
x
##  [1]  1  2  3  4  5  6  7  8  9 10

# seq() builds a sequence with an explicit step: the odd numbers 1 to 19.
y <- seq(from = 1, to = 20, by = 2)
y
##  [1]  1  3  5  7  9 11 13 15 17 19

Vector Addition

# Arithmetic is vectorized: the scalar 1 is recycled across every element of x.
x + 1
##  [1]  2  3  4  5  6  7  8  9 10 11
# Two equal-length vectors add element by element.
x + y
##  [1]  2  5  8 11 14 17 20 23 26 29

Vector Multiplication

# Scalar multiplication: 2 is recycled over all of x.
x * 2
##  [1]  2  4  6  8 10 12 14 16 18 20

# `*` on two vectors is element-wise multiplication, NOT a dot product.
x * y
##  [1]   1   6  15  28  45  66  91 120 153 190

# `%*%` is matrix multiplication; applied to two vectors it gives the
# inner (dot) product, returned as a 1x1 matrix.
x %*% y
##      [,1]
## [1,]  715

# The same dot product computed explicitly, returned as a plain scalar.
sum(x * y)
## [1] 715

Matrices

# A 5x3 matrix filled from 1:15; byrow = TRUE writes values across rows
# (the default would fill down columns instead).
A <- matrix(1:15, ncol = 3, byrow = TRUE)
A
##      [,1] [,2] [,3]
## [1,]    1    2    3
## [2,]    4    5    6
## [3,]    7    8    9
## [4,]   10   11   12
## [5,]   13   14   15

# Assignment copies the values: B holds the same 5x3 matrix as A.
B <- A
B
##      [,1] [,2] [,3]
## [1,]    1    2    3
## [2,]    4    5    6
## [3,]    7    8    9
## [4,]   10   11   12
## [5,]   13   14   15

Matrix Addition

# The scalar 1 is recycled over every entry of the matrix.
A + 1
##      [,1] [,2] [,3]
## [1,]    2    3    4
## [2,]    5    6    7
## [3,]    8    9   10
## [4,]   11   12   13
## [5,]   14   15   16

# Same-shape matrices add entry by entry.
A + B
##      [,1] [,2] [,3]
## [1,]    2    4    6
## [2,]    8   10   12
## [3,]   14   16   18
## [4,]   20   22   24
## [5,]   26   28   30

Matrix Multiplication

# Scalar multiplication of every entry.
A * 2
##      [,1] [,2] [,3]
## [1,]    2    4    6
## [2,]    8   10   12
## [3,]   14   16   18
## [4,]   20   22   24
## [5,]   26   28   30

# `*` on two matrices is element-wise (Hadamard) multiplication,
# not matrix multiplication — here each entry of A is squared.
A * B
##      [,1] [,2] [,3]
## [1,]    1    4    9
## [2,]   16   25   36
## [3,]   49   64   81
## [4,]  100  121  144
## [5,]  169  196  225

Dot Product

# True matrix multiplication: A (5x3) times t(B) (3x5) gives a 5x5 matrix
# whose (i, j) entry is the inner product of row i of A with row j of B.
A %*% t(B)
##      [,1] [,2] [,3] [,4] [,5]
## [1,]   14   32   50   68   86
## [2,]   32   77  122  167  212
## [3,]   50  122  194  266  338
## [4,]   68  167  266  365  464
## [5,]   86  212  338  464  590

Basic Statistics

# Arithmetic mean of the vector x.
mean(x)
## [1] 5.5

# The same mean computed by hand: sum divided by element count.
sum(x)/length(x)
## [1] 5.5

# mean() of a matrix averages over all of its entries.
mean(A)
## [1] 8

Diamonds

# Load ggplot2 for its plotting functions and the bundled diamonds data.
# Use library() rather than require(): library() errors immediately if the
# package is missing, while require() only returns FALSE and lets the
# script continue to a more confusing failure later.
library(ggplot2)
# Peek at the first six rows of the diamonds data set.
head(diamonds)
##   carat       cut color clarity depth table price    x    y    z
## 1  0.23     Ideal     E     SI2  61.5    55   326 3.95 3.98 2.43
## 2  0.21   Premium     E     SI1  59.8    61   326 3.89 3.84 2.31
## 3  0.23      Good     E     VS1  56.9    65   327 4.05 4.07 2.31
## 4  0.29   Premium     I     VS2  62.4    58   334 4.20 4.23 2.63
## 5  0.31      Good     J     SI2  63.3    58   335 4.34 4.35 2.75
## 6  0.24 Very Good     J    VVS2  62.8    57   336 3.94 3.96 2.48

Histogram

# Histogram of diamond carat weight (binned counts).
ggplot(diamonds, aes(x = carat)) + geom_histogram()

Density

# Kernel density estimate of price, drawn as a filled grey curve.
ggplot(diamonds, aes(x = price)) + geom_density(fill = "grey", color = "grey")

Scatterplot

# Price versus carat, with points colored by the diamond's color grade.
ggplot(diamonds, aes(x = carat, y = price, color = color)) + geom_point()

Simple Model

# Log-log regression of price on carat, plus the factors cut, color and
# clarity. These are ordered factors, so R expands each into polynomial
# contrasts — hence the .L (linear), .Q (quadratic), .C (cubic) and ^4...
# terms in the coefficient table below.
mod1 <- lm(log(price) ~ log(carat) + cut + color + clarity, data = diamonds)
summary(mod1)
## 
## Call:
## lm(formula = log(price) ~ log(carat) + cut + color + clarity, 
##     data = diamonds)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.0111 -0.0864 -0.0002  0.0834  1.9478 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  8.45703    0.00117 7242.23  < 2e-16 ***
## log(carat)   1.88372    0.00113 1668.75  < 2e-16 ***
## cut.L        0.12071    0.00235   51.28  < 2e-16 ***
## cut.Q       -0.03511    0.00207  -16.95  < 2e-16 ***
## cut.C        0.01348    0.00180    7.49  6.8e-14 ***
## cut^4       -0.00156    0.00144   -1.08    0.278    
## color.L     -0.43958    0.00203 -216.83  < 2e-16 ***
## color.Q     -0.09562    0.00186  -51.34  < 2e-16 ***
## color.C     -0.01478    0.00174   -8.48  < 2e-16 ***
## color^4      0.01185    0.00160    7.40  1.3e-13 ***
## color^5     -0.00220    0.00151   -1.46    0.146    
## color^6      0.00239    0.00138    1.74    0.082 .  
## clarity.L    0.91683    0.00358  256.27  < 2e-16 ***
## clarity.Q   -0.24304    0.00333  -72.98  < 2e-16 ***
## clarity.C    0.13240    0.00285   46.39  < 2e-16 ***
## clarity^4   -0.06610    0.00228  -28.96  < 2e-16 ***
## clarity^5    0.02742    0.00186   14.71  < 2e-16 ***
## clarity^6   -0.00182    0.00162   -1.12    0.263    
## clarity^7    0.03353    0.00143   23.41  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.134 on 53921 degrees of freedom
## Multiple R-squared:  0.983,  Adjusted R-squared:  0.983 
## F-statistic: 1.69e+05 on 18 and 53921 DF,  p-value: <2e-16

Coefficient Plot

# coefplot visualizes model coefficients with uncertainty bars.
# library() (not require()) so a missing package fails loudly here
# rather than producing a confusing error at the coefplot() call.
library(coefplot)
# Plot mod1's coefficients, omitting the intercept for readability.
coefplot(mod1, intercept = FALSE)

Modern Problems

Curse of Dimensionality

(Or Blessing)

High Dimensional Data

The Stanford Guys

Elastic Net

Elastic Net



\min_{\beta_{0},\beta \in \mathbb{R}^{p+1}} \left[ \frac{1}{2N} \sum_{i=1}^N \left( y_i - \beta_0 -x_i^T\beta \right)^2 + \lambda P_{\alpha} \left(\beta \right) \right]
where


P_{\alpha}\left(\beta\right) = \left(1 - \alpha\right) \frac{1}{2} \lVert \Gamma\beta \rVert_{\ell_2}^2 + \alpha \lVert \Gamma\beta \rVert_{\ell_1}

When to Use the Elastic Net

In R

glmnet

Types of Models

glmnet in Action

# library() instead of require(): fail immediately if a package is absent.
library(ElemStatLearn)  # provides the spam data set
# 4601 messages, 57 predictors plus the spam/email label.
dim(spam)
## [1] 4601   58

library(useful)  # topright()/bottomright()/corner() helpers for peeking at data
topright(spam)
##    A.54  A.55 A.56 A.57 spam
## 1 0.000 3.756   61  278 spam
## 2 0.048 5.114  101 1028 spam
## 3 0.010 9.821  485 2259 spam
## 4 0.000 3.537   40  191 spam
## 5 0.000 3.537   40  191 spam
bottomright(spam)
##      A.54  A.55 A.56 A.57  spam
## 4597    0 1.142    3   88 email
## 4598    0 1.555    4   14 email
## 4599    0 1.404    6  118 email
## 4600    0 1.147    5   78 email
## 4601    0 1.250    5   40 email

Needs Matrices

# glmnet takes a numeric matrix and a response vector rather than a
# formula + data.frame, so build them with useful::build.x()/build.y().
# The `- 1` in the formula drops the intercept column from the design matrix.
spamX <- build.x(spam ~ . - 1, data = spam)
spamY <- build.y(spam ~ . - 1, data = spam)

# The predictors are a plain matrix; the response is a two-level factor.
class(spamX)
## [1] "matrix"
class(spamY)
## [1] "factor"

# Top-left corner of the design matrix.
corner(spamX)
##    A.1  A.2  A.3 A.4  A.5
## 1 0.00 0.64 0.64   0 0.32
## 2 0.21 0.28 0.50   0 0.14
## 3 0.06 0.00 0.71   0 1.23
## 4 0.00 0.00 0.00   0 0.63
## 5 0.00 0.00 0.00   0 0.63
head(spamY)
## [1] spam spam spam spam spam spam
## Levels: email spam

Fit Model

# library() errors immediately if glmnet is not installed, unlike
# require(), which only returns FALSE.
library(glmnet)

# Fit the regularization path for a penalized logistic regression
# (family = "binomial") over a grid of lambda values.
# NOTE(review): alpha is left at its default here — per the glmnet docs
# that default is 1, i.e. the lasso.
mod2 <- glmnet(x = spamX, y = spamY, family = "binomial")

The Coefficients

# Coefficients are stored as a sparse matrix (Matrix package's dgCMatrix),
# since most entries along the path are exactly zero.
class(mod2$beta)
## [1] "dgCMatrix"
## attr(,"package")
## [1] "Matrix"
# 57 predictors by 91 lambda values along the regularization path.
dim(mod2$beta)
## [1] 57 91
# Early columns (largest lambda) are all zero; coefficients become
# nonzero as the penalty relaxes — A.5 enters by step s8 below.
corner(as.matrix(mod2$beta), c = 10)
##     s0 s1 s2 s3 s4 s5 s6 s7      s8      s9
## A.1  0  0  0  0  0  0  0  0 0.00000 0.00000
## A.2  0  0  0  0  0  0  0  0 0.00000 0.00000
## A.3  0  0  0  0  0  0  0  0 0.00000 0.00000
## A.4  0  0  0  0  0  0  0  0 0.00000 0.00000
## A.5  0  0  0  0  0  0  0  0 0.01465 0.04532

Coefficient Path

# Coefficient path plot: one curve per coefficient as the penalty is relaxed.
plot(mod2)

Cross Validation

Choose Optimal \lambda

# 5-fold cross validation over the lambda path to pick the optimal penalty.
# The argument is spelled out as `nfolds`: the original `nfold` only worked
# via R's partial argument matching, which is fragile and worth avoiding.
mod3 <- cv.glmnet(x = spamX, y = spamY, family = "binomial", nfolds = 5)

Cross Validation Result

# Cross-validation curve: error versus lambda with error bars.
plot(mod3)


# Lambda giving the minimum cross-validated error
mod3$lambda.min
## [1] 0.0004428
# Largest lambda whose error is within one standard error of that minimum
# (a more conservative, more regularized choice)
mod3$lambda.1se
## [1] 0.001788

More Cross Validation

Choose the best \alpha through another layer of cross-validation

glmnet Benefits

glmnet Drawbacks

Conclusions

About Me

Jared P. Lander

Owner of JP Lander Consulting

Adjunct Professor at Columbia University

Organizer of New York Open Statistical Programming Meetup

Author of R for Everyone (September 2013)

The Tools