\[ \min_{\beta \in \mathbb{R}^{p+1}} \left[ \frac{1}{2N} \sum_{i=1}^N \left( y_i - x_i^T\beta \right)^2 + \lambda ||\beta||_{1} \right] \]
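The first term is squared error averaged over the \(N\) observations (the extra factor of 2 is a convenience that cancels when differentiating), and the second is the L1 penalty on the coefficients. That penalty is what makes this the lasso: as \(\lambda\) grows it shrinks coefficients toward zero and sets some exactly to zero, performing variable selection.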
ames <- AmesHousing::make_ames()
set.seed(4595)
data_split <- initial_split(ames, strata='Sale_Price')
ames_train <- training(data_split)
ames_test <- testing(data_split)
theFormula <- Sale_Price ~ Longitude + Latitude + Lot_Frontage + Lot_Area +
    Street + Utilities + Bldg_Type + Overall_Qual + Year_Built + Foundation +
    Exter_Cond + Heating + Central_Air + First_Flr_SF + Second_Flr_SF +
    Full_Bath + Half_Bath + Kitchen_Qual + TotRms_AbvGrd + Garage_Type +
    Garage_Cars + Open_Porch_SF + Pool_Area + Fence
basic_rec <- recipe(theFormula, data = ames_train) %>%
    # collapse some levels into "other"
    step_other(all_nominal(), threshold = 0.01) %>%
    # remove variables with zero variance
    step_zv(all_predictors()) %>%
    # center and scale numeric variables
    step_center(all_numeric()) %>%
    step_scale(all_numeric()) %>%
    # turn categoricals into dummies, don't drop baseline
    step_dummy(all_nominal(), one_hot=TRUE) %>%
    # ensure there is an intercept
    step_intercept()
prepped <- basic_rec %>% prep(training=ames_train, retain=TRUE)
prepped
Data Recipe

Inputs:

      role #variables
   outcome          1
 predictor         24

Training data contained 2199 data points and no missing data.

Operations:

Collapsing factor levels for Street, Utilities, Bldg_Type, ... [trained]
Zero variance filter removed no terms [trained]
Centering for Longitude, Latitude, ... [trained]
Scaling for Longitude, Latitude, ... [trained]
Dummy variables from Street, Utilities, Bldg_Type, ... [trained]
Adding intercept
Many Ways to Compute
# train matrices
x_train <- prepped %>% juice(all_predictors(), composition='dgCMatrix')
y_train <- prepped %>% juice(all_outcomes(), composition='matrix')
mod_glmnet <- glmnet(x=x_train, y=y_train, family='gaussian', alpha=1, intercept=FALSE, standardize=FALSE)
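A single call fits the model for an entire sequence of \(\lambda\) values, which is why the plots that follow can trace every coefficient's path as the penalty relaxes.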
plot(mod_glmnet, xvar='lambda')
coefpath(mod_glmnet)
coefplot(mod_glmnet, sort='magnitude', lambda=0.0004)
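The \(\lambda\) used here is purely illustrative; in practice cross-validation picks it. A minimal sketch with cv.glmnet, reusing the same settings (the fold count is an arbitrary choice):

# cross-validate over the lambda path to choose the penalty
mod_cv <- cv.glmnet(x=x_train, y=y_train, family='gaussian',
                    alpha=1, intercept=FALSE, standardize=FALSE,
                    nfolds=5)
mod_cv$lambda.min    # lambda with the smallest cross-validated error
mod_cv$lambda.1se    # a more heavily penalized, more conservative choice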
# train matrices
x_train <- prepped %>% juice(all_predictors(), composition='dgCMatrix')
y_train <- prepped %>% juice(all_outcomes(), composition='matrix')
xg_train <- xgb.DMatrix(data=x_train, label=y_train)
mod_xgboost <- xgb.train(
    data=xg_train,
    booster='gblinear',
    alpha=0.0004,
    objective='reg:linear',
    nrounds=20
)
coefplot(mod_xgboost, sort='magnitude')
coefplot(mod_xgboost, sort='magnitude', plot=FALSE) %>%
    filter(abs(Value) > 1E-3) %>%
    coefplot()
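The weights can also be pulled straight from the booster; for a gblinear model, xgb.importance should return the fitted weight for each feature rather than split-based gain (a sketch, not part of the original workflow):

# one row per feature with its fitted weight (linear booster)
xgb.importance(model=mod_xgboost)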
# train matrices
x_train_dense <- prepped %>% juice(all_predictors(), composition='matrix')
y_train_dense <- prepped %>% juice(all_outcomes(), composition='matrix')
mod_lars <- lars(x=x_train_dense, y=y_train_dense, type='lasso', normalize=FALSE, intercept=FALSE)
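Like glmnet, lars computes the entire sequence of solutions in one call, so the path can be inspected with the package's built-in plot method:

# coefficient paths across the LAR/lasso steps
plot(mod_lars)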
coef(mod_lars, s=50) %>%
    as.data.frame() %>%
    setNames('Value') %>%
    tibble::rownames_to_column('Coefficient') %>%
    filter(Value != 0) %>%
    mutate(HighInner=NA_real_, LowInner=NA_real_,
           HighOuter=NA_real_, LowOuter=NA_real_,
           Model='model') %>%
    mutate(Coefficient=reorder(Coefficient, Value)) %>%
    coefplot()
stan_recipe <- recipe(theFormula, data = ames_train) %>%
    # collapse some levels into "other"
    step_other(all_nominal(), threshold = 0.01) %>%
    # remove variables with zero variance
    step_zv(all_predictors()) %>%
    # center and scale numeric variables
    step_center(all_numeric()) %>%
    step_scale(all_numeric()) %>%
    prep(training=ames_train, retain=TRUE)

train_stan <- stan_recipe %>% juice(composition='tibble')
mod_stan <- stan_glm(theFormula, data=train_stan,
                     family=gaussian, prior=lasso(),
                     iter=1000)
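Here the penalty enters as a prior rather than a constraint: prior=lasso() puts a Laplace-style prior on the coefficients, and instead of optimizing a penalized loss, MCMC explores the posterior. The priors actually applied (after any autoscaling) can be checked:

# report the priors rstanarm used for this fit
prior_summary(mod_stan)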
mcmc_intervals(as.array(mod_stan))
gridExtra::grid.arrange(
    mcmc_areas(as.array(mod_stan), pars=c('Longitude', 'Latitude')),
    mcmc_areas(as.array(mod_stan),
               pars=stringr::str_subset(names(mod_stan$coefficients), 'Overall_Qual')),
    nrow=1
)
\[ \min_{\beta \in \mathbb{R}^{p+1}} \left[ \frac{1}{2N} \sum_{i=1}^N \left( y_i - x_i^T\beta \right)^2 + \lambda ||\beta||_{1} \right] \]
\[ \min_{\beta \in \mathbb{R}^{p+1}} \left[ ||Y - X\beta||_2^2 + \lambda||\beta||_1 \right] \]
\[ \min_{\beta \in \mathbb{R}^{p+1}} ||Y - X\beta||_2^2 \\ \text{subject to } ||\beta||_1 \le t \]
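The penalized and constrained forms are equivalent: every \(\lambda\) in the first corresponds to some budget \(t\) in the second that yields the same solution, which is what lets a quadratic programming solver compute the lasso.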
\[ \begin{aligned} (Y - X\beta)^T(Y - X\beta) &= (Y^T - \beta^TX^T)(Y - X\beta) \\ &= Y^TY - Y^TX\beta - \beta^TX^TY + \beta^TX^TX\beta \\ &= Y^TY - 2Y^TX\beta + \beta^TX^TX\beta \\ &= Y^TY + \beta^TX^TX\beta - 2Y^TX\beta \end{aligned} \]
\[ \min_{x} \frac{1}{2} x^TVV^Tx + c^Tx \\ Ax = b \\ l \preceq x \preceq u \]
\[ \min_{\beta^+, \beta^-} \left[(\beta^+ - \beta^-)^TX^TX(\beta^+ - \beta^-) - 2Y^TX(\beta^+ - \beta^-) \right] \\ \text{subject to } \sum_{j=1}^p (\beta_j^+ + \beta_j^-) \le t \\ \beta^+ \succeq 0, \beta^- \succeq 0 \]
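Matching this to the canonical form above: stack the variables as \(x = (\beta^+, \beta^-, s)\), where \(s\) is a slack variable, and set

\[ V^T = \sqrt{2}\begin{bmatrix}X & -X & 0\end{bmatrix}, \qquad c = \begin{pmatrix}-2X^TY \\ 2X^TY \\ 0\end{pmatrix} \]

so that \(\frac{1}{2}x^TVV^Tx = (\beta^+ - \beta^-)^TX^TX(\beta^+ - \beta^-)\) and \(c^Tx = -2Y^TX(\beta^+ - \beta^-)\). These are exactly the Vmat and dvec the function below constructs.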
lasso.qp <- function(x, y, t) {
    y <- as.vector(y)
    p <- ncol(x)
    # map X onto the positive and negative parts of beta, plus a slack column
    Vn <- x %*% cbind(diag(p), -diag(p), 0)
    # compute -X^T Y for the linear term
    YX <- colSums(x * -y)
    Zn <- c(2*YX, -2*YX, 0)
    # unpenalized fit gives upper bounds for the coefficients
    mod <- lm.fit(x, y)
    # replace any NAs with the maximum of all coefs
    b_ols <- mod$coefficients %>%
        replace_na(max(mod$coefficients, na.rm=TRUE))
    # create matrices and vectors
    u <- c(abs(b_ols), abs(b_ols), sum(abs(b_ols)))
    A <- matrix(c(rep(1, 2*p), 1), nrow=1)
    b <- c(min(t, sum(abs(b_ols))))
    # solve the quadratic program
    soln <- LowRankQP::LowRankQP(Vmat=sqrt(2)*t(Vn), dvec=Zn,
                                 Amat=A, bvec=b, uvec=u,
                                 method="LU")
    # recover beta = beta+ - beta-
    return(round(soln$alpha[1:p] - soln$alpha[(p+1):(2*p)], digits=5))
}
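Two details of the setup: the constraint row in Amat ends with a 1 for the slack variable, converting the inequality \(\sum_j(\beta_j^+ + \beta_j^-) \le t\) into the equality \(Ax = b\) the solver expects, and b is capped at \(\sum_j|\beta_j^{OLS}|\), since any larger budget leaves the constraint slack and simply returns ordinary least squares. The OLS coefficients likewise supply the box bounds in uvec.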
# train matrices
x_train_dense <- prepped %>% juice(all_predictors(), composition='matrix')
y_train_dense <- prepped %>% juice(all_outcomes(), composition='matrix')
mod_quad <- lasso.qp(x=x_train_dense, y=y_train_dense, t=5)
LowRankQP CONVERGED IN 25 ITERATIONS

    Primal Feasibility    = 7.6784336e-11
    Dual Feasibility      = 8.8817842e-16
    Complementarity Value = 4.2747336e-09
    Duality Gap           = 4.2702624e-09
    Termination Condition = 2.3446174e-12
data_frame(Coefficient=colnames(x_train_dense), Value=mod_quad) %>%
    filter(Value != 0) %>%
    mutate(HighInner=NA_real_, LowInner=NA_real_,
           HighOuter=NA_real_, LowOuter=NA_real_,
           Model='model') %>%
    mutate(Coefficient=reorder(Coefficient, Value)) %>%
    coefplot()
# train matrices
x_train_dense <- prepped %>% juice(all_predictors(), composition='matrix')
y_train_dense <- prepped %>% juice(all_outcomes(), composition='matrix')
mod_keras <- keras_model_sequential() %>%
    layer_dense(units = 1, activation = "linear",
                kernel_regularizer=regularizer_l1(l=0.02),
                input_shape = dim(x_train_dense)[2])
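A single linear unit with an L1 penalty on its kernel is the lasso yet again: the unit computes \(x^T\beta\), and regularizer_l1 adds \(\lambda\sum_j|\beta_j|\) to whatever loss is compiled next, with l=0.02 playing the role of \(\lambda\); the only real difference is that the objective is minimized by minibatch gradient descent rather than coordinate descent or a path algorithm.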
mod_keras
Model
________________________________________________________________________________
Layer (type)                     Output Shape                  Param #
================================================================================
dense_1 (Dense)                  (None, 1)                     64
================================================================================
Total params: 64
Trainable params: 64
Non-trainable params: 0
________________________________________________________________________________
mod_keras %>% compile(
    loss = "mse",
    optimizer = optimizer_rmsprop(),
    metrics = list("mean_absolute_error")
)
history <- mod_keras %>% fit(
    x=x_train_dense, y=y_train_dense,
    epochs = 30, batch_size=64,
    validation_split = 0.1,
    view_metrics=TRUE,
    callbacks=list(
        # monitor validation loss; val_acc does not exist for a regression model
        callback_early_stopping(monitor='val_loss', patience=10),
        callback_reduce_lr_on_plateau(monitor='val_loss', factor=0.1, patience=5),
        callback_tensorboard("logs/run_lasso")
    )
)
plot(history)
get_weights(mod_keras)[[1]] %>%
    as_data_frame() %>%
    rename(Value=V1) %>%
    mutate(Coefficient=colnames(x_train_dense)) %>%
    filter(abs(Value) > 1E-3) %>%
    mutate(HighInner=NA_real_, LowInner=NA_real_,
           HighOuter=NA_real_, LowOuter=NA_real_,
           Model='model') %>%
    mutate(Coefficient=reorder(Coefficient, Value)) %>%
    coefplot()
# test matrices
x_test <- prepped %>% bake(all_predictors(), composition='matrix', newdata=ames_test)
y_test <- prepped %>% bake(all_outcomes(), composition='matrix', newdata=ames_test)
predictions <- list(
    glmnet=predict(mod_glmnet, newx=x_test, s=0.0004),
    xgboost=predict(mod_xgboost, newdata=x_test[, mod_xgboost$feature_names]),
    lars=predict(mod_lars, newx=x_test, s=50)$fit,
    stan=predict(mod_stan, newdata=stan_recipe %>% bake(newdata=ames_test, composition='tibble')),
    quad=x_test %*% mod_quad,
    keras=predict(mod_keras, x=x_test)
)
predictions <- predictions %>%
    map(~data.frame(Pred=as.numeric(.))) %>%
    map_df(. %>%
               mutate(Actual=as.numeric(y_test)) %>%
               mutate(.Resid=Actual-Pred),
           .id='Model')
ggplot(predictions, aes(x=Actual, y=Pred, color=Model)) +
    geom_abline(color='grey50', linetype=2) +
    geom_point(alpha=1/2, shape=1, size=1) +
    geom_smooth() +
    facet_wrap(~Model) +
    theme(legend.position='none')
error_measures <- predictions %>%
    group_by(Model) %>%
    summarize(MSE=mean(.Resid^2)) %>%
    arrange(MSE) %>%
    mutate(Model=reorder(Model, MSE))
ggplot(error_measures, aes(x=Model, y=MSE)) + geom_col(width=0.5, fill='lightblue')
glmnet
xgboost
lars
stan
keras
[1] "AmesHousing" "bayesplot" "coefplot" "distr" [5] "DT" "ggplot2" "glmnet" "here" [9] "keras" "knitr" "lars" "leaflet" [13] "leaflet.extras" "LowRankQP" "magrittr" "purrr" [17] "recipes" "rsample" "rstanarm" "stringr" [21] "tibble" "useful" "xgboost"
Thank You