library(babynames)
babynames %>% filter(year >= 1980 & sex == 'M')
# A tibble: 428,872 x 5 year sex name n prop <dbl> <chr> <chr> <int> <dbl> 1 1980 M Michael 68693 0.0370 2 1980 M Christopher 49092 0.0265 3 1980 M Jason 48173 0.0260 4 1980 M David 41923 0.0226 5 1980 M James 39325 0.0212 6 1980 M Matthew 37860 0.0204 7 1980 M Joshua 36060 0.0194 8 1980 M John 35279 0.0190 9 1980 M Robert 34281 0.0185 10 1980 M Joseph 30202 0.0163 # ... with 428,862 more rows
name_dist <- babynames %>% filter(year >= 1980 & sex == 'M') %>% count(name, wt=prop, sort=TRUE) %>% filter(n > .01) %>% mutate( Rank=row_number(), Index=rev(row_number()) )
ggplot(name_dist, aes(x=Index, y=n)) + geom_col() + geom_vline(xintercept=median(name_dist$Index), color='red')
l_names <- babynames %>% filter(year >= 1980 & sex == 'M' & str_detect(name, '^L|[EAIUO]l')) %>% count(name, sort=TRUE, wt=prop) %>% filter(n > .0001) %>% mutate(Rank=row_number(), Index=rev(row_number())) l_names
# A tibble: 587 x 4 name n Rank Index <chr> <dbl> <int> <int> 1 Alexander 0.274 1 587 2 Logan 0.152 2 586 3 Elijah 0.127 3 585 4 Luke 0.116 4 584 5 Lucas 0.112 5 583 6 Luis 0.102 6 582 7 Alex 0.0979 7 581 8 Liam 0.0935 8 580 9 Landon 0.0711 9 579 10 Levi 0.0599 10 578 # ... with 577 more rows
ggplot(l_names, aes(x=Index, y=n)) + geom_col() + geom_vline(aes(xintercept=Index, color=name), data=filter(l_names, name %in% c('Lev', 'Elliott')))
Lev
🍼💩😴
hatch <- read_csv( # use system grep to remove section rows pipe('grep -v === data/hatchbaby_20191026.csv'), # specify column types col_types=list( `Start Time`=col_datetime(format='%m/%d/%Y %I:%M %p') , `End Time`=col_datetime(format='%m/%d/%Y %I:%M %p') , Info=col_character() , Percentile=col_character() ) ) %>% filter(Activity %in% c('SLEEP', 'FEEDING')) %>% mutate_at(vars(`Start Time`, `End Time`), ~force_tz(time=., tzone='America/New_York'))
hatch <- hatch %>% duplicates(index=`Start Time`, key=Activity) %>% group_by(`Start Time`, Activity) %>% filter(Duration == max(Duration) | Activity != 'FEEDING') %>% filter(Info != 'Both' | Activity != ' DIAPER') %>% ungroup() %>% distinct(`Start Time`, Activity, .keep_all=TRUE) %>% select(`Start Time`, Activity, Duration) %>% anti_join(x=hatch, y=., by=c('Start Time'='Start Time', 'Activity'='Activity')) %>% as_tsibble(key=Activity, index=`Start Time`, regular=TRUE) %>% mutate_at( .vars=vars(`End Time`, `Start Time`), ~if_else( `Start Time` >= '2019-08-05' & `Start Time` <= '2019-08-11', .x - 9.5*60*60, .x ) ) %>% filter_index('2019-04-03' ~ '2019-10-25')
active <- hatch %>% select(`Start Time`, `End Time`, Activity) %>% mutate(Active=TRUE) %>% fill_gaps() %>% fill(`End Time`, .direction='down') %>% mutate( Active=`Start Time` <= `End Time`, Time=factor(str_extract(`Start Time`, '\\d{2}:\\d{2}')), Date=date(`Start Time`) )
active %>% filter(Activity == 'SLEEP') %>% ggplot(aes(x=Date, y=forcats::fct_rev(Time), fill=Active)) + geom_tile() + scale_fill_manual(values=c('#F5F5DC', '#39B4ED')) + scale_y_discrete(breaks=c('04:00', '08:00', '12:00', '16:00', '20:00')) + theme_minimal() + labs(x=NULL, y=NULL) + theme(panel.background=element_blank(), panel.grid=element_blank(), legend.position='none', axis.ticks=element_blank() )
active_wide <- active %>% select(Date, Time, Activity, Active, `Start Time`) %>% pivot_wider( id_cols=c(`Start Time`, Date, Time), names_from=c(Activity), values_from=Active) %>% replace_na(replace=list(FEEDING=FALSE, SLEEP=FALSE)) %>% mutate( Activity=case_when(FEEDING ~ 'Feeding', SLEEP ~ 'Sleeping', TRUE ~ 'Other') ) ggplot(active_wide, aes(x=Date, y=forcats::fct_rev(Time), fill=Activity)) + geom_tile() + scale_fill_manual(values=c(Other='#F5F5DC', Sleeping='#39B4ED', Feeding='green')) + scale_y_discrete(breaks=c('04:00', '08:00', '12:00', '16:00', '20:00')) + theme_minimal() + labs(x=NULL, y=NULL) + theme(panel.background=element_blank(), panel.grid=element_blank(), legend.position='none', axis.ticks=element_blank() )
daily <- hatch %>% as_tibble() %>% mutate(Duration=as.numeric(`End Time` - `Start Time`)/60, Marker=1) %>% select(`Start Time`, Activity, Amount, Duration, Marker) %>% right_join(active %>% as_tibble(), by=c('Start Time'='Start Time', 'Activity'='Activity')) %>% fill(Amount, .direction='down') %>% replace_na(replace=list(Marker=0, Duration=0)) %>% as_tsibble(index=`Start Time`, key=Activity) %>% group_by_key() %>% index_by(Date=~as.Date(.)) %>% summarize(Duration=sum(Duration), Count=sum(Marker)) %>% mutate(Avg=Duration/Count) daily_wide <- daily %>% pivot_wider(id_cols=Date, names_from=Activity, values_from=c(Duration, Count, Avg)) %>% as_tsibble(index=Date) %>% rename(feeding_time=Duration_FEEDING, feeding_num=Count_FEEDING, sleep_time=Duration_SLEEP, sleep_num=Count_SLEEP, feeding_avg=Avg_FEEDING, sleep_avg=Avg_SLEEP)
age <- daily %>% as_tibble() %>% filter(day(Date) == 29) %>% select(Date) %>% distinct() %>% mutate(Age=as.numeric(Date - ymd('20190329')) %/% 30)
daily %>% autoplot(Count) + labs(title='Number of Naps vs Feedings') + geom_vline(aes(xintercept=Date), data=age, linetype=2, color='grey50') + theme_classic()
daily %>% autoplot(Duration) + labs(title='Time Spent (Min) of Napping vs Eating') + geom_vline(aes(xintercept=Date), data=age, linetype=2, color='grey50') + theme_classic()
daily %>% autoplot(Avg) + labs(title='Average Time (Min) Napping vs Eating') + geom_vline(aes(xintercept=Date), data=age, linetype=2, color='grey50') + theme_classic()
train <- daily_wide %>% filter_index(. ~ '2019-09') test <- daily_wide %>% filter_index('2019-10')
sleeper <- train %>% model( mean=MEAN(sleep_avg), naive=NAIVE(sleep_avg), snaive=SNAIVE(sleep_avg), drift=RW(sleep_avg ~ drift()), ets=ETS(sleep_avg), arima=ARIMA(sleep_avg), dyn=ARIMA(sleep_avg ~ feeding_avg) )
sleeper %>% forecast(test) %>% accuracy(test) %>% arrange(RMSE)
# A tibble: 7 x 9 .model .type ME RMSE MAE MPE MAPE MASE ACF1 <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> 1 dyn Test 28.4 92.1 75.0 -8.69 43.4 NaN -0.327 2 arima Test 29.3 92.3 75.0 -8.10 43.1 NaN -0.327 3 mean Test 47.6 99.0 78.7 2.91 41.1 NaN -0.321 4 snaive Test 38.9 99.8 84.1 -2.99 46.8 NaN -0.344 5 ets Test 49.9 101. 79.6 4.64 40.5 NaN -0.266 6 drift Test 82.8 120. 98.9 23.8 45.0 NaN -0.324 7 naive Test 85.9 122. 101. 25.7 45.3 NaN -0.321
sleeper %>% select(ets) %>% report()
Series: sleep_avg Model: ETS(M,A,N) Smoothing parameters: alpha = 0.08529311 beta = 0.005602186 Initial states: l b 149.6092 -1.155963 sigma^2: 0.15 AIC AICc BIC 2433.047 2433.390 2449.040
sleeper %>% select(dyn) %>% report()
Series: sleep_avg Model: LM w/ ARIMA(1,1,1) errors Coefficients: ar1 ma1 feeding_avg -0.4001 -0.8533 -0.303 s.e. 0.0744 0.0476 0.763 sigma^2 estimated as 3917: log likelihood=-999.52 AIC=2007.03 AICc=2007.26 BIC=2019.81
sleeper %>% select(arima) %>% report()
Series: sleep_avg Model: ARIMA(1,1,1) Coefficients: ar1 ma1 -0.4036 -0.8490 s.e. 0.0736 0.0462 sigma^2 estimated as 3899: log likelihood=-999.6 AIC=2005.19 AICc=2005.33 BIC=2014.77
sleeper %>% select(ets, arima, dyn) %>% forecast(test) %>% autoplot(daily_wide) + facet_wrap(~ .model, ncol=1)
measures_long <- daily_wide %>% pivot_longer(cols=-Date, names_to='Measurement', values_to='y') %>% as_tsibble(index=Date, key=Measurement)
prophet_fable <- measures_long %>% model( prophet=fable.prophet::prophet(y ~ growth(changepoints=age$Date)) ) %>% mutate( Forecasts=purrr::map(prophet, ~ predict(.x$fit$model)), Changepoints=purrr::map( prophet, ~ .x$fit$model$changepoints[abs(.x$fit$model$params$delta) >= 0.01] ) )
prophet_fitted <- prophet_fable %>% mutate( plot_df=purrr::map2(.x=prophet, .y=Forecasts, ~ prophet:::df_for_plotting(.x$fit$model, .y), .id='ID') ) %>% as_tibble() %>% select(Measurement, plot_df) %>% unnest(plot_df) changes <- prophet_fable %>% as_tibble() %>% select(Measurement, Changes=Changepoints) %>% unnest(Changes)
ggplot(prophet_fitted, aes(x=ds, y=y)) + geom_point(shape=1, size=1) + geom_line(aes(y=yhat), color="#0072B2") + geom_line(aes(ds, trend), color='red') + geom_vline(aes(xintercept=as_datetime(Date)), data=age, linetype=3, color='blue') + geom_vline(aes(xintercept=Changes), data=changes, color='red', linetype=2, size=1) + facet_wrap(~ Measurement, scales='free_y') + theme(panel.grid.major=element_blank())
feeding_type <- hatch %>% filter(Activity == 'FEEDING') %>% mutate(Duration=`End Time` - `Start Time`) %>% select(`Start Time`, Amount, Duration) %>% mutate(Type=if_else(Amount == '0.00oz', 'Breast', 'Formula')) %>% mutate(Amount=as.numeric(str_replace(Amount, 'oz', ''))) %>% group_by(Type) %>% index_by(Date=as.Date(`Start Time`)) %>% summarize(Num=n(), Time=sum(Duration)) %>% fill_gaps(Num=0)
feeding_type %>% autoplot(Num) + geom_vline(aes(xintercept=Date), data=age, linetype=2, color='grey50') + theme_classic() + theme(legend.position='bottom')
feeding_type %>% model(arima=ARIMA(Num)) %>% augment() %>% autoplot(.fitted) + geom_vline(aes(xintercept=Date), data=age, linetype=3, color='blue') + theme_classic() + theme(legend.position='bottom')
nap_length <- hatch %>% filter(Activity == 'SLEEP') %>% filter_index('2019-06' ~ .) %>% select(`Start Time`, `End Time`) %>% mutate(Duration=as.numeric(`End Time` - `Start Time`)/60) %>% mutate(Date=as.Date(`Start Time`)) %>% filter(hour(`Start Time`) >= 7 & hour(`Start Time`) < 18) %>% group_by(Date) %>% mutate(NapNum=factor(1:n())) %>% ungroup() %>% filter(NapNum <= 3) %>% as_tsibble(key=NapNum, index=Date) %>% fill_gaps()
nap_length %>% autoplot(Duration) + facet_wrap( ~ NapNum, ncol=1)
nap_length %>% group_by_key() %>% index_by(Month=~yearmonth(.)) %>% summarize(Avg=mean(Duration, na.rm=TRUE)) %>% autoplot(Avg) + facet_wrap(~NapNum, ncol=1)
nap_length %>% group_by_key() %>% index_by(1) %>% summarize(Avg=mean(Duration, na.rm=TRUE), SD=sd(Duration, na.rm=TRUE)) %>% ggplot(aes(x=NapNum, y=Avg)) + geom_col(fill='lightblue') + geom_errorbar(aes(ymin=Avg-SD, ymax=Avg+SD), width=0.5, size=1.2) + geom_quasirandom(aes(y=Duration), data=nap_length, size=1, shape=1)
Thank You