What’s in a Name?

babynames %>% 
    filter(year >= 1980 & sex == 'M')
# A tibble: 428,872 x 5
    year sex   name            n   prop
   <dbl> <chr> <chr>       <int>  <dbl>
 1  1980 M     Michael     68693 0.0370
 2  1980 M     Christopher 49092 0.0265
 3  1980 M     Jason       48173 0.0260
 4  1980 M     David       41923 0.0226
 5  1980 M     James       39325 0.0212
 6  1980 M     Matthew     37860 0.0204
 7  1980 M     Joshua      36060 0.0194
 8  1980 M     John        35279 0.0190
 9  1980 M     Robert      34281 0.0185
10  1980 M     Joseph      30202 0.0163
# ... with 428,862 more rows

name_dist <- babynames %>% 
    filter(year >= 1980 & sex == 'M') %>% 
    count(name, wt=prop, sort=TRUE) %>% 
    filter(n > .01) %>% 
    mutate(
        Rank=row_number(),
        Index=rev(row_number())
    )

ggplot(name_dist, aes(x=Index, y=n)) + 
    geom_col() + 
    geom_vline(xintercept=median(name_dist$Index), color='red')

l_names <- babynames %>% 
    filter(year >= 1980 & sex == 'M' & str_detect(name, '^L|[EAIUO]l')) %>% 
    count(name, sort=TRUE, wt=prop) %>% 
    filter(n > .0001) %>% 
    mutate(Rank=row_number(), Index=rev(row_number()))

l_names
# A tibble: 587 x 4
   name           n  Rank Index
   <chr>      <dbl> <int> <int>
 1 Alexander 0.274      1   587
 2 Logan     0.152      2   586
 3 Elijah    0.127      3   585
 4 Luke      0.116      4   584
 5 Lucas     0.112      5   583
 6 Luis      0.102      6   582
 7 Alex      0.0979     7   581
 8 Liam      0.0935     8   580
 9 Landon    0.0711     9   579
10 Levi      0.0599    10   578
# ... with 577 more rows

  • Elliott
  • Lev

ggplot(l_names, aes(x=Index, y=n)) + 
    geom_col() + 
    geom_vline(aes(xintercept=Index, color=name), 
               data=filter(l_names, name %in% c('Lev', 'Elliott')))

Lev

Baby Arrives

🍼💩😴

hatch <- read_csv(
  # use system grep to remove section rows
  pipe('grep -v === data/hatchbaby_20191026.csv'),
  # specify column types
  col_types=list(
    `Start Time`=col_datetime(format='%m/%d/%Y %I:%M %p')
    , `End Time`=col_datetime(format='%m/%d/%Y %I:%M %p')
    , Info=col_character()
    , Percentile=col_character()
  )
) %>% 
  filter(Activity %in% c('SLEEP', 'FEEDING')) %>% 
  mutate_at(vars(`Start Time`, `End Time`), ~force_tz(time=., tzone='America/New_York'))

hatch <- hatch %>% 
    duplicates(index=`Start Time`, key=Activity) %>% 
    group_by(`Start Time`, Activity) %>% 
    filter(Duration == max(Duration) | Activity != 'FEEDING') %>% 
    filter(Info != 'Both' | Activity != ' DIAPER') %>% 
    ungroup() %>% 
    distinct(`Start Time`, Activity, .keep_all=TRUE) %>% 
    select(`Start Time`, Activity, Duration) %>% 
    anti_join(x=hatch, y=., by=c('Start Time'='Start Time', 'Activity'='Activity')) %>% 
    as_tsibble(key=Activity, index=`Start Time`, regular=TRUE) %>% 
    mutate_at(
        .vars=vars(`End Time`, `Start Time`),
        ~if_else(
            `Start Time` >= '2019-08-05' & `Start Time` <= '2019-08-11',
            .x - 9.5*60*60,
            .x
        )
    ) %>% 
  filter_index('2019-04-03' ~ '2019-10-25')

active <- hatch %>% 
  select(`Start Time`, `End Time`, Activity) %>% 
  mutate(Active=TRUE) %>% 
  fill_gaps() %>% 
  fill(`End Time`, .direction='down') %>% 
  mutate(
    Active=`Start Time` <= `End Time`,
    Time=factor(str_extract(`Start Time`, '\\d{2}:\\d{2}')),
    Date=date(`Start Time`)
  )

active %>% 
  filter(Activity == 'SLEEP') %>% 
  ggplot(aes(x=Date, y=forcats::fct_rev(Time), fill=Active)) + 
  geom_tile() + 
  scale_fill_manual(values=c('#F5F5DC', '#39B4ED')) + 
  scale_y_discrete(breaks=c('04:00', '08:00', '12:00', '16:00', '20:00')) +
  theme_minimal() + 
  labs(x=NULL, y=NULL) + 
  theme(panel.background=element_blank(), panel.grid=element_blank(),
        legend.position='none',
        axis.ticks=element_blank()
  )

active_wide <- active %>% 
  select(Date, Time, Activity, Active, `Start Time`) %>% 
  pivot_wider(
    id_cols=c(`Start Time`, Date, Time), 
    names_from=c(Activity), values_from=Active) %>% 
  replace_na(replace=list(FEEDING=FALSE, SLEEP=FALSE)) %>% 
  mutate(
    Activity=case_when(FEEDING ~ 'Feeding', SLEEP ~ 'Sleeping', TRUE ~ 'Other')
  )

ggplot(active_wide, aes(x=Date, y=forcats::fct_rev(Time), fill=Activity)) + 
  geom_tile() + 
  scale_fill_manual(values=c(Other='#F5F5DC', Sleeping='#39B4ED', Feeding='green')) +
  scale_y_discrete(breaks=c('04:00', '08:00', '12:00', '16:00', '20:00')) + 
  theme_minimal() + 
  labs(x=NULL, y=NULL) + 
  theme(panel.background=element_blank(), panel.grid=element_blank(),
        legend.position='none',
        axis.ticks=element_blank()
  )

Sleep and Eating

daily <- hatch %>% as_tibble() %>% 
  mutate(Duration=as.numeric(`End Time` - `Start Time`)/60, Marker=1) %>% 
  select(`Start Time`, Activity, Amount, Duration, Marker) %>% 
  right_join(active %>% as_tibble(), 
             by=c('Start Time'='Start Time', 'Activity'='Activity')) %>% 
  fill(Amount, .direction='down') %>% 
  replace_na(replace=list(Marker=0, Duration=0)) %>% 
  as_tsibble(index=`Start Time`, key=Activity) %>% 
  group_by_key() %>% 
  index_by(Date=~as.Date(.)) %>% 
  summarize(Duration=sum(Duration), Count=sum(Marker)) %>% 
  mutate(Avg=Duration/Count)

daily_wide <- daily %>% 
    pivot_wider(id_cols=Date, 
                names_from=Activity, values_from=c(Duration, Count, Avg)) %>% 
    as_tsibble(index=Date) %>% 
    rename(feeding_time=Duration_FEEDING, feeding_num=Count_FEEDING,
           sleep_time=Duration_SLEEP, sleep_num=Count_SLEEP, 
           feeding_avg=Avg_FEEDING, sleep_avg=Avg_SLEEP)

age <- daily %>% 
  as_tibble() %>% 
  filter(day(Date) == 29) %>% 
  select(Date) %>% 
  distinct() %>% 
  mutate(Age=as.numeric(Date - ymd('20190329')) %/% 30)

daily %>% 
  autoplot(Count) + 
  labs(title='Number of Naps vs Feedings') + 
  geom_vline(aes(xintercept=Date), data=age, linetype=2, color='grey50') + 
  theme_classic()

daily %>% 
  autoplot(Duration) + 
  labs(title='Time Spent (Min) of Napping vs Eating') +
  geom_vline(aes(xintercept=Date), data=age, linetype=2, color='grey50') + 
  theme_classic()

daily %>% 
  autoplot(Avg) + 
  labs(title='Average Time (Min) Napping vs Eating') + 
  geom_vline(aes(xintercept=Date), data=age, linetype=2, color='grey50') + 
  theme_classic()

train <- daily_wide %>% filter_index(. ~ '2019-09')
test <- daily_wide %>% filter_index('2019-10')

sleeper <- train %>% 
    model(
        mean=MEAN(sleep_avg),
        naive=NAIVE(sleep_avg),
        snaive=SNAIVE(sleep_avg),
        drift=RW(sleep_avg ~ drift()),
        ets=ETS(sleep_avg),
        arima=ARIMA(sleep_avg),
        dyn=ARIMA(sleep_avg ~ feeding_avg)
    )

sleeper %>% 
  forecast(test) %>% 
  accuracy(test) %>% 
  arrange(RMSE)
# A tibble: 7 x 9
  .model .type    ME  RMSE   MAE   MPE  MAPE  MASE   ACF1
  <chr>  <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>
1 dyn    Test   28.4  92.1  75.0 -8.69  43.4   NaN -0.327
2 arima  Test   29.3  92.3  75.0 -8.10  43.1   NaN -0.327
3 mean   Test   47.6  99.0  78.7  2.91  41.1   NaN -0.321
4 snaive Test   38.9  99.8  84.1 -2.99  46.8   NaN -0.344
5 ets    Test   49.9 101.   79.6  4.64  40.5   NaN -0.266
6 drift  Test   82.8 120.   98.9 23.8   45.0   NaN -0.324
7 naive  Test   85.9 122.  101.  25.7   45.3   NaN -0.321

sleeper %>% select(ets) %>% report()
Series: sleep_avg 
Model: ETS(M,A,N) 
  Smoothing parameters:
    alpha = 0.08529311 
    beta  = 0.005602186 

  Initial states:
        l         b
 149.6092 -1.155963

  sigma^2:  0.15

     AIC     AICc      BIC 
2433.047 2433.390 2449.040 

sleeper %>% select(dyn) %>% report()
Series: sleep_avg 
Model: LM w/ ARIMA(1,1,1) errors 

Coefficients:
          ar1      ma1  feeding_avg
      -0.4001  -0.8533       -0.303
s.e.   0.0744   0.0476        0.763

sigma^2 estimated as 3917:  log likelihood=-999.52
AIC=2007.03   AICc=2007.26   BIC=2019.81

sleeper %>% select(arima) %>% report()
Series: sleep_avg 
Model: ARIMA(1,1,1) 

Coefficients:
          ar1      ma1
      -0.4036  -0.8490
s.e.   0.0736   0.0462

sigma^2 estimated as 3899:  log likelihood=-999.6
AIC=2005.19   AICc=2005.33   BIC=2014.77

sleeper %>% select(ets, arima, dyn) %>% 
  forecast(test) %>% autoplot(daily_wide) + 
  facet_wrap(~ .model, ncol=1)

Regime Changes

measures_long <- daily_wide %>% 
  pivot_longer(cols=-Date, names_to='Measurement', values_to='y') %>% 
  as_tsibble(index=Date, key=Measurement)

prophet_fable <- measures_long %>% 
  model(
    prophet=fable.prophet::prophet(y ~ growth(changepoints=age$Date))
  ) %>% 
  mutate(
    Forecasts=purrr::map(prophet, ~ predict(.x$fit$model)),
    Changepoints=purrr::map(
      prophet, 
      ~ .x$fit$model$changepoints[abs(.x$fit$model$params$delta) >= 0.01]
    )
  )

prophet_fitted <- prophet_fable %>% 
  mutate(
    plot_df=purrr::map2(.x=prophet, .y=Forecasts, 
                        ~ prophet:::df_for_plotting(.x$fit$model, .y), .id='ID')
  ) %>% 
  as_tibble() %>% 
  select(Measurement, plot_df) %>% 
  unnest(plot_df)

changes <- prophet_fable %>% 
  as_tibble() %>% 
  select(Measurement, Changes=Changepoints) %>% 
  unnest(Changes)

ggplot(prophet_fitted, aes(x=ds, y=y)) + geom_point(shape=1, size=1) + 
  geom_line(aes(y=yhat), color="#0072B2") + geom_line(aes(ds, trend), color='red') +
  geom_vline(aes(xintercept=as_datetime(Date)), data=age, linetype=3, color='blue') +
  geom_vline(aes(xintercept=Changes), data=changes, color='red', linetype=2, size=1) +
  facet_wrap(~ Measurement, scales='free_y') + theme(panel.grid.major=element_blank())

Type of Milk

feeding_type <- hatch %>% 
  filter(Activity == 'FEEDING') %>% 
  mutate(Duration=`End Time` - `Start Time`) %>% 
  select(`Start Time`, Amount, Duration) %>% 
  mutate(Type=if_else(Amount == '0.00oz', 'Breast', 'Formula')) %>% 
  mutate(Amount=as.numeric(str_replace(Amount, 'oz', ''))) %>% 
  group_by(Type) %>% 
  index_by(Date=as.Date(`Start Time`)) %>% 
  summarize(Num=n(), Time=sum(Duration)) %>% 
  fill_gaps(Num=0)

feeding_type %>% 
  autoplot(Num) +
  geom_vline(aes(xintercept=Date), data=age, linetype=2, color='grey50') + 
  theme_classic() + theme(legend.position='bottom')

feeding_type %>% 
  model(arima=ARIMA(Num)) %>% augment() %>% autoplot(.fitted) + 
  geom_vline(aes(xintercept=Date), data=age, linetype=3, color='blue') + 
  theme_classic() + theme(legend.position='bottom')

Nap Length

nap_length <- hatch %>% 
  filter(Activity == 'SLEEP') %>% 
  filter_index('2019-06' ~ .) %>% 
  select(`Start Time`, `End Time`) %>% 
  mutate(Duration=as.numeric(`End Time` - `Start Time`)/60) %>% 
  mutate(Date=as.Date(`Start Time`)) %>% 
  filter(hour(`Start Time`) >= 7 & hour(`Start Time`) < 18) %>% 
  group_by(Date) %>% 
  mutate(NapNum=factor(1:n())) %>% 
  ungroup() %>% 
  filter(NapNum <= 3) %>% 
  as_tsibble(key=NapNum, index=Date) %>% 
  fill_gaps()

nap_length %>% autoplot(Duration) + facet_wrap( ~ NapNum, ncol=1)

nap_length %>% group_by_key() %>% 
  index_by(Month=~yearmonth(.)) %>% 
  summarize(Avg=mean(Duration, na.rm=TRUE)) %>% 
  autoplot(Avg) + facet_wrap(~NapNum, ncol=1)

nap_length %>% group_by_key() %>% index_by(1) %>% 
  summarize(Avg=mean(Duration, na.rm=TRUE), SD=sd(Duration, na.rm=TRUE)) %>% 
  ggplot(aes(x=NapNum, y=Avg)) + geom_col(fill='lightblue') +
  geom_errorbar(aes(ymin=Avg-SD, ymax=Avg+SD), width=0.5, size=1.2) + 
  geom_quasirandom(aes(y=Duration), data=nap_length, size=1, shape=1)

What Did We Learn?

  • Naming things is hard
  • Patterns change with each month
  • Sleeping better over time
  • Eating barely predicts sleep
  • Naps most in morning
  • Loves Pizza

Thank You