5,013 Members

Performance

Big Data Hurdles

Solutions

Buy a bigger machine

Sampling

Compute more intelligently

Alternative Backends

Smarter Computing

For Example

Aggregation

Baseball Data

aggregate

# Formula interface: average hits (H) for each teamID; show the first ten rows
aggregate(H ~ teamID, data=Batting, FUN=mean, na.rm=TRUE) %>%
    head(n=10)

   teamID        H
1     ARI 31.44842
2     ATL 34.29728
3     BAL 43.39115
4     BOS 44.10159
5     CHA 42.56100
6     CHN 39.27246
7     CIN 38.98270
8     CLE 42.39974
9     COL 32.42434
10    DET 44.38779

tapply

# Average hits (H) within each teamID, returned as a named vector
tapply(X=Batting[["H"]], INDEX=Batting[["teamID"]], FUN=mean, na.rm=TRUE)

     ARI      ATL      BAL      BOS      CHA      CHN      CIN      CLE 
31.44842 34.29728 43.39115 44.10159 42.56100 39.27246 38.98270 42.39974 
     COL      DET      HOU      KCA      LAA      LAN      MIA      MIL 
32.42434 44.38779 34.30139 47.20915 38.96901 34.73060 26.36735 32.17872 
     MIN      NYA      NYN      OAK      PHI      PIT      SDN      SEA 
47.86924 44.09629 32.02503 41.54868 38.69329 38.92459 31.98062 46.84478 
     SFN      SLN      TBA      TEX      TOR      WAS 
35.19410 38.18787 39.22337 48.04941 49.83239 34.17968

plyr

library(plyr)
# Split Batting by teamID, average H in each piece, recombine into a data.frame
ddply(.data=Batting, .variables='teamID', .fun=summarize,
      Hits=mean(H, na.rm=TRUE)) %>%
    head(n=10)

   teamID     Hits
1     ARI 31.44842
2     ATL 34.29728
3     BAL 43.39115
4     BOS 44.10159
5     CHA 42.56100
6     CHN 39.27246
7     CIN 38.98270
8     CLE 42.39974
9     COL 32.42434
10    DET 44.38779

data.table

library(data.table)
# Copy Batting into a data.table so the grouped j-expression can be used
batDT <- data.table(Batting)
# Average H for each teamID; groups come back in order of first appearance
batDT[, list(H=mean(H, na.rm=TRUE)), by="teamID"] %>%
    head(n=10)

    teamID        H
 1:    SFN 35.19410
 2:    CHN 39.27246
 3:    CHA 42.56100
 4:    BOS 44.10159
 5:    SEA 46.84478
 6:    NYA 44.09629
 7:    ATL 34.29728
 8:    BAL 43.39115
 9:    NYN 32.02503
10:    LAN 34.73060

dplyr

library(dplyr)
# Group the rows by teamID, collapse each group to its average H,
# then keep the first ten groups
Batting %>%
    group_by(teamID) %>%
    summarize(H=mean(H, na.rm=TRUE)) %>%
    head(n=10)

Source: local data frame [10 x 2]

   teamID        H
1     ARI 31.44842
2     ATL 34.29728
3     BAL 43.39115
4     BOS 44.10159
5     CHA 42.56100
6     CHN 39.27246
7     CIN 38.98270
8     CLE 42.39974
9     COL 32.42434
10    DET 44.38779

Rcpp

// Computes the mean of column `var` within each group of column `id`.
//
// DF  : data frame holding both columns
// var : name of the numeric column to average
// id  : name of the character column that defines the groups
//
// Returns a numeric vector with one mean per distinct group, named by group
// and ordered as unique() returns the groups. Missing values are not
// filtered out, so they propagate into the mean of the group they belong to.
NumericVector agger(DataFrame DF, std::string var, std::string id){
  NumericVector values = DF[var];
  CharacterVector keys = DF[id];
  CharacterVector groups = unique(keys);
  NumericVector means(groups.size());

  // Bucket every value under its group key.
  std::map<SEXP, std::vector<double> > buckets;
  int total = keys.size();
  for (int row = 0; row < total; ++row) {
    buckets[keys[row]].push_back(values[row]);
  }

  // Average each bucket, following the order of the distinct groups.
  for (int g = 0; g < groups.size(); ++g) {
    std::vector<double>& bucket = buckets[groups[g]];
    means[g] = accumulate(bucket.begin(), bucket.end(), 0.0) / bucket.size();
  }

  means.names() = groups;
  return means;
}

     SEA      MIA      DET      BAL      BOS      CIN      COL      MIL 
      NA 26.36735       NA       NA       NA 38.98270 32.42434 32.17872

Let's Try Parallel

foreach

library(doParallel)
# Start one worker per detected core and register it as the %dopar% backend
cl <- makeCluster(detectCores())
registerDoParallel(cl)
# One task per distinct team: average that team's H, then rbind the
# one-row data.frames into a single result
hAvg <- foreach(tm=unique(Batting$teamID), .combine=rbind) %dopar% {
    teamRows <- Batting$teamID == tm
    data.frame(teamID=tm, H=mean(Batting[teamRows, 'H'], na.rm=TRUE))
}
# Shut the workers down once the results are collected
stopCluster(cl)
hAvg %>%
    head(n=8)

  teamID        H
1    SFN 35.19410
2    CHN 39.27246
3    CHA 42.56100
4    BOS 44.10159
5    SEA 46.84478
6    NYA 44.09629
7    ATL 34.29728
8    BAL 43.39115

plyr parallel

# Start one worker per detected core and register it as the parallel backend
cl <- makeCluster(detectCores())
registerDoParallel(cl)
# Same split-apply-combine as before, with the per-team pieces handed to the
# registered backend via .parallel=TRUE
hAvg2 <- ddply(.data=Batting, .variables='teamID',
               .fun=function(teamRows) c(H=mean(teamRows$H, na.rm=TRUE)),
               .parallel=TRUE)
# Shut the workers down once the results are collected
stopCluster(cl)
hAvg2 %>%
    head(n=10)

   teamID        H
1     ARI 31.44842
2     ATL 34.29728
3     BAL 43.39115
4     BOS 44.10159
5     CHA 42.56100
6     CHN 39.27246
7     CIN 38.98270
8     CLE 42.39974
9     COL 32.42434
10    DET 44.38779

Benchmark

More Benchmarks

Backends

bigmemory

library(bigmemory)
library(bigalgebra)
# Create a 5x5 big.matrix of doubles tied to a backing file named "A"
A <- big.matrix(5, 5, type="double", backingfile="A")
# Fill every cell with the values 1 through 25
A[, ] <- 1:25
# Add the matrix to itself and pull the full result back with [, ]
# NOTE(review): presumably the `+` method for big.matrix comes from
# bigalgebra -- confirm
(A + A)[, ]

     [,1] [,2] [,3] [,4] [,5]
[1,]    2   12   22   32   42
[2,]    4   14   24   34   44
[3,]    6   16   26   36   46
[4,]    8   18   28   38   48
[5,]   10   20   30   40   50

ff

library(ffbase)
library(biglm)
# Convert the diamonds data to an ffdf
# NOTE(review): `diamonds` is not defined in this snippet; presumably the
# ggplot2 dataset loaded elsewhere -- confirm
diaFF <- as.ffdf(diamonds)
# Fit price on carat and cut, passing the data through in chunks of 10000
modff1 <- bigglm(price ~ carat + cut, data=diaFF, chunksize=10000)
# Print the coefficient table for the fitted model
summary(modff1)

Large data regression model: bigglm(price ~ carat + cut, data = diaFF, chunksize = 10000)
Sample size =  53940 
                  Coef       (95%        CI)      SE p
(Intercept) -2701.3760 -2732.2382 -2670.5139 15.4311 0
carat        7871.0821  7843.1229  7899.0414 13.9796 0
cut.L        1239.8004  1187.6004  1292.0005 26.1000 0
cut.Q        -528.5978  -574.8626  -482.3330 23.1324 0
cut.C         367.9099   327.4816   408.3383 20.2142 0
cut^4          74.5943    42.1151   107.0734 16.2396 0

dplyr with databases

dplyr

# Open the SQLite file batting.sqlite3 located under dataDir
# (dataDir is defined outside this snippet)
batDBSource <- src_sqlite(file.path(dataDir, "batting.sqlite3"))
# Reference the "Batting" table in that source
batDB <- tbl(batDBSource, "Batting")
# Print the table reference
batDB

Source: sqlite 3.8.6 [../../data/batting.sqlite3]
From: Batting [73,444 x 24]

    playerID yearID stint teamID lgID   G G_batting  AB   R   H X2B X3B HR RBI
1  aardsda01   2004     1    SFN   NL  11        11   0   0   0   0   0  0   0
2  aardsda01   2006     1    CHN   NL  45        43   2   0   0   0   0  0   0
3  aardsda01   2007     1    CHA   AL  25         2   0   0   0   0   0  0   0
4  aardsda01   2008     1    BOS   AL  47         5   1   0   0   0   0  0   0
5  aardsda01   2009     1    SEA   AL  73         3   0   0   0   0   0  0   0
6  aardsda01   2010     1    SEA   AL  53         4   0   0   0   0   0  0   0
7  aardsda01   2012     1    NYA   AL   1        NA  NA  NA  NA  NA  NA NA  NA
8  aaronha01   1966     1    ATL   NL 158       158 603 117 168  23   1 44 127
9  aaronha01   1967     1    ATL   NL 155       155 600 113 184  37   3 39 109
10 aaronha01   1968     1    ATL   NL 160       160 606  84 174  33   4 29  86
..       ...    ...   ...    ...  ... ...       ... ... ... ... ... ... .. ...
Variables not shown: SB (int), CS (int), BB (int), SO (int), IBB (int), HBP
  (int), SH (int), SF (int), GIDP (int), G_old (int)

dplyr

# Same grouped average as the local dplyr example, applied to the
# database-backed table; no na.rm argument is passed here
# NOTE(review): presumably translated to SQL and evaluated by SQLite -- confirm
batDB %>% group_by(teamID) %>% summarize(H=mean(H))

Source: sqlite 3.8.6 [../../data/batting.sqlite3]
From: <derived table> [?? x 2]

   teamID        H
1     ARI 31.44842
2     ATL 34.29728
3     BAL 43.39115
4     BOS 44.10159
5     CHA 42.56100
6     CHN 39.27246
7     CIN 38.98270
8     CLE 42.39974
9     COL 32.42434
10    DET 44.38779
..    ...      ...

SciDB

library(scidb)
# Connect to the SciDB instance on localhost
scidbconnect(host="localhost")
# Reference the SciDB array named "Batting"
batSci <- scidb("Batting")
# Average H for each teamID using the aggregation expression "avg(H) as H"
# NOTE(review): presumably evaluated inside SciDB rather than in R -- confirm
aggregate(batSci, H ~ teamID, "avg(H) as H")

   teamID        H
1     ARI 31.44842
2     ATL 34.29728
3     BAL       NA
4     BOS       NA
5     CHA       NA
6     CHN 39.27246
7     CIN 38.98270
8     CLE       NA
9     COL 32.42434
10    DET       NA

Computing

data.table
dplyr
Rcpp

Backends

dplyr backed by database
bigmemory
ff
SciDB

Jared P. Lander

Chief Data Scientist of Lander Analytics
Author of R for Everyone
Adjunct Professor at Columbia University
Organizer of New York Open Statistical Programming (The R) Meetup
Website: http://www.jaredlander.com

5,013 Members

Performance

For Example

Baseball Data

aggregate

tapply

plyr

data.table

dplyr

Rcpp

Rcpp

foreach

plyr parallel

Benchmark

More Benchmarks

bigmemory

bigmemory

ff

ff

dplyr

dplyr

dplyr

dplyr

SciDB

SciDB

Computing

Backends

Jared P. Lander

The Tools