# Mean hits (H) per team using the formula interface of aggregate();
# only the first ten teams are shown.
head(
  aggregate(H ~ teamID, data = Batting, FUN = mean, na.rm = TRUE),
  n = 10
)
teamID H 1 ARI 31.44842 2 ATL 34.29728 3 BAL 43.39115 4 BOS 44.10159 5 CHA 42.56100 6 CHN 39.27246 7 CIN 38.98270 8 CLE 42.39974 9 COL 32.42434 10 DET 44.38779
# Mean hits per team as a named vector: tapply() applies mean() to the
# hits of each teamID, dropping missing values.
tapply(
  X = Batting$H,
  INDEX = Batting$teamID,
  FUN = mean,
  na.rm = TRUE
)
ARI ATL BAL BOS CHA CHN CIN CLE 31.44842 34.29728 43.39115 44.10159 42.56100 39.27246 38.98270 42.39974 COL DET HOU KCA LAA LAN MIA MIL 32.42434 44.38779 34.30139 47.20915 38.96901 34.73060 26.36735 32.17872 MIN NYA NYN OAK PHI PIT SDN SEA 47.86924 44.09629 32.02503 41.54868 38.69329 38.92459 31.98062 46.84478 SFN SLN TBA TEX TOR WAS 35.19410 38.18787 39.22337 48.04941 49.83239 34.17968
library(plyr)

# Split Batting by teamID, summarise each piece to its mean hits
# (missing values dropped), and show the first ten rows.
ddply(
  Batting,
  "teamID",
  summarize,
  Hits = mean(H, na.rm = TRUE)
) %>%
  head(n = 10)
teamID Hits 1 ARI 31.44842 2 ATL 34.29728 3 BAL 43.39115 4 BOS 44.10159 5 CHA 42.56100 6 CHN 39.27246 7 CIN 38.98270 8 CLE 42.39974 9 COL 32.42434 10 DET 44.38779
library(data.table)

# Copy Batting into a data.table so the [i, j, by] syntax is available.
batDT <- data.table(Batting)

# Per-team mean of hits, missing values dropped; first ten groups shown.
head(
  batDT[, .(H = mean(H, na.rm = TRUE)), by = teamID],
  n = 10
)
teamID H 1: SFN 35.19410 2: CHN 39.27246 3: CHA 42.56100 4: BOS 44.10159 5: SEA 46.84478 6: NYA 44.09629 7: ATL 34.29728 8: BAL 43.39115 9: NYN 32.02503 10: LAN 34.73060
library(dplyr)

# Group by team, reduce each group to its mean hits (missing values
# dropped), and keep the first ten rows.
Batting %>%
  group_by(teamID) %>%
  summarise(H = mean(H, na.rm = TRUE)) %>%
  head(n = 10)
Source: local data frame [10 x 2] teamID H 1 ARI 31.44842 2 ATL 34.29728 3 BAL 43.39115 4 BOS 44.10159 5 CHA 42.56100 6 CHN 39.27246 7 CIN 38.98270 8 CLE 42.39974 9 COL 32.42434 10 DET 44.38779
// Group-wise mean of a numeric column.
//
// DF     data frame holding both columns
// var    name of the numeric column to average
// id     name of the character column that defines the groups
// na_rm  when true, missing values (NA/NaN) in `var` are skipped, like
//        mean(..., na.rm = TRUE) in R; the default (false) keeps the
//        previous behaviour, where any missing value makes the whole
//        group's mean missing
//
// Returns a numeric vector with one mean per distinct value of `id`,
// named by group, in the order produced by unique(). A group with no
// usable values yields NaN (0 / 0).
NumericVector agger(DataFrame DF, std::string var, std::string id, bool na_rm = false){
    NumericVector numbers = DF[var];
    CharacterVector groupers = DF[id];
    CharacterVector onlyThese = unique(groupers);

    // Running (sum, count) per group, keyed by the group's string SEXP
    // exactly as before. Keeping running totals avoids buffering every
    // value of every group just to add them up afterwards.
    std::map<SEXP, std::pair<double, R_xlen_t> > totals;

    const R_xlen_t n = groupers.size();
    for (R_xlen_t i = 0; i < n; ++i) {
        SEXP key = groupers[i];
        // operator[] value-initialises a new entry to (0.0, 0), so every
        // group is registered even when all of its values get skipped.
        std::pair<double, R_xlen_t>& slot = totals[key];
        const double value = numbers[i];
        if (na_rm && NumericVector::is_na(value)) {
            continue;
        }
        slot.first += value;
        ++slot.second;
    }

    const R_xlen_t nGroups = onlyThese.size();
    NumericVector calcResults(nGroups);
    for (R_xlen_t i = 0; i < nGroups; ++i) {
        SEXP key = onlyThese[i];
        const std::pair<double, R_xlen_t>& slot = totals[key];
        calcResults[i] = slot.first / slot.second;
    }

    calcResults.names() = onlyThese;
    return calcResults;
}
SEA MIA DET BAL BOS CIN COL MIL NA 26.36735 NA NA NA 38.98270 32.42434 32.17872
library(doParallel)

# Start one worker per detected core and register the cluster as the
# backend used by %dopar%.
cl <- makeCluster(detectCores())
registerDoParallel(cl)

# One task per team: each returns a one-row data frame with that team's
# mean hits (missing values dropped); .combine = rbind stacks the rows.
# `finally` shuts the workers down even when a task fails, so an error
# no longer leaves the worker processes running.
hAvg <- tryCatch(
  foreach(team = unique(Batting$teamID), .combine = rbind) %dopar%
    data.frame(
      teamID = team,
      H = mean(Batting[Batting$teamID == team, "H"], na.rm = TRUE)
    ),
  finally = stopCluster(cl)
)

hAvg %>% head(8)
teamID H 1 SFN 35.19410 2 CHN 39.27246 3 CHA 42.56100 4 BOS 44.10159 5 SEA 46.84478 6 NYA 44.09629 7 ATL 34.29728 8 BAL 43.39115
# Start one worker per detected core and register the cluster as the
# parallel backend.
cl <- makeCluster(detectCores())
registerDoParallel(cl)

# Per-team mean hits via ddply, with the pieces processed in parallel
# (.parallel = TRUE). `finally` stops the workers even if ddply() fails,
# so an error no longer leaves the worker processes running.
hAvg2 <- tryCatch(
  ddply(
    Batting,
    "teamID",
    function(x) c(H = mean(x$H, na.rm = TRUE)),
    .parallel = TRUE
  ),
  finally = stopCluster(cl)
)

hAvg2 %>% head(10)
teamID H 1 ARI 31.44842 2 ATL 34.29728 3 BAL 43.39115 4 BOS 44.10159 5 CHA 42.56100 6 CHN 39.27246 7 CIN 38.98270 8 CLE 42.39974 9 COL 32.42434 10 DET 44.38779
library(bigmemory)
library(bigalgebra)

# 5 x 5 matrix of doubles created with a backing file named "A".
A <- big.matrix(nrow = 5, ncol = 5, type = "double", backingfile = "A")

# Fill every cell from the integers 1 to 25.
A[, ] <- seq_len(25)

# Add the matrix to itself and extract the full result with [, ].
(A + A)[, ]
[,1] [,2] [,3] [,4] [,5] [1,] 2 12 22 32 42 [2,] 4 14 24 34 44 [3,] 6 16 26 36 46 [4,] 8 18 28 38 48 [5,] 10 20 30 40 50
library(ffbase)
library(biglm)

# Convert the diamonds data to an ffdf object.
diaFF <- as.ffdf(diamonds)

# Regress price on carat and cut, fitting in chunks of 10000.
modff1 <- bigglm(
  price ~ carat + cut,
  data = diaFF,
  chunksize = 10000
)

summary(modff1)
Large data regression model: bigglm(price ~ carat + cut, data = diaFF, chunksize = 10000) Sample size = 53940 Coef (95% CI) SE p (Intercept) -2701.3760 -2732.2382 -2670.5139 15.4311 0 carat 7871.0821 7843.1229 7899.0414 13.9796 0 cut.L 1239.8004 1187.6004 1292.0005 26.1000 0 cut.Q -528.5978 -574.8626 -482.3330 23.1324 0 cut.C 367.9099 327.4816 408.3383 20.2142 0 cut^4 74.5943 42.1151 107.0734 16.2396 0
# Open the SQLite file batting.sqlite3 located under dataDir.
batDBSource <- src_sqlite(path = file.path(dataDir, "batting.sqlite3"))

# Reference its "Batting" table through dplyr.
batDB <- tbl(batDBSource, "Batting")

batDB
Source: sqlite 3.8.6 [../../data/batting.sqlite3] From: Batting [73,444 x 24] playerID yearID stint teamID lgID G G_batting AB R H X2B X3B HR RBI 1 aardsda01 2004 1 SFN NL 11 11 0 0 0 0 0 0 0 2 aardsda01 2006 1 CHN NL 45 43 2 0 0 0 0 0 0 3 aardsda01 2007 1 CHA AL 25 2 0 0 0 0 0 0 0 4 aardsda01 2008 1 BOS AL 47 5 1 0 0 0 0 0 0 5 aardsda01 2009 1 SEA AL 73 3 0 0 0 0 0 0 0 6 aardsda01 2010 1 SEA AL 53 4 0 0 0 0 0 0 0 7 aardsda01 2012 1 NYA AL 1 NA NA NA NA NA NA NA NA 8 aaronha01 1966 1 ATL NL 158 158 603 117 168 23 1 44 127 9 aaronha01 1967 1 ATL NL 155 155 600 113 184 37 3 39 109 10 aaronha01 1968 1 ATL NL 160 160 606 84 174 33 4 29 86 .. ... ... ... ... ... ... ... ... ... ... ... ... .. ... Variables not shown: SB (int), CS (int), BB (int), SO (int), IBB (int), HBP (int), SH (int), SF (int), GIDP (int), G_old (int)
# Per-team mean of H computed on batDB; note that no na.rm argument is
# passed to mean() here.
summarize(
  group_by(batDB, teamID),
  H = mean(H)
)
Source: sqlite 3.8.6 [../../data/batting.sqlite3] From: <derived table> [?? x 2] teamID H 1 ARI 31.44842 2 ATL 34.29728 3 BAL 43.39115 4 BOS 44.10159 5 CHA 42.56100 6 CHN 39.27246 7 CIN 38.98270 8 CLE 42.39974 9 COL 32.42434 10 DET 44.38779 .. ... ...
library(scidb)

# Connect to the SciDB instance on localhost.
scidbconnect(host = "localhost")

# Reference the array named "Batting".
batSci <- scidb("Batting")

# Average H for each teamID; the aggregation is given as the expression
# string "avg(H) as H".
aggregate(
  batSci,
  H ~ teamID,
  "avg(H) as H"
)
teamID H 1 ARI 31.44842 2 ATL 34.29728 3 BAL NA 4 BOS NA 5 CHA NA 6 CHN 39.27246 7 CIN 38.98270 8 CLE NA 9 COL 32.42434 10 DET NA