data.table in 5 minutes

Steph Locke

2017-04-20

data.table

data.table basics

| Task | How |
|---|---|
| Read CSV | `irisDT <- fread("iris.csv")` |
| Return everything | `irisDT` or `irisDT[ ]` |
| Select columns | `irisDT[ , .(Sepal.Length, Sepal.Width) ]` |
| Add or update column | `irisDT[,Sepal.Area:=Sepal.Length*Sepal.Width]` |
| Restrict rows | `irisDT[ Sepal.Length >=5 , ]` |
| Aggregate | `irisDT[ , mean(Sepal.Length)]` |
| Aggregate by group | `irisDT[ , mean(Sepal.Length) , Species ]` |
| Count | `irisDT[ , .N ]` |

data.table awesomeness - IO

qRead<-fread("sample.csv")
## 
Read 76.9% of 13000 rows
Read 13000 rows and 13001 (of 13001) columns from 0.315 GB file in 00:00:13
system.time(
fwrite(qRead,"sample.csv"))
## 
Written 35.9% of 13000 rows in 2 secs using 32 threads. anyBufferGrown=no; maxBuffUsed=49%. Finished in 3 secs.
##    user  system elapsed 
##  18.308   2.124   3.415

data.table awesomeness - groups

irisDT<-data.table(iris)
knitr::kable(
  irisDT[, .SD[which.min(Petal.Length)]
       , Species])
| Species | Sepal.Length | Sepal.Width | Petal.Length | Petal.Width |
|---|---|---|---|---|
| setosa | 4.6 | 3.6 | 1.0 | 0.2 |
| versicolor | 5.1 | 2.5 | 3.0 | 1.1 |
| virginica | 4.9 | 2.5 | 4.5 | 1.7 |