data.table is “SQL-like”

DT[i, j, by]

data.table behaves like a database

A data.table acts like an in-memory RDBMS:

data.table differences to SQL Server

There are some differences that need to be mentioned:

data.table cookbook

Single table basics

Task Generic syntax Example(s)*
CREATE data.table(…) setDT() data.table(a=1:3 , b=LETTERS[1:3]) data.table(iris)
PRIMARY KEY data.table(…,key) setkey() data.table(a=1:3 , b=LETTERS[1:3], key="b") setkey(data.table(iris),Species)
SELECT basic DT[ , .( cols )] irisDT[ , .(Species, Sepal.Length)]
SELECT alias DT[ , .( a=col )] irisDT[ , .(Species, Length=Sepal.Length)]
SELECT COUNT DT[ , .N] irisDT[ ,.N]
SELECT COUNT DISTINCT DT[ , uniqueN(cols)] irisDT[ ,uniqueN(.SD)]
SELECT aggregation DT[ , .( sum(col) , .N )] irisDT[ , .(Count=.N, Length=mean(Sepal.Length))]
WHERE exact on primary key DT[value] DT[value, ] irisDT["setosa"] irisDT["setosa", .(Count=.N)]
WHERE DT[condition] DT[condition, j, by] irisDT[Species=="setosa"] irisDT[Species=="setosa", .(Count=.N)]
WHERE BETWEEN DT[between(col, min, max)] DT[ col %between% c(min,max) ] irisDT[between(Sepal.Length, 1, 5)] irisDT[Sepal.Length %between% c(1,5)]
WHERE LIKE DT[like(col,pattern)] DT[ col %like% pattern ] irisDT[like(Species,"set")] irisDT[Species %like% "set"]
ORDER asc. DT[order(cols)] DT[order(cols), j, by] irisDT[order(Species)]
ORDER desc. DT[order(-cols)] DT[order(-cols), j, by] irisDT[order(-Species)]
ORDER multiple DT[order(cols)] DT[order(cols), j, by] irisDT[order(-Species, Petal.Width)]
GROUP BY single DT[i, j, by] irisDT[ ,.N, by=Species]
GROUP BY multiple DT[i, j, by] irisDT[ ,.N, by=.(Species,Width=Petal.Width)]
TOP head(DT, n) head(irisDT)
HAVING DT[i, j, by][condition] irisDT[ , .(Count=.N), by=Species][Count>25]
Sub-queries DT[…][…][…] irisDT[ , .(Sepal.Length=mean(Sepal.Length)), by=Species][Sepal.Length>6, .(Species)]

* Uses irisDT <- data.table(iris)


Task Generic syntax Example(s)*
INSERT DT <- rbindlist(DT, newDT) irisDT<-rbindlist( irisDT, irisDT[1] )
READ aka SELECT (see above) DT[ , .( cols )] irisDT[ , .(Species, Sepal.Length)]
UPDATE / ADD column DT[ , a := b ] irisDT[ , Sepal.Area := Sepal.Width * Sepal.Length]
UPDATE / ADD multiple columns DT[ , :=(a = b, c = d) ] irisDT[ , `:=`(CreatedDate = Sys.Date(), User = "Steph")]
UPDATE / ADD multiple columns by reference DT[ , (newcols):=vals ] irisDT[ , c("a","b"):=.(1,2)]
DELETE DT <- DT[!condition] irisDT <- irisDT[!(Species=="setosa" & Petal.Length>=1.5)]
DROP table DT <- NULL irisDT<-NULL
DROP column DT[,col:=NULL] iristDT[,Species:=NULL]

* Uses irisDT <- data.table(iris)


Task Generic syntax Example(s)*
Structure str(DT) str(irisDT)
Column Names colnames(DT) colnames(irisDT)
Summary stats summary(DT) summary(irisDT)
Retrieve primary key info key(DT) key(irisDT)
List all data.tables tables() tables()

* Uses irisDT <- data.table(iris)


Task Generic syntax Example(s)*
INNER JOIN Y[X, nomatch=0] lookupDT[irisDT,nomatch=0]
LEFT JOIN Y[X] lookupDT[irisDT]
FULL JOIN merge(X, Y, all=TRUE) merge(irisDT, lookupDT, all=TRUE)
CROSS JOIN optiRum::CJ.dt(X,Y) CJ.dt(irisDT, lookupDT)
UNION ALL rbindlist( list(X,Y), fill=TRUE ) rbindlist( list(irisDT, lookupDT), fill=TRUE )
UNION unique( rbindlist( list(X,Y), fill=TRUE ) ) unique( rbindlist( list(irisDT, lookupDT), fill=TRUE ) )
JOIN and AGGREGATE Y[X, cols, by] lookupDT[irisDT,.(count=.N),by=Band]

* Uses:

irisDT   <- data.table(iris, key="Species")
lookupDT <- data.table(Species=c("setosa", "virginica", "Blah"), Band=c("A", "B", "A"), key="Species")

Intermediate tasks

Task Generic syntax Example(s)*
SELECT dynamically DT[ , colnames , with=FALSE] , DT[ , .SD , .SDcols=colnames cols<-colnames(irisDT); irisDT[ , cols, with=FALSE] cols<-colnames(irisDT); irisDT[ , .SD, .SDcols=colnames]
GROUP BY dynamically DT[ , …, by=colnames] irisDT[,.N,by=c("Species")]
GROUP BY, ORDER BY group DT[ , … , keyby] irisDT[,.N,keyby=Species]
UPDATE / ADD column of summary stat DT[ , a := b ] irisDT[ , All.SL.Mean:=mean(Sepal.Length)]
UPDATE / ADD column by group DT[ , a := b, by] irisDT[ , Species.SL.Mean:=mean(Sepal.Length), by=Species]
TOP by group DT[ , head(.SD), by] irisDT[ , head(.SD,2) , by=Species]
Largest record DT[ which.max(col) ] irisDT[ which.max(Sepal.Length) ]
Largest record by group DT[ , .SD[ which.max(col) ], by] irisDT[ , .SD[ which.max(Sepal.Length) ], by=Species]
Cumulative total DT[ , cumsum(col) ] irisDT[ , cumsum(Sepal.Width)]
NEGATIVE SELECT DT[ , .SD, .SDcols=-“colname”] irisDT[ , .SD, .SDcols=-"Species"]
RANK DT[ , frank(col) ] irisDT[ , frank(Sepal.Length,ties.method="first")]
AGGREGATE multiple columns DT[ , lapply(.SD, sum)] irisDT[ , lapply(.SD,sum), .SDcols=-"Species"]
AGGREGATE multiple columns by group DT[ , lapply(.SD, sum), by] irisDT[ , lapply(.SD,sum), by=Species]
COUNT DISTINCT multiple columns by group DT[ , lapply(.SD, uniqueN), by] irisDT[ , lapply(.SD,uniqueN), by=Species]
COUNT NULL multiple columns by group DT[ , lapply(.SD, function(x) sum(, by] irisDT[ , lapply(.SD,function(x) sum(, by=Species]
PIVOT data - to single value column melt(DT,…) melt(irisDT)
PIVOT data - to aggregate dcast(DT, a~b, function) dcast(melt(irisDT), Species ~ variable, sum)
Convert a large data.frame or list setDT() iris<-iris; setDT(iris)
ROW_NUMBER DT[ , .I] irisDT[ , .I]
GROUP number DT[, .GRP ,by] irisDT[ , .GRP, by=Species]

* Uses irisDT <- data.table(iris)

Advanced tasks

Task Generic syntax Example(s)*
GROUP BY each new incidence of group DT[ , cols , by=(col, rleid(col))] irisDT[order(Sepal.Length), .N, by=.(Species, rleid(Species))]
Calculate using (previous/next) N row DT[ , shift( cols, n)] irisDT[ , prev.Sepal.Length:=shift(Sepal.Length), by=Species ]
ORDER underlying table setorder() setorder(irisDT,Species)
JOIN & GROUP by keys X[Y, .N, by=.EACHI] irisDT[lookupDT, .N, by=.EACHI]
TRANSPOSE data.table transpose(DT) transpose(irisDT)
Split string to columns DT[, tstrsplit(charCol, pattern) ] irisDT[ , tstrsplit(as.character(Species),"e")]

* Uses:

irisDT   <- data.table(iris, key="Species")
lookupDT <- data.table(Species=c("setosa", "virginica", "Blah"), Band=c("A", "B", "A"), key="Species")

More resources