Here, I would like to give a quick summary of the commonly used functions for the two packages. My personal feeling is that I like dplyr better, because it's interface is more consistent and provide some convenience functions to manipulate columns to which I want to apply functions (I will talk about this in details later).
dplyr
|
data.tables
|
|
Select
columns
|
||
Select columns by name |
data
= select(mtcars, mpg, cyl, disp, wt)
var_name = c('mpg', 'cyl', 'disp', 'wt')
data
= select_ (mtcars, .dots = var_name)
data = select(mtcars, starts_with("m"))
data
= select(mtcars, ends_with("t"))
data
= select(mtcars, contains("s"))
#
select columns starts with, ends with, or contains some keywords.
This is the part I like dplyr slightly better. You can do such
things in data.tables as well, but need a little more coding
effort.
|
mtcars.dt
= data.table(mtcars)
var_name
= c('mpg', 'cyl', 'disp', 'wt')
data.dt
= mtcars.dt[, .(mpg, cyl, disp, wt)]
data.dt
= mtcars.dt[, var_name, with = F]
#
with=F would allow you to do normal data.frame operations in
data.tables.
|
Select columns by index |
data
= mtcars[, 2:5]
|
data.dt
= mtcars.dt[, 2:5, with = F]
|
Derived
columns
|
||
data
= mutate(mtcars,
x
= mpg/cyl,
mpg_new
= x*cyl,
mpg_mean
= mean(mpg)
)
#
newly added one can be used in later calculation
|
data.dt
= mtcars.dt[, x := mpg/cyl]
#
add a new column called x
data.dt
= mtcars.dt[, `:=`(x = mpg/cyl, mpg_mean = mean(mpg))][, mpg_new
:= x*cyl]
#
add two new columns, and use chain operator add the third column
|
|
Select
rows
|
||
Select
rows by condition
Comments:
select rows by condition in data.tables sometimes may not be
straightforward
|
data
= filter(mtcars, cyl==4, mpg>=30) # and
relation
data
= filter(mtcars, cyl==4 | mpg>=30) # or
relation
data
= filter(mtcars, cyl %in% c(4, 6))
|
data.dt
= mtcars.dt[cyl==4 & mpg>=30]
data.dt
= mtcars.dt[cyl==4 & vs==1]
#
you don't have to put a commar for the second dimension
setkey(mtcars.dt,
cyl, vs)
data.dt
= mtcars.dt[list(4, 1)]
data.dt
= mtcars.dt[.(4, 1)]
#
yo have to use list to select rows with cyl==4 and vs==1; .() is
the same as list()
data.dt
= mtcars.dt[cyl %in% c(4,6)]
data.dt2
= mtcars.dt[J(c(4,6))]
#
the same as above, cyl is either 4 or 6, use must use J() or .().
without that it will select the 4th and 6th
row (see below)
data.dt
= mtcars.dt[cyl %in% c(4,6) & vs==1]
#
cyl is 4 or 6, and vs==1
data.dt2
= mtcars.dt[.(c(6,4), 1)]
#
the same as above, .() is the same as J()
|
Select
rows by index
|
#
straightforward
|
data.dt
= mtcars.dt[c(6,4)]
#
the sixth and 4th row, NOT cyl== 4 or 6
|
Aggregation
functions
|
||
Aggregate
n values to 1 value
|
myfun
= function(x,y) return(mean(x/y))
data
= mtcars %>% group_by(vs) %>%
summarise(
avg_mpg
= mean(mpg),
avg_mpg_per_cyl
= mean(mpg/cyl),
avg_mpg_per_cyl2
= myfun(mpg, cyl),
row_cnt
= n()
)
|
myfun
= function(x,y) return(mean(x/y))
data.dt
= mtcars.dt[, list(
avg_mpg
= mean(mpg),
avg_mpg_per_cyl
= mean(mpg/cyl),
avg_mpg_per_cyl2
= myfun(mpg, cyl),
row_cnt
= .N), by = "vs"]
#
you have to use list() to wrap all the aggregation functions.
|
Window
function (n values to n values)
|
data
= mtcars %>% group_by(cyl) %>%
mutate(
lag_mpg
= lag(mpg),
mpg_rnk
= min_rank(desc(mpg))
)
%>% arrange(cyl, mpg)
var_name
= c('mpg', 'cyl', 'disp', 'wt')
data
= mtcars %>% group_by(cyl) %>%
mutate_each_(funs(lag),
var_name) %>%
arrange(cyl)
#
apply 1 functions to multiple columns. this will replace the
original column by the lag value.
data
= mtcars %>% group_by(cyl) %>%
mutate_each_(funs(lag,
mean), var_name) %>%arrange(cyl)
#
apply 2 functions to multiple columns. this will CREATE additional
columns for lag and mean, the origial column is also kept
|
mtcars.dt
= data.table(mtcars)
setkey(mtcars.dt,
cyl)
#
you don't have to setkey before using 'by='
mtcars.dt
= mtcars.dt[, mpg_lag := c(NA, mpg[-.N]), by = cyl]
#
data.table way to write lag function
mtcars.dt
= data.table(mtcars)
lag_var_name
= paste(var_name, "_lag", sep="")
mtcars.dt
= mtcars.dt[, (lag_var_name) := lapply(.SD, function(x){c(NA,
x[-.N])}), by = cyl, .SDcols = var_name]
#
you must use (lag_var_name) instead of lag_var_name. if you use
lag_var_name := xxxx, it will create one column called
"lag_var_name"
|
More
general (n to m values)
|
#
I don't think you have a direct way to do that. Use “mutate”
or “summarise” function, the output has to have n or one
variables respectively. You can write a function that returns one
list, and dplyr will create a column, with each cell contains a
list.
half_list
= function(l, na.last = TRUE, ties.method = "first"){
r
= rank(l, na.last =na.last,
ties.method = ties.method) return(l[r<=length(r)/2])
}
data
= mtcars %>% group_by(cyl) %>%
do(min.50pct.mpg
= half_list(.$mpg)) %>%
mutate(list.len
= length(min.50pct.mpg))
|
half_list = function(l, na.last = TRUE, ties.method = "first"){
r = rank(l, na.last =na.last,
ties.method = ties.method) return(l[r<=length(r)/2])
}
data.dt
= mtcars.dt[,
.(min.50pct.mpg = half_list(mpg), max.50pct.mpg = -half_list(-mpg)), by=cyl ]
#
data.table is more "smart", it will merge the newly
created list to the by-variables of the original data table. In
the case that multiple lists are created, they must have the same
length. Otherwise, it will repeat the shorter lists.
|