Add missing rows to data.table according to multiple keyed columns

问题

I have a data.table object that contains multiple columns that specify unique cases. In the small example below, the variables "name", "job", and "sex" specify the unique IDs. I would like to add missing rows so that each case has a row for each possible instance of another variable, "from" (similar to expand.grid).

library(data.table)

# Fix the RNG state so the simulated scores are reproducible.
set.seed(1)

# Example data: (name, job, sex) identifies a case, `from` is the source
# that scored it, and `score` is a random draw.
mydata <- data.table(
  name  = rep(c("john", "mary", "chris"), times = c(4, 1, 3)),
  job   = rep(c("teacher", "police", "lawyer", "doctor"), times = c(4, 1, 2, 1)),
  sex   = rep(c("male", "female", "male"), times = c(4, 2, 2)),
  from  = c("NYT", "USAT", "BG", "TIME", "USAT", "BG", "NYT", "NYT"),
  score = rnorm(8)
)

# Key the table on the three columns that identify a case.
setkey(mydata, name, job, sex)

# Attempted cross join of distinct cases with distinct sources.
mydata[CJ(unique(name, job, sex), unique(from))]

Here's the current data.table object:

> mydata
    name     job    sex from      score
1:  john teacher   male  NYT -0.6264538
2:  john teacher   male USAT  0.1836433
3:  john teacher   male   BG -0.8356286
4:  john teacher   male TIME  1.5952808
5:  mary  police female USAT  0.3295078
6: chris  lawyer female   BG -0.8204684
7: chris  lawyer   male  NYT  0.4874291
8: chris  doctor   male  NYT  0.7383247

Here's the result I'd like:

> mydata
     name     job    sex from      score
1:   john teacher   male  NYT -0.6264538
2:   john teacher   male USAT  0.1836433
3:   john teacher   male   BG -0.8356286
4:   john teacher   male TIME  1.5952808
5:   mary  police female  NYT  NA
6:   mary  police female USAT  0.3295078
7:   mary  police female   BG  NA
8:   mary  police female TIME  NA
9:  chris  lawyer female  NYT  NA
10: chris  lawyer female USAT  NA
11: chris  lawyer female   BG -0.8204684
12: chris  lawyer female TIME  NA
13: chris  lawyer   male  NYT  0.4874291
14: chris  lawyer   male USAT  NA
15: chris  lawyer   male   BG  NA
16: chris  lawyer   male TIME  NA
17: chris  doctor   male  NYT  0.7383247
18: chris  doctor   male USAT  NA
19: chris  doctor   male   BG  NA
20: chris  doctor   male TIME  NA

Here's what I've tried:

# Key the table on the three case-identifying columns.
setkeyv(mydata, cols=c("name","job","sex"))
# Try to cross the distinct cases with the distinct sources.
# This call fails: unique() de-duplicates a single object, so `job` and `sex`
# are matched positionally to its `incomparables` and `fromLast` arguments
# instead of being treated as additional columns.
mydata[CJ(unique(name, job, sex), unique(from))]

But I receive the following error and adding fromLast=TRUE (or FALSE) does not give me the right solution:

Error in unique.default(name, job, sex) : 
  'fromLast' must be TRUE or FALSE

Here are the relevant answers I've come across (but none appears to deal with multiple keyed columns): add missing rows to a data table

Efficiently inserting default missing rows in a data.table

Fastest way to add rows for missing values in a data.frame?

回答1:

A couple of possibilities are here - https://github.com/Rdatatable/data.table/pull/814

# Cross join that accepts whole tables as well as plain vectors.
#
# Arguments:
#   ...  Any mix of data.frames/data.tables and vectors. A table contributes
#        its rows as one unit; a vector contributes its elements.
#
# Returns:
#   A data.table holding every combination of (rows of each table) x
#   (elements of each vector), with the columns of the inputs side by side
#   in argument order.
CJ.dt <- function(...) {
  args <- list(...)

  # One integer index per argument: row numbers for tables, positions for
  # vectors. CJ() then enumerates every combination of those indices,
  # producing one index column per argument.
  idx <- lapply(args, function(x) {
    if (is.data.frame(x)) seq_len(nrow(x)) else seq_along(x)
  })
  rows <- do.call(CJ, idx)

  # Expand each argument by its own index column. A plain data.frame needs
  # explicit row indexing, because `x[y]` would select columns there; for a
  # data.table or a vector `x[y]` already picks rows / elements.
  pieces <- Map(
    function(x, y) {
      if (is.data.frame(x) && !is.data.table(x)) {
        x[y, , drop = FALSE]
      } else {
        x[y]
      }
    },
    args,
    rows
  )

  do.call(data.table, pieces)
}

# Key on all four join columns so the cross-joined grid can be matched
# against them by position.
setkeyv(mydata, c("name", "job", "sex", "from"))

# Join the full grid of (distinct case) x (distinct source) back onto the
# data; combinations absent from `mydata` come back with score = NA.
mydata[
  CJ.dt(
    unique(data.table(name, job, sex)),
    unique(from)
  )
]
#     name     job    sex from      score
# 1: chris  doctor   male  NYT  0.7383247
# 2: chris  doctor   male   BG         NA
# 3: chris  doctor   male TIME         NA
# 4: chris  doctor   male USAT         NA
# 5: chris  lawyer female  NYT         NA
# 6: chris  lawyer female   BG -0.8204684
# 7: chris  lawyer female TIME         NA
# 8: chris  lawyer female USAT         NA
# 9: chris  lawyer   male  NYT  0.4874291
#10: chris  lawyer   male   BG         NA
#11: chris  lawyer   male TIME         NA
#12: chris  lawyer   male USAT         NA
#13:  john teacher   male  NYT -0.6264538
#14:  john teacher   male   BG -0.8356286
#15:  john teacher   male TIME  1.5952808
#16:  john teacher   male USAT  0.1836433
#17:  mary  police female  NYT         NA
#18:  mary  police female   BG         NA
#19:  mary  police female TIME         NA
#20:  mary  police female USAT  0.3295078

回答2:

The dev version of tidyr now has an elegant way to do this because the expand() function now supports nesting and crossing:

library(dplyr)
library(tidyr)  # expand() and nesting() are tidyr functions, not dplyr ones

# Rebuild the example data as a tibble. No seed is set in this snippet, so
# the scores are fresh random draws.
mydata <- tibble(
  name = c("john", "john", "john", "john", "mary", "chris", "chris", "chris"),
  job = c("teacher", "teacher", "teacher", "teacher", "police", "lawyer", "lawyer", "doctor"),
  sex = c("male", "male", "male", "male", "female", "female", "male", "male"),
  from = c("NYT", "USAT", "BG", "TIME", "USAT", "BG", "NYT", "NYT"),
  score = rnorm(8)
)

# nesting() keeps only the (name, job, sex) combinations that occur in the
# data; expand() crosses them with every distinct value of `from`. The left
# join then pulls the scores back in on the shared columns, leaving NA for
# the rows that were added.
mydata %>%
  expand(nesting(name, job, sex), from) %>%
  left_join(mydata)

#> Joining by: c("name", "job", "sex", "from")
#> Source: local data frame [20 x 5]
#> 
#>     name     job    sex from      score
#> 1  chris  doctor   male   BG         NA
#> 2  chris  doctor   male  NYT  0.5448206
#> 3  chris  doctor   male TIME         NA
#> 4  chris  doctor   male USAT         NA
#> 5  chris  lawyer female   BG  1.2015173
#> 6  chris  lawyer female  NYT         NA
#> 7  chris  lawyer female TIME         NA
#> 8  chris  lawyer female USAT         NA
#> 9  chris  lawyer   male   BG         NA
#> 10 chris  lawyer   male  NYT -1.0930237
#> 11 chris  lawyer   male TIME         NA
#> 12 chris  lawyer   male USAT         NA
#> 13  john teacher   male   BG  1.1345461
#> 14  john teacher   male  NYT  1.3032946
#> 15  john teacher   male TIME  2.4901830
#> 16  john teacher   male USAT -1.6449096
#> 17  mary  police female   BG         NA
#> 18  mary  police female  NYT         NA
#> 19  mary  police female TIME         NA
#> 20  mary  police female USAT -0.2443080

回答3:

One possibility would be to paste the columns name, job, and sex together, get the unique values, and then do CJ with the unique values of from. After that, use cSplit from library(splitstackshape) to split the pasted column back to three columns, rename those columns with setnames, and join with mydata after setting the key.

library(splitstackshape)
library(data.table)
# Build the full grid of cases x sources in one nested expression:
#   1. paste() collapses name/job/sex into a single space-separated string,
#      so CJ() can cross the distinct cases with the distinct `from` values.
#   2. cSplit() splits that pasted column (V1) back apart on " ".
#   3. setnames() renames columns 2:4 to name/job/sex, and the trailing
#      [, c(2:4, 1)] moves those three columns ahead of `from`.
# NOTE(review): the 2:4 and c(2:4, 1) positions presumably rely on cSplit()
# placing the split pieces after `from` -- confirm against cSplit's output.
# NOTE(review): splitting on " " assumes no name/job/sex value contains a
# space -- verify before using on real data.
mydata1 <- setnames(cSplit(mydata[,CJ(unique(paste(name, job, sex)), 
            from=unique(from))], 'V1', ' '), 2:4, c('name', 'job', 'sex'))[,
                     c(2:4,1), with=FALSE]
# Key the original data on its first four columns (name, job, sex, from) so
# the join below matches the grid on them.
setkeyv(mydata, cols=colnames(mydata)[1:4])
# Join: one output row per grid row; score is NA where the combination is
# missing from mydata.
mydata[mydata1]
#      name     job    sex from      score
#1: chris  doctor   male   BG         NA
#2: chris  doctor   male  NYT  0.7383247
#3: chris  doctor   male TIME         NA
#4: chris  doctor   male USAT         NA
#5: chris  lawyer female   BG -0.8204684
#6: chris  lawyer female  NYT         NA
#7: chris  lawyer female TIME         NA
#8: chris  lawyer female USAT         NA
#9: chris  lawyer   male   BG         NA
#10: chris  lawyer   male  NYT  0.4874291
#11: chris  lawyer   male TIME         NA
#12: chris  lawyer   male USAT         NA
#13:  john teacher   male   BG -0.8356286
#14:  john teacher   male  NYT -0.6264538
#15:  john teacher   male TIME  1.5952808
#16:  john teacher   male USAT  0.1836433
#17:  mary  police female   BG         NA
#18:  mary  police female  NYT         NA
#19:  mary  police female TIME         NA
#20:  mary  police female USAT  0.3295078

来源：https://stackoverflow.com/questions/27372027/add-missing-rows-to-data-table-according-to-multiple-keyed-columns

标签

merge

data.table

cross-join