Create column identifying minimum character from within a group and label ties

问题

I have paired data for 10 subjects (with some missing and some ties). My goal is to select the eye with the best disc_grade (A > B > C) and label ties accordingly from the data frame below.

I'm stuck on how to use R code to select the rows with the best disc_grade for each subject.

# Paired eye data for 10 patients: two rows per patient, right eye ("R")
# first, then left eye ("L"). disc_grade is "A", "B" or "C"; NA marks a
# missing grade.
df <- structure(
  list(
    patientID = as.numeric(rep(1:10, each = 2)),
    eye = rep(c("R", "L"), times = 10),
    disc_grade = c(
      NA, "B",   # patient 1
      "C", "B",  # patient 2
      "B", "C",  # patient 3
      "B", "C",  # patient 4
      "B", "A",  # patient 5
      "B", "B",  # patient 6 (both eyes share the same grade)
      "C", "B",  # patient 7
      NA, NA,    # patient 8 (both grades missing)
      "B", "C",  # patient 9
      "B", "C"   # patient 10
    )
  ),
  class = c("tbl_df", "data.frame"),
  row.names = c(NA, -20L)
)

The desired output is:

   patientID   eye disc_grade
2          1   L          B
4          2   L          B
5          3   R          B
7          4   R          B
10         5   L          A
11         6   Tie        B
14         7   L          B
17         9   R          B
19        10   R          B

回答1:

This seems to work:

# For each patient, keep the eye(s) holding the best grade and collapse them to
# a single row; the eye is reported as "Tie" when several eyes share that grade.
df %>% 
  group_by(patientID) %>% 
  # min() on the character grades returns the alphabetically first one, i.e.
  # the best grade. Rows whose comparison evaluates to NA are not kept.
  filter(disc_grade == min(disc_grade, na.rm=TRUE)) %>%
  # One surviving row -> report that eye; more than one -> report "Tie".
  summarise(eye = if (n()==1) eye else "Tie", disc_grade = first(disc_grade))

  patientID   eye disc_grade
      (dbl) (chr)      (chr)
1         1     L          B
2         2     L          B
3         3     R          B
4         4     R          B
5         5     L          A
6         6   Tie          B
7         7     L          B
8         9     R          B
9        10     R          B

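There is a warning for group 8 (both of its grades are NA, so min() has nothing left after removing them), but we still get the desired result because filter() drops rows for which the condition evaluates to NA.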

With data.table:

# data.table version: within each patientID, subset .SD to the rows holding the
# best grade, then collapse that subset to a single row.
setDT(df)[, 
  # Unlike dplyr's filter(), subsetting with `[` does not discard rows whose
  # comparison is NA, so a patient with only NA grades still produces a row.
  .SD[ disc_grade == min(disc_grade, na.rm=TRUE) ][,
    # .N is the number of rows left after the subset: 1 -> that eye, else "Tie".
    .( eye = if (.N==1) eye else "Tie", disc_grade = disc_grade[1] )
  ]
, by=patientID]

Again, there's a warning, but now we do get a row for group 8, since [ does not ignore NAs. To get around this, you could filter the NAs before or after the operation (as in other answers). My best idea for doing it during the main operation is pretty convoluted:

# Variant that drops all-NA patients inside the grouped call itself.
setDT(df)[, 
  # which() converts the logical comparison to row positions and leaves out
  # the NA comparisons, so an all-NA patient ends up with zero rows here.
  .SD[ which(disc_grade == min(disc_grade, na.rm=TRUE)) ][,
    # `if` without `else` yields NULL when no rows remain, so that patient
    # contributes nothing to the result.
    if (.N >= 1) list( eye = if (.N==1) eye else "Tie", disc_grade = disc_grade[1] )
  ]
, by=patientID]

回答2:

One option with data.table

library(data.table)
# Drop rows with a missing grade, flag ties, then keep the best-graded row for
# each patient and return the result sorted by patientID.
na.omit(setDT(df))[
  ,
  # A patient is a tie when more than one graded eye remains and all of the
  # remaining grades are the same. Both operands are single values, so the
  # scalar, short-circuiting `&&` is the right operator inside `if`.
  eye := if (uniqueN(disc_grade) == 1L && .N > 1L) "Tie" else eye,
  by = patientID
][
  # Sort by grade from best ("A") to worst ("C") so that the first row of each
  # patient is the best eye.
  order(factor(disc_grade, levels = c("A", "B", "C"))),
  .SD[1L],
  by = patientID
][
  order(patientID)
]
#    patientID eye disc_grade
#1:         1   L          B
#2:         2   L          B
#3:         3   R          B
#4:         4   R          B
#5:         5   L          A
#6:         6 Tie          B
#7:         7   L          B
#8:         9   R          B
#9:        10   R          B

回答3:

library(dplyr)

# Example data: one row per eye, two eyes per patient; NA marks a missing grade.
df <- structure(
  list(
    patientID = c(
      1, 1, 2, 2, 3, 3, 4, 4, 5, 5,
      6, 6, 7, 7, 8, 8, 9, 9, 10, 10
    ),
    eye = c(
      "R", "L", "R", "L", "R", "L", "R", "L", "R", "L",
      "R", "L", "R", "L", "R", "L", "R", "L", "R", "L"
    ),
    disc_grade = c(
      NA, "B", "C", "B", "B", "C", "B", "C", "B", "A",
      "B", "B", "C", "B", NA, NA, "B", "C", "B", "C"
    )
  ),
  class = c("tbl_df", "data.frame"),
  row.names = c(NA, -20L)
)



# Pick each patient's best-graded eye; the eye is labelled "tie" when more than
# one eye shares the best grade.
df %>%
  filter(!is.na(disc_grade)) %>%  # drop eyes with a missing grade
  group_by(patientID) %>%
  # Character grades compare alphabetically, so min() is the best grade.
  filter(disc_grade == min(disc_grade)) %>%
  # n() is a single value per group, so a scalar if/else is used rather than
  # the vectorised ifelse(). More than one surviving row means a tie.
  mutate(eye_upd = if (n() > 1) "tie" else eye) %>%
  select(patientID, eye_upd, disc_grade) %>%
  distinct() %>%  # tied rows are identical by now, so they collapse to one
  ungroup()       # do not leave the result grouped for later steps

#    patientID eye_upd disc_grade
#        (dbl)   (chr)     (fctr)
# 1         1       L          B
# 2         2       L          B
# 3         3       R          B
# 4         4       R          B
# 5         5       L          A
# 6         6     tie          B
# 7         7       L          B
# 8         9       R          B
# 9        10       R          B

回答4:

There's certainly a better way to do this, but this gets the job done...need more coffee...

# Keep an untouched copy of the data to join the best grade back against.
df_orig <- df

library(dplyr)

# Compute each patient's best grade, join back to recover which eye(s) have it,
# and label the eye "tie" when more than one eye matches.
df %>%
  filter(!is.na(disc_grade)) %>%
  group_by(patientID) %>%
  summarise(best = min(disc_grade)) %>%
  left_join(
    df_orig,
    by = c("patientID" = "patientID", "best" = "disc_grade")
  ) %>%
  group_by(patientID) %>%
  # n() is a single value per group, so use scalar if/else, not ifelse().
  mutate(eye = if (n() > 1) "tie" else eye) %>%
  # .keep_all = TRUE is required: without it distinct() returns only
  # patientID and the select() below fails because eye and best are gone.
  distinct(patientID, .keep_all = TRUE) %>%
  ungroup() %>%
  select(patientID, eye, best)

Note: I am able to get away with min(disc_grade) because character values are compared alphabetically, so "A" < "B" < "C". To see the implied ordering as numbers, consider looking at as.numeric(as.factor(df$disc_grade)).

来源：https://stackoverflow.com/questions/33568497/create-column-identifying-minimum-character-from-within-a-group-and-label-ties

标签

data.table

dplyr

subset