Create column identifying minimum character from within a group and label ties

守給你的承諾、 提交于 2019-12-23 23:09:38

问题


I have paired data for 10 subjects (with some missing and some ties). My goal is to select the eye with the best disc_grade (A > B > C) and label ties accordingly from the data frame below.

I'm stuck on how to use R code to select the rows with the best disc_grade for each subject.

# Paired eye gradings: 10 patients, two rows each (right eye, then left eye).
# disc_grade runs from "A" (best) to "C"; NA marks a missing grade.
df <- structure(
  list(
    patientID = rep(c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), each = 2),
    eye = rep(c("R", "L"), times = 10),
    disc_grade = c(
      NA, "B",   # patient 1
      "C", "B",  # patient 2
      "B", "C",  # patient 3
      "B", "C",  # patient 4
      "B", "A",  # patient 5
      "B", "B",  # patient 6
      "C", "B",  # patient 7
      NA, NA,    # patient 8
      "B", "C",  # patient 9
      "B", "C"   # patient 10
    )
  ),
  class = c("tbl_df", "data.frame"),
  row.names = c(NA, -20L)
)

The desired output is:

   patientID   eye disc_grade
2          1   L          B
4          2   L          B
5          3   R          B
7          4   R          B
10         5   L          A
11         6   Tie        B
14         7   L          B
17         9   R          B
19        10   R          B

回答1:


This seems to work:

# Per patient: keep the row(s) whose grade equals the group's minimum grade,
# then collapse to one row, labelling the eye "Tie" when several rows remain.
df %>%
  group_by(patientID) %>%
  filter(disc_grade == min(disc_grade, na.rm = TRUE)) %>%
  summarise(
    eye = if (n() != 1) "Tie" else eye,
    disc_grade = first(disc_grade)
  )

  patientID   eye disc_grade
      (dbl) (chr)      (chr)
1         1     L          B
2         2     L          B
3         3     R          B
4         4     R          B
5         5     L          A
6         6   Tie          B
7         7     L          B
8         9     R          B
9        10     R          B

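There is a warning for group 8 (both of its grades are NA, so min() has nothing left to compare once they are removed), but we get the desired result because filter() drops rows where the condition evaluates to NA.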


With data.table:

# Per patient: subset .SD to the rows whose grade equals the group minimum,
# then reduce that subset to one row ("Tie" unless exactly one row is left).
setDT(df)[
  ,
  {
    best_rows <- .SD[disc_grade == min(disc_grade, na.rm = TRUE)]
    best_rows[, list(
      eye = if (.N != 1) "Tie" else eye,
      disc_grade = disc_grade[1]
    )]
  },
  by = patientID
]

Again, there's a warning, but now we do get a row for group 8, since [ does not ignore NAs. To get around this, you could filter the NAs before or after the operation (as in other answers). My best idea for doing it during the main operation is pretty convoluted:

# Same idea as the plain logical subset, but which() keeps only the positions
# where the comparison is TRUE, and a group with no matching rows returns
# nothing (the inner `if` has no else branch).
setDT(df)[
  ,
  {
    best_rows <- .SD[which(disc_grade == min(disc_grade, na.rm = TRUE))]
    best_rows[
      ,
      if (.N >= 1) {
        list(
          eye = if (.N != 1) "Tie" else eye,
          disc_grade = disc_grade[1]
        )
      }
    ]
  },
  by = patientID
]



回答2:


One option with data.table

library(data.table)

# 1. Drop rows containing NA, then relabel `eye` as "Tie" for any patient whose
#    remaining rows (more than one) all share a single grade.
#    `&&` is used because an `if` condition is a scalar test; `&` is the
#    element-wise operator.
# 2. Sort by grade with an explicit A < B < C ordering and keep the first
#    (best) row per patient.
# 3. Restore patient order for display.
na.omit(setDT(df))[
  , eye := if (uniqueN(disc_grade) == 1 && .N > 1) "Tie" else eye,
  by = patientID
][
  order(factor(disc_grade, levels = c("A", "B", "C"))),
  .SD[1L],
  by = patientID
][
  order(patientID)
]
#    patientID eye disc_grade
#1:         1   L          B
#2:         2   L          B
#3:         3   R          B
#4:         4   R          B
#5:         5   L          A
#6:         6 Tie          B
#7:         7   L          B
#8:         9   R          B
#9:        10   R          B



回答3:


library(dplyr)

# Paired eye gradings: 10 patients, two rows each (right eye, then left eye).
# disc_grade runs from "A" (best) to "C"; NA marks a missing grade.
df <- structure(
  list(
    patientID = rep(c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), each = 2),
    eye = rep(c("R", "L"), times = 10),
    disc_grade = c(
      NA, "B",   # patient 1
      "C", "B",  # patient 2
      "B", "C",  # patient 3
      "B", "C",  # patient 4
      "B", "A",  # patient 5
      "B", "B",  # patient 6
      "C", "B",  # patient 7
      NA, NA,    # patient 8
      "B", "C",  # patient 9
      "B", "C"   # patient 10
    )
  ),
  class = c("tbl_df", "data.frame"),
  row.names = c(NA, -20L)
)



df %>%
  filter(!is.na(disc_grade)) %>%             # remove rows with NA grades
  group_by(patientID) %>%                    # for each patient
  filter(disc_grade == min(disc_grade)) %>%  # keep the eye(s) with the best grade
  # n() is a single value per group, so a plain if/else is the right tool;
  # ifelse() is meant for element-wise tests on vectors.
  mutate(eye_upd = if (n() > 1) "tie" else eye) %>%  # both eyes kept => tie
  ungroup() %>%                              # do not leave the result grouped
  select(patientID, eye_upd, disc_grade) %>%
  distinct()                                 # collapse the duplicated tie rows

#    patientID eye_upd disc_grade
#        (dbl)   (chr)     (fctr)
# 1         1       L          B
# 2         2       L          B
# 3         3       R          B
# 4         4       R          B
# 5         5       L          A
# 6         6     tie          B
# 7         7       L          B
# 8         9       R          B
# 9        10       R          B



回答4:


There's certainly a better way to do this, but this gets the job done...need more coffee...

# Keep the untouched input so the best grade can be joined back to its eye(s).
df_orig <- df

library(dplyr)

df %>%
  filter(!is.na(disc_grade)) %>%
  group_by(patientID) %>%
  summarise(best = min(disc_grade)) %>%      # best grade per patient
  # Join back to recover which eye(s) hold that grade; a tie yields two rows.
  left_join(
    df_orig,
    by = c("patientID" = "patientID", "best" = "disc_grade")
  ) %>%
  group_by(patientID) %>%
  # n() is a single value per group, so use if/else rather than ifelse().
  mutate(eye = if (n() > 1) "tie" else eye) %>%
  # .keep_all = TRUE is required: without it distinct() returns only
  # patientID and the select() below fails because eye and best are gone.
  distinct(patientID, .keep_all = TRUE) %>%
  ungroup() %>%
  select(patientID, eye, best)

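Note: I am able to get away with min(disc_grade) because character values are compared alphabetically ("A" < "B" < "C"), which happens to match the grade ordering. Consider looking at as.numeric(as.factor(df$disc_grade)) to see that implied ordering.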



来源:https://stackoverflow.com/questions/33568497/create-column-identifying-minimum-character-from-within-a-group-and-label-ties

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!