问题
I have a dataframe as follows and I would like to combine two columns, namely Var1
and Var2
. I want the combined column (Var3
) to contain no duplicates of <alpha><digit>
. i.e. if Var1 == A1
and Var2 == A1
, hence Var3 == A1
but not Var3 == A1-A1
or if Var1 == A4-E9
and Var2 == A4
, hence Var3 == A4-E9
but not Var3 == A4-E9-A4
df <- read.table(header = TRUE, text =
"id Var1 Var2
A A1 A1
B F2 A2
C NA A3
D A4-E9 A4
E E5 A5
F NA NA
G B2-R4 A3-B2
H B3-B4 E1-G5", stringsAsFactors = FALSE)
The following is my code. I would like to improve on its readability as well as get rid of the NA
that is present in row 3's entry for Var3
, i.e A3-NA
.
library(dplyr)
library(tidyr)
df %>%
mutate(Var3 = paste(Var1, Var2, sep = "-")) %>%
separate_rows(Var3, sep = "-") %>%
group_by(id, Var3) %>%
slice(1) %>%
group_by(id) %>%
mutate(Var3 = paste(unlist(Var3[!is.na(Var3)]), collapse = "-")) %>%
slice(1) %>%
ungroup
Here's my desired output:
# A tibble: 8 x 4
id Var1 Var2 Var3
<chr> <chr> <chr> <chr>
1 A A1 A1 A1
2 B F2 A2 A2-F2
3 C <NA> A3 A3
4 D A4-E9 A4 A4-E9
5 E E5 A5 A5-E5
6 F <NA> <NA> <NA>
7 G B2-R4 A3-B2 A3-B2-R4
8 H B3-B4 E1-G5 B3-B4-E1-G5
回答1:
if 'df1' is the output, then we remove the 'NA' that follows a -
with sub
df1 %>%
mutate(Var3 = sub("-NA", "", Var3))
# A tibble: 8 x 4
# id Var1 Var2 Var3
# <chr> <chr> <chr> <chr>
#1 A A1 A1 A1
#2 B F2 A2 A2-F2
#3 C <NA> A3 A3
#4 D A4-E9 A4 A4-E9
#5 E E5 A5 A5-E5
#6 F <NA> <NA> NA
#7 G B2-R4 A3-B2 A3-B2-R4
#8 H B3-B4 E1-G5 B3-B4-E1-G5
We can also do this slightly differently with tidyverse
by gather
into 'long' format, then split the 'value' column using separate_rows
, grouped by 'id', summarise
the 'Var3' column by paste
ing the sort
ed unique
elements of 'Var3' and left_join
with the original dataset 'df'
library(tidyverse)
gather(df, key, value, -id) %>%
separate_rows(value) %>%
group_by(id) %>%
summarise(Var3 = paste(sort(unique(value)), collapse='-')) %>%
mutate(Var3 = replace(Var3, Var3=='', NA)) %>%
left_join(df, .)
# id Var1 Var2 Var3
#1 A A1 A1 A1
#2 B F2 A2 A2-F2
#3 C <NA> A3 A3
#4 D A4-E9 A4 A4-E9
#5 E E5 A5 A5-E5
#6 F <NA> <NA> <NA>
#7 G B2-R4 A3-B2 A3-B2-R4
#8 H B3-B4 E1-G5 B3-B4-E1-G5
NOTE: The %>%
makes even a simple code to appear in multiple lines, but if required, we can put all those statements in a single line and term as one-liner
Here is a one-liner
library(data.table)
setDT(df)[, Var3 := paste(sort(unique(unlist(strsplit(unlist(.SD),"-")))), collapse="-"), id]
回答2:
You could do it in one line
df$Var3 = lapply(strsplit(paste(df$Var1, df$Var2, sep = "-"),"-"),
function(x)paste(unique(x)[unique(x)!="NA"],collapse="-"))
Output:
id Var1 Var2 Var3
1 A A1 A1 A1
2 B F2 A2 F2-A2
3 C <NA> A3 A3
4 D A4-E9 A4 A4-E9
5 E E5 A5 E5-A5
6 F <NA> <NA>
7 G B2-R4 A3-B2 B2-R4-A3
8 H B3-B4 E1-G5 B3-B4-E1-G5
- The first part in the lapply function is similar to your first call with dplyr. First the columns are concatenated, and then we split them again.
- The function within lapply removes all NA's, and then collapses the string again.
Hope this helps!
EDIT: Speed comparison for fun!
- 262,144 rows
Average runtimes:
- Florian: 3.97 seconds
- Sotos: 2.46 seconds
- Akrun: 1.34 seconds
- Adamm: >120 seconds
df <- read.table(header = TRUE, text =
"id Var1 Var2
A A1 A1
B F2 A2
C NA A3
D A4-E9 A4
E E5 A5
F NA NA
G B2-R4 A3-B2
H B3-B4 E1-G5", stringsAsFactors = FALSE)
for(i in 1:15)
{
df = rbind(df,df)
}
library(microbenchmark)
# Florian's method
microbenchmark(
lapply(strsplit(paste(df$Var1, df$Var2, sep = "-"),"-"),
function(x)paste(unique(x)[unique(x)!="NA"],collapse="-")),times=5)
# Sotos'method
microbenchmark(
gsub('NA-|-NA', '', vapply(strsplit(do.call(paste, df[-1]), " |-"), function(i) paste(unique(i), collapse = "-"), character(1L))), times=5)
# akrun method
library(data.table)
microbenchmark(
setDT(df)[, Var3 := paste(sort(unique(unlist(strsplit(unlist(.SD),"-")))), collapse="-"), id], times=5)
# Adamm method
microbenchmark(
sapply(1:nrow(df), function(i) ifelse(df[i,2]!=df[i,3] & !is.na(df[i,2]) & !is.na(df[i,3]), paste(df[i,2], df[i,3], sep="-"), ifelse(!is.na(df[i,3]), df[i,3], df[i,2]))), times=5)
回答3:
If you want complex solution; long one-liner, nested ifelse()
.
df$Var3 <- sapply(1:nrow(df), function(i) ifelse(df[i,2]!=df[i,3] & !is.na(df[i,2]) & !is.na(df[i,3]), paste(df[i,2], df[i,3], sep="-"), ifelse(!is.na(df[i,3]), df[i,3], df[i,2])))
> df
id Var1 Var2 Var3
1 A A1 A1 A1
2 B F2 A2 F2-A2
3 C <NA> A3 A3
4 D A4-E9 A4 A4-E9-A4
5 E E5 A5 E5-A5
6 F <NA> <NA> <NA>
7 G B2-R4 A3-B2 B2-R4-A3-B2
8 H B3-B4 E1-G5 B3-B4-E1-G5
In case of efficiency I made a small experiment and I measured time of each proposed solution, here are the results:
First of all I need more rows:
n <- 10000
df <- do.call("rbind", replicate(n, df, simplify = FALSE))
Akrun solution 1 with tidyverse
Time difference of 1.452809 secs
Akrun solution 2 with data.table
Time difference of 0.4530261 secs
Florian Maas solution with lapply
Time difference of 1.812106 secs
My solution with sapply
Time difference of 2.289345 mins
Sotos solution
Time difference of 1.515296 secs
来源:https://stackoverflow.com/questions/45137867/combining-columns-while-ignoring-duplicates-and-nas