I am looking for a solution to the problem below that would be supported in pipes.
I have data that looks like this:
tibble(
column_set_1_1 = c(1, 2,
Here is my solution using tidyverse tools:
library(dplyr)
library(tidyr)
library(stringr)
library(tibble)
# Return the last element of `vec` that is not NA.
#
# vec: an atomic vector, possibly containing NA.
# The NA entries are dropped first and the remainder is handed to `last()`
# (dplyr, attached at the top of the file), so the result for an all-NA input
# is whatever `last()` yields for an empty vector.
get_last_nonNA <- function(vec) {
  observed <- vec[!is.na(vec)]
  last(observed)
}
# Collapse each column set of a wide table to its last non-NA value per row.
#
# Input:  a data frame whose columns are named like
#         "column_set_<set>_<position>"; the first run of digits in a column
#         name is taken as the set id.
# Output: one column per set, named "column_set_<set>", with columns ordered
#         numerically by set id and rows kept in input order. Each cell holds
#         the last non-NA value of that set for that row (see get_last_nonNA()),
#         scanning the set's columns in their left-to-right order.
#
# Defined as a magrittr functional sequence, so it can be called directly or
# used as a pipe step.
convert_table_last_nonNA <- . %>%
  # Integer row ids instead of character row names: character ids sort as
  # "1", "10", "11", "2", ... during grouping/spreading, which scrambles the
  # row order of any table with 10 or more rows.
  rowid_to_column(var = "rowname") %>%
  # Long format: one row per (rowname, original column) pair.
  gather(key = column_type, value = value, -rowname) %>%
  # The first number in the column name identifies the set.
  mutate(column_set = str_extract(string = column_type, pattern = "[0-9]+")) %>%
  group_by(column_set, rowname) %>%
  summarise(last_nonNA_value = get_last_nonNA(value)) %>%
  # summarise() only peels off the last grouping level; drop the rest
  # explicitly so the result is a plain, ungrouped table.
  ungroup() %>%
  # Back to wide format: one column per set id.
  spread(key = column_set, value = last_nonNA_value) %>%
  select(-rowname) %>%
  # Column names are now bare set ids; order them numerically so that
  # set 10 does not land directly after set 1.
  select(order(as.integer(colnames(.)))) %>%
  setNames(paste0("column_set_", colnames(.)))
# Example: two column sets of different widths (3 and 6 columns), with
# trailing NAs marking where each row's data stops.
data_tbl <- tibble(
  column_set_1_1 = c(1, 2, 3),
  column_set_1_2 = c(2, 3, NA),
  column_set_1_3 = c(3, NA, NA),
  column_set_2_1 = c(1, 2, 3),
  column_set_2_2 = c(4, 5, 6),
  column_set_2_3 = c(7, 8, 9),
  column_set_2_4 = c(10, 11, NA),
  column_set_2_5 = c(13, NA, NA),
  column_set_2_6 = c(NA, NA, NA)
)

data_tbl %>% convert_table_last_nonNA()
# Printed result:
# # A tibble: 3 × 2
# column_set_1 column_set_2
# * <dbl> <dbl>
# 1 3 13
# 2 3 11
# 3 3 9
What it does, step by step:
1. Creates a functional sequence with convert_table_last_nonNA <- . %>%;
2. Adds the row names as a column with rownames_to_column() in order to have information for extracting the last non-NA data per row;
3. Converts the data to long format with gather(key=column_type, value=value, -rowname): the rows now represent a combination of key columns (rowname and column_type) and a value (value);
4. Extracts the column set identifier (the first number in the column_type strings) and stores it in a separate column column_set. This is done with mutate(column_set=str_extract(string=column_type, pattern="[0-9]+"));
5. Computes the last non-NA value per group with group_by(column_set, rowname) %>% summarise(last_nonNA_value=get_last_nonNA(value)). That is, "for every combination of column_set and rowname, take the last non-NA value of value (via a get_last_nonNA call) and store it in column last_nonNA_value". Note: if there are only NAs for some combination of column_set and rowname, the result will be NA;
6. Converts the data back to wide format with spread(key=column_set, value=last_nonNA_value). Now there is a column for every item in column_set and their values are the last_nonNA_values;
7. Removes the column rowname because it is not needed any more;
8. Orders the columns numerically by set (otherwise column_set_10 will be placed directly after column_set_1). This is done with select(colnames(.) %>% as.integer() %>% order());
9. Adds the prefix column_set_ to the column names with "colnames<-"(paste0("column_set_", colnames(.))).
Here is a tidyverse
approach that does not reshape the original data frame: instead, it splits the data frame into groups of columns by the column-name pattern and uses the coalesce
function to get the last non-NA value in each sub data frame:
library(tidyverse)
# Split the columns (not the rows) into one sub data frame per set, then
# reduce each sub data frame to a single vector holding, for every row, the
# value from the right-most column that is not NA.
df_foo %>%
  mutate_all(as.numeric) %>%
  # Set name = column name with its trailing "_<digits>" removed.
  split.default(f = sub("_\\d+$", "", names(.))) %>%
  map_df(function(set_cols) {
    # coalesce() keeps the first non-NA among its arguments, so the columns
    # are passed in reverse order; names are stripped so they are not taken
    # as named arguments.
    right_to_left <- setNames(rev(set_cols), NULL)
    do.call(coalesce, right_to_left)
  })
# Printed result:
# A tibble: 3 × 2
# column_set_1 column_set_2
# <dbl> <dbl>
#1 3 13
#2 3 11
#3 3 9
Here is a solution that I came up with that works with pipes:
# For every ID and column set, keep the value from the highest-numbered
# column that is not NA, then return to one column per set.
# Expects columns named "column_set_<set>_<position>" plus an ID column.
df_foo %>%
  gather(key = Key, value = Value, -ID) %>%
  mutate(
    # "[0-9]+" rather than "[0-9]": a single digit would fold
    # column_set_10_* into column_set_1.
    set = str_extract(Key, "column_set_[0-9]+"),
    # Trailing digits give the position inside the set. Stored as integer,
    # because as character "9" would rank above "13".
    number = as.integer(str_extract(Key, "[0-9]+$"))
  ) %>%
  group_by(ID, set) %>%
  dplyr::filter(!is.na(Value)) %>%
  # Pick the highest remaining position directly; no arrange() needed.
  slice(which.max(number)) %>%
  ungroup() %>%
  select(-number, -Key) %>%
  spread(key = set, value = Value)
I don't like the fact that I have to arrange and then slice out the last row -- seems inelegant to me. Any improvements welcome.