expand a data frame to have as many rows as range of two columns in original row [duplicate]

问题

I have a data frame as follows:

structure(list(symbol = c("u", "n", "v", "i", "a"), start = c(9L,
6L, 10L, 8L, 7L), end = c(14L, 15L, 12L, 13L, 11L)), .Names = c("symbol",
"start", "end"), class = "data.frame", row.names = c("1", "2",
"3", "4", "5"))

I want to as many rows as there are values in the range of (start, end) for each symbol. So, the final data frame will look like:

structure(list(symbol = structure(c(1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L,
4L, 4L, 5L, 5L, 5L, 5L, 5L), .Label = c("a", "l", "n", "v", "y"
), class = "factor"), value = c(7L, 8L, 9L, 10L, 11L, 6L, 7L,
8L, 9L, 10L, 11L, 12L, 13L, 14L, 8L, 9L, 10L, 11L, 12L, 10L,
11L, 12L, 13L, 14L, 15L, 9L, 10L, 11L, 12L, 13L)), class = "data.frame", row.names = c(NA,
-30L), .Names = c("symbol", "value"))

I was thinking I could simply have a list of values per row, and then use tidyr package's unnest as follows:

df$value <- apply(df, 1, function(x) as.list(x[2]:x[3]))
dput(df)
structure(list(symbol = structure(c(4L, 3L, 5L, 2L, 1L), .Label = c("a",
"i", "n", "u", "v"), class = "factor"), start = c(9L, 6L, 10L,
8L, 7L), end = c(14L, 15L, 12L, 13L, 11L), value = structure(list(
    `1` = list(9L, 10L, 11L, 12L, 13L, 14L), `2` = list(6L, 7L,
        8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L), `3` = list(10L,
        11L, 12L), `4` = list(8L, 9L, 10L, 11L, 12L, 13L), `5` = list(
        7L, 8L, 9L, 10L, 11L)), .Names = c("1", "2", "3", "4",
"5"))), .Names = c("symbol", "start", "end", "value"), row.names = c("1",
"2", "3", "4", "5"), class = "data.frame")

df
  symbol start end                              value
1      u     9  14              9, 10, 11, 12, 13, 14
2      n     6  15 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
3      v    10  12                         10, 11, 12
4      i     8  13               8, 9, 10, 11, 12, 13
5      a     7  11                    7, 8, 9, 10, 11

Then do:

library(tidyr)
unnest(df, value)

However, I think I am hitting this pending feature/bug: https://github.com/tidyverse/tidyr/issues/278

Error: Each column must either be a list of vectors or a list of data frames [value]

Is there a better way to do this, especially avoiding apply family?

回答1:

With dplyr, we can use rowwise with do

library(dplyr)
df1 %>% 
   rowwise() %>% 
   do(data.frame(symbol= .$symbol, value = .$start:.$end)) %>% 
   arrange(symbol)
# A tibble: 30 x 2
#   symbol value
#    <chr> <int>
# 1      a     7
# 2      a     8
# 3      a     9
# 4      a    10
# 5      a    11
# 6      i     8
# 7      i     9
# 8      i    10
# 9      i    11
#10      i    12
# ... with 20 more rows

回答2:

You could use data.table and replicate the df by the required number of rows (based on the start and end for each symbol), then assign the value to each row after

library(data.table)

setDT(df)
df[rep(1:.N, (end - start + 1))][, value := (start - 1) + (1:.N), by = symbol][]

#    symbol start end value
# 1:      u     9  14     9
# 2:      u     9  14    10
# 3:      u     9  14    11
# 4:      u     9  14    12
# 5:      u     9  14    13
# ... etc

回答3:

Perhaps you could use map2 to add a column from which we can unnest into the desired result.

library(tidyverse)
df %>% 
  mutate(value = map2(start, end, ~ seq(from = .x, to = .y))) %>%
  select(symbol, value) %>%
  unnest()
#>    symbol    value
#> 1       u        9
#> 2       u       10
#> 3       u       11
#> 4       u       12
#> 5       u       13
#> 6       u       14
#> 7       n        6
#> 8       n        7
#> 9       n        8
#> 10      n        9
#> ...etc

来源：https://stackoverflow.com/questions/46841463/expand-a-data-frame-to-have-as-many-rows-as-range-of-two-columns-in-original-row

标签

tidyr