(Update) Add index column to data.frame based on two columns

放肆的年华 提交于 2019-12-02 04:39:30

We can use `rleid` (run-length id) from the data.table package

library(data.table)

# Convert df to a data.table by reference so the `[i, j, by]` syntax is usable.
setDT(df)

# Each run of consecutive equal colB values is cut into chunks of colB[1L]
# rows: the first colB[1L] rows of the run form chunk 1, the next colB[1L]
# rows chunk 2, and so on. A run no longer than colB[1L] is a single chunk.
run_chunks <- df[, .(chunk = (seq_len(.N) - 1L) %/% colB[1L] + 1L),
                 by = .(run_id = rleid(colB))]

# Number the (run, chunk) pairs in order of appearance. Keying on run_id as
# well as chunk keeps the last chunk of one run from being merged with the
# first chunk of the next run when their chunk values happen to coincide.
index_col <- run_chunks[, rleid(run_id, chunk)]
df[, index_col := index_col]
df
#    colA colB index_col
# 1:    2    7         1
# 2:    2    7         1
# 3:    2    7         1
# 4:    2    7         1
# 5:    1    7         1
# 6:    1    7         1
# 7:    1    7         1
# 8:   70    5         2
# 9:   70    5         2
#10:   70    5         2
#11:   69    5         2
#12:   69    5         2
#13:   89    5         3
#14:   89    5         3
#15:   89    5         3
#16:   88    5         3
#17:   88    5         3
#18:  120    1         4
#19:  100    1         5

Or a one-liner would be

# Single-expression variant: chunk every run of equal colB values into blocks
# of colB[1L] rows, then turn each (run, chunk) combination into an integer
# code via interaction(); lex.order = TRUE orders the codes by run first.
setDT(df)[, index_col := df[
  , (seq_len(.N) - 1) %/% colB[1L] + 1,
  by = rleid(colB)
][, as.integer(interaction(.SD, drop = TRUE, lex.order = TRUE))]]

Update

Based on the new update in the OP's post

# Convert by reference, then add the columns one step at a time.
setDT(new_df)
# A new index starts at the first row and wherever colA moves by more than 1
# relative to the previous row.
new_df[, index_col := cumsum(c(TRUE, abs(diff(colA)) > 1))]
# colB is overwritten with the number of rows sharing each index.
new_df[, colB := .N, by = index_col]
new_df
#    colA colB index_col
# 1:    3   10         1
# 2:    3   10         1
# 3:    3   10         1
# 4:    2   10         1
# 5:    2   10         1
# 6:    2   10         1
# 7:    2   10         1
# 8:    1   10         1
# 9:    1   10         1
#10:    1   10         1
#11:   71    7         2
#12:   71    7         2
#13:   70    7         2
#14:   70    7         2
#15:   70    7         2
#16:   69    7         2
#17:   69    7         2
#18:   90    7         3
#19:   90    7         3
#20:   89    7         3
#21:   89    7         3
#22:   89    7         3
#23:   88    7         3
#24:   88    7         3
#25:   44    2         4
#26:   43    2         4
#27:  120    1         5
#28:  100    1         6

An approach in base R:

# Start a new index at the first row, wherever colA moves by more than 1, or
# wherever colB differs from the previous row; the running count of those
# break points is the index.
df$idxcol <- with(df, {
  jump_in_a <- c(1, abs(diff(colA)) > 1)
  change_in_b <- c(0, diff(colB) != 0)
  cumsum(jump_in_a + change_in_b > 0)
})

which gives:

> df
   colA colB idxcol
1     2    7      1
2     2    7      1
3     2    7      1
4     2    7      1
5     1    7      1
6     1    7      1
7     1    7      1
8    70    5      2
9    70    5      2
10   70    5      2
11   69    5      2
12   69    5      2
13   89    5      3
14   89    5      3
15   89    5      3
16   88    5      3
17   88    5      3
18  120    1      4
19  100    1      5

On the updated example data, you need to adapt the approach to:

# Threshold: a fall in colA of more than n counts as a break.
n <- 1
# Base index: a new group starts at the first row, wherever colA falls by more
# than n from the previous row, or wherever colB differs from the previous row.
idx1 <- cumsum(c(1, diff(df$colA) < -n) + c(0, diff(df$colB) != 0) > 0)
# Within each stretch delimited by those colA falls, flag (1) every row whose
# cumulative change in colA since the stretch's first row is below -n.
idx2 <- ave(df$colA, cumsum(c(1, diff(df$colA) < -n)), FUN = function(x) c(0, cumsum(diff(x)) < -n ))
# Keep only the first flagged row of each run of 1s (a 1 preceded by a 1 is
# reset to 0), so each run raises the index exactly once.
idx2[idx2==1 & c(0,diff(idx2))==0] <- 0

# Final index: base index plus the number of within-stretch breaks so far.
df$idxcol <- idx1 + cumsum(idx2)

which gives:

> df
   colA colB idxcol
1     2    7      1
2     2    7      1
3     2    7      1
4     2    7      1
5     1    7      1
6     1    7      1
7     1    7      1
8    89    5      2
9    89    5      2
10   89    5      2
11   88    5      2
12   88    5      2
13   70    5      3
14   70    5      3
15   70    5      3
16   69    5      3
17   69    5      3
18   44    4      4
19   44    4      4
20   44    4      4
21   43    4      4
22   42    4      5
23   42    4      5
24   41    4      5
25   41    4      5
26  120    1      6
27  100    1      7

For new_df, just change n to 2 and you will get the desired output for that data as well.

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!