问题
id random count
a 0 -1
a 1 1
a 1 2
a 0 -1
a 0 -2
a 1 1
a 0 -1
a 1 1
a 0 -1
b 0 -1
b 0 -2
b 1 1
b 0 -1
b 1 1
b 0 -1
b 0 -2
b 0 -3
`id` is a player and `random` is binary (0 or 1). I want to create a `count` column that counts the sequences of 1's and 0's by player, preferably without loops since the database is very big.
回答1:
Here's a `dplyr` solution:
# Signed streak counter per player: rows in a run of 1s count 1, 2, ... and
# rows in a run of 0s count -1, -2, ...
dat %>%
  # idx is a run id: it grows by one each time `random` differs from the
  # previous row.
  transform(idx = c(0, cumsum(tail(random, -1L) != head(random, -1L)))) %>%
  group_by(id, idx) %>%
  # Within a run only one of the two cumulative sums is non-zero.
  mutate(count = cumsum(random == 1) - 1 * cumsum(random == 0)) %>%
  ungroup() %>%
  select(-idx)
Source: local data frame [17 x 3]
id random count
1 a 0 -1
2 a 1 1
3 a 1 2
4 a 0 -1
5 a 0 -2
6 a 1 1
7 a 0 -1
8 a 1 1
9 a 0 -1
10 b 0 -1
11 b 0 -2
12 b 1 1
13 b 0 -1
14 b 1 1
15 b 0 -1
16 b 0 -2
17 b 0 -3
回答2:
I think this is what you're looking for:
library(data.table)
# Convert DF to a data.table in place, then number the rows within each run of
# identical `random` values, separately for every player.
setDT(DF)
DF[, count := seq_len(.N), by = .(id, rleid(random))]
which gives
id random count
1: a 0 1
2: a 1 1
3: a 1 2
4: a 0 1
5: a 0 2
6: a 1 1
7: a 0 1
8: a 1 1
9: a 0 1
10: b 0 1
11: b 0 2
12: b 1 1
13: b 0 1
14: b 1 1
15: b 0 1
16: b 0 2
17: b 0 3
(In the next version of the data.table package, 1.9.8, there will be a small shortcut: `setDT(DF)[, count := rowid(rleid(random)), by=id]`. I am making this note so I can update the answer later.)
You may also want identifiers for groups of runs:
# Run identifier: consecutive equal `random` values share one id, restarting
# for each player.
DF[, rid := rleid(random), by = "id"]
which gives
id random count rid
1: a 0 1 1
2: a 1 1 2
3: a 1 2 2
4: a 0 1 3
5: a 0 2 3
6: a 1 1 4
7: a 0 1 5
8: a 1 1 6
9: a 0 1 7
10: b 0 1 1
11: b 0 2 1
12: b 1 1 2
13: b 0 1 3
14: b 1 1 4
15: b 0 1 5
16: b 0 2 5
17: b 0 3 5
If you read through the introductory materials on the package, you'll see that these variables can also be created in a single step.
回答3:
I think the easiest way to achieve this is the `streak_run` function from the `runner` package. `streak_run` is also the fastest, as shown in the benchmarks section below.
Solution
library(runner)
# Example data: ten rows with a random 0/1 column.
df <- data.frame(id = 1:10, random = sample(c(0, 1), 10, replace = TRUE))
# Length of the current run of identical values at each row. Only `random` is
# passed in, so `id` plays no part in where a streak starts in this example.
df$count <- streak_run(df$random)
# Flip the sign for rows that sit in a run of 0s.
df$count[df$random == 0] <- -df$count[df$random == 0]
df
# id random count
#1 1 0 -1
#2 2 0 -2
#3 3 1 1
#4 4 1 2
#5 5 1 3
#6 6 1 4
#7 7 0 -1
#8 8 0 -2
#9 9 0 -3
#10 10 0 -4
Benchmarks
# Benchmark wrapper for the runner approach.
#
# df: data frame with a `random` column.
# Returns `df` with a `count` column holding the streak length at each row,
# negated on rows where `random == 0`.
runner_example <- function(df) {
  streak <- streak_run(df$random)
  zero_rows <- df$random == 0
  streak[zero_rows] <- -streak[zero_rows]
  df$count <- streak
  df
}
# Benchmark wrapper for the dplyr approach.
#
# df: data frame with columns `id` and `random` (compared against 0 and 1).
# Returns `df` with a `count` column: within each (id, run) group, rows in a
# run of 1s count 1, 2, ... and rows in a run of 0s count -1, -2, ...
dplyr_example <- function(df) {
  # The pipeline is the last expression, so its value (not the untouched
  # input `df`) is what the function returns.
  df %>%
    # idx grows by one each time `random` differs from the previous row.
    transform(idx = c(0, cumsum(random[-1L] != random[-length(random)]))) %>%
    group_by(id, idx) %>%
    mutate(count = -1 * cumsum(random == 0) + cumsum(random == 1)) %>%
    ungroup() %>%
    select(-idx)
}
# Benchmark wrapper for the data.table approach.
#
# df: data frame with columns `id` and `random`.
# Returns `df` as a data.table with a `count` column giving each row's
# 1-based position within its (id, run of `random`) group.
dt_example <- function(df) {
  setDT(df)
  df[, count := seq_len(.N), by = .(id, rleid(random))]
  df
}
library(dplyr);library(data.table)
library(microbenchmark); library(magrittr)
# Benchmark input: 2000 rows with a random 0/1 column, i.e. the values the
# three functions compare against (`random == 0` / `random == 1`).
df <- data.frame(
  id = 1:2000L,
  random = sample(c(0, 1), 2000L, replace = TRUE)
)
# Time each approach on the same input, 100 evaluations per expression.
microbenchmark(
  dplyr = dplyr_example(df),
  dt = dt_example(df),
  runner = runner_example(df),
  times = 100
)
#Unit: microseconds
# expr min lq mean median uq max neval
# dplyr 134388.839 164274.611 204478.048 188548.4975 222777.298 526019.563 100
# dt 1306.139 1710.665 2181.989 1941.3420 2380.953 5581.682 100
# runner 284.522 741.145 1022.456 853.5715 1004.553 7398.019 100
来源:https://stackoverflow.com/questions/32788497/counts-sequences-in-r