问题
I have a set of measurements that are taken over a period of days. The number of measurements is typically 4. The range of numbers that can be captured in any measurement is 1-5 (in real life, depending on the test set, the upper end of the range could be as high as 100 or as low as 20).
I want to count, per day, how many of each value has happened prior to the current day.
Let me explain with some sample data:
# test data creation
# NOTE(review): rbind() applied to lists builds a matrix whose cells are
# one-element lists rather than a plain data.frame -- answer 1 below
# replaces this construct with a conventional data.table() call.
# Each d* row is one day: Date, then the two measurements V1 and V2.
d1 = list(as.Date("2013-5-4"), 4,2)
d2 = list(as.Date("2013-5-9"), 2,5)
d3 = list(as.Date("2013-5-16"), 3,2)
d4 = list(as.Date("2013-5-30"), 1,4)
d = rbind(d1,d2,d3,d4)
colnames(d) <- c("Date", "V1", "V2")
tt = as.data.table(d)
I want to run a function that will add 5 columns (1 per value possible in the range of possible values). in each of the columns I want the COUNT of the occurrences of that value prior to the test date.
For example, the output of the function for 2013-5-30 would be C1=0, C2=3, C3=1, C4=1, C5=1.
It's counting how many times:
1 appeared before and not including 5/30, which is zero
2 appeared before and not including 5/30, which is three
3 appeared before and not including 5/30, which is one
etc.
Additionally, it should also include a column for what percentage of the total measurements that number appears. For instance, on 5/30 there were 6 measurements before 5/30, so
pc1=(0/6), pc2=3/6, pc3=1/6, pc4=1/6, pc5= 1/6
I would like to use the data.table assignment notation ( := ) to add these multiple columns all in one shot. The output that I'm looking for is of the format:
Date V1 V2 C1 PC1 C2 PC2 C3 PC3 C4 PC4 C5 PC5
回答1:
1. data.table
First replace the strange construct for t in the question with a more usual one:
library(data.table)

# Sample data as a proper data.table: a Date column plus the two
# measurement columns V1 and V2, one row per day.
measurement_dates <- as.Date(c("2013-5-4", "2013-5-9", "2013-5-16", "2013-5-30"))
t <- data.table(
  Date = measurement_dates,
  V1 = c(4, 2, 3, 1),
  V2 = c(2, 5, 2, 4)
)
Now tabulate each row and use cumsum to accumulate prior rows. perm is a permutation vector used to rearrange the column numbers of the C columns (nc + 1:n) and the PC columns (nc + n + 1:n).
# nc: number of original columns; n: largest value observed in V1/V2,
# i.e. how many C/PC columns to create.
nc <- ncol(t) # 3
n <- t[, max(V1, V2)] # 5

Cnames <- paste0("C", seq_len(n))
PCnames <- paste0("PC", seq_len(n))

# perm interleaves the C column positions (nc + 1:n) with the PC column
# positions (nc + n + 1:n) so the output reads C1, PC1, C2, PC2, ...
perm <- c(seq_len(nc), rbind(nc + seq_len(n), nc + n + seq_len(n)))

# 1) tabulate each row's values into C1..Cn (one row per group),
# 2) replace each C column with the count over PRIOR rows only
#    (cumsum(x) - x excludes the current row),
# 3) divide by the number of prior measurements: (nc - 1) values per
#    prior row, so the divisor sequence is 0, nc-1, 2*(nc-1), ...
#    (row 1 divides by 0, giving NaN),
# 4) reorder the columns via perm.
# NOTE: the original used the partially matched arguments .SDcol= and
# len=; spelled out in full here (.SDcols, length.out) to avoid relying
# on R's partial argument matching.
t[, (Cnames) := as.list(tabulate(c(V1, V2), n)), by = seq_len(nrow(t))
  ][, (Cnames) := lapply(.SD, function(x) cumsum(x) - x), .SDcols = Cnames
  ][, (PCnames) := lapply(.SD, function(x) x / seq(0, length.out = .N, by = nc - 1)),
    .SDcols = Cnames
  ][, ..perm]
The last line gives:
Date V1 V2 C1 PC1 C2 PC2 C3 PC3 C4 PC4 C5 PC5
1: 2013-05-04 4 2 0 NaN 0 NaN 0 NaN 0 NaN 0 NaN
2: 2013-05-09 2 5 0 0 1 0.5 0 0.0000000 1 0.5000000 0 0.0000000
3: 2013-05-16 3 2 0 0 2 0.5 0 0.0000000 1 0.2500000 1 0.2500000
4: 2013-05-30 1 4 0 0 3 0.5 1 0.1666667 1 0.1666667 1 0.1666667
1a. data.table alternative
If it's OK to omit the row of the first date (which is not very useful since there are no dates prior to the first date), then we can perform the following tedious but straightforward self-join:
# Same sample data as above, plus a constant "one" column that acts as
# the join key for the cartesian self-join below.
sample_dates <- as.Date(c("2013-5-4", "2013-5-9", "2013-5-16", "2013-5-30"))
t <- data.table(Date = sample_dates, V1 = c(4, 2, 3, 1), V2 = c(2, 5, 2, 4))
tt <- t[, one := 1]  # := adds the column by reference, so tt is t itself
setkey(tt, one)
# Cartesian self-join on the constant key "one": every row is paired
# with every row, and the columns coming from the inner copy get the
# ".1" suffix (Date.1, V1.1, V2.1).  Keeping only pairs with
# Date > Date.1 leaves, for each date, the measurements of all strictly
# prior dates; sum/mean over .SD (= V1.1 and V2.1) then give the count
# and fraction of each value among those prior measurements.
tt[tt,,allow.cartesian=TRUE][Date > Date.1, list(
C1 = sum(.SD == 1), PC1 = mean(.SD == 1),
C2 = sum(.SD == 2), PC2 = mean(.SD == 2),
C3 = sum(.SD == 3), PC3 = mean(.SD == 3),
C4 = sum(.SD == 4), PC4 = mean(.SD == 4),
C5 = sum(.SD == 5), PC5 = mean(.SD == 5)
), by = list(Date, V1, V2), .SDcols = c("V1.1", "V2.1")]
1b. data.table alternative
or we can rewrite 1a more compactly as this (where tt, n, Cnames and PCnames are from above):
# Compact form of 1a: the two sapply calls compute, for each value
# i in 1..n, the count (sum) and fraction (mean) of i among the prior
# measurements; rbind interleaves them so setNames can label the
# resulting list C1, PC1, C2, PC2, ... in one go.
tt[tt,,allow.cartesian=TRUE][Date > Date.1, setNames(as.list(rbind(
sapply(1:n, function(i, .SD) sum(.SD==i), .SD=.SD),
sapply(1:n, function(i, .SD) mean(.SD==i), .SD=.SD)
)), c(rbind(Cnames, PCnames))),
by = list(Date, V1, V2), .SDcols = c("V1.1", "V2.1")]
2. sqldf
An alternative to data.table would be to use SQL with this similarly tedious but straight-forward self-join:
# SQL version of the same self-join.  For each pair of rows (a, b) with
# a.Date >= b.Date, the expression (b.V1 = i) + (b.V2 = i) counts how
# often value i occurs in row b (0, 1 or 2); multiplying by
# (a.Date > b.Date) zeroes out the pairing of a row with itself so only
# strictly prior dates are counted.  2 * count(*) - 2 is the number of
# prior measurements (2 per prior row), cast to real to force
# floating-point division for the PC columns.
library(sqldf)
sqldf("select a.Date, a.V1, a.V2,
sum(((b.V1 = 1) + (b.V2 = 1)) * (a.Date > b.Date)) C1,
sum(((b.V1 = 1) + (b.V2 = 1)) * (a.Date > b.Date)) /
cast (2 * count(*) - 2 as real) PC1,
sum(((b.V1 = 2) + (b.V2 = 2)) * (a.Date > b.Date)) C2,
sum(((b.V1 = 2) + (b.V2 = 2)) * (a.Date > b.Date)) /
cast (2 * count(*) - 2 as real) PC2,
sum(((b.V1 = 3) + (b.V2 = 3)) * (a.Date > b.Date)) C3,
sum(((b.V1 = 3) + (b.V2 = 3)) * (a.Date > b.Date)) /
cast (2 * count(*) - 2 as real) PC3,
sum(((b.V1 = 4) + (b.V2 = 4)) * (a.Date > b.Date)) C4,
sum(((b.V1 = 4) + (b.V2 = 4)) * (a.Date > b.Date)) /
cast (2 * count(*) - 2 as real) PC4,
sum(((b.V1 = 5) + (b.V2 = 5)) * (a.Date > b.Date)) C5,
sum(((b.V1 = 5) + (b.V2 = 5)) * (a.Date > b.Date)) /
cast (2 * count(*) - 2 as real) PC5
from t a, t b where a.Date >= b.Date
group by a.Date")
2a. sqldf alternative
An alternative would be to use string manipulation to create the above sql string like this:
# 2a: generate the repetitive SQL of section 2 programmatically.
# fn$identity (from gsubfn, which sqldf loads) performs string
# interpolation: $i substitutes the value of variable i, and a
# backquoted `...` section substitutes the result of evaluating that
# R expression.  f(i) returns the C<i>/PC<i> select-list fragment.
f <- function(i) {
s <- fn$identity("sum(((b.V1 = $i) + (b.V2 = $i)) * (a.Date > b.Date))")
fn$identity("$s C$i,\n $s /\ncast (2 * count(*) - 2 as real) PC$i")
}
# assemble the full query: one C/PC pair per possible value 1..5
s <- fn$identity("select a.Date, a.V1, a.V2, `toString(sapply(1:5, f))`
from t a, t b where a.Date >= b.Date
group by a.Date")
sqldf(s)
2b. second sqldf alternative
The sql solution can be simplified substantially if we are willing to do without an output row for the first date. This may make sense as the first date has no prior dates to tabulate:
# 2b: simplified SQL that drops the first date's row.  Because the join
# condition a.Date > b.Date already restricts b to strictly prior rows,
# no (a.Date > b.Date) masking factor is needed, and avg() directly
# gives the fraction (each prior row contributes 2 measurements, and
# (b.V1 = i) + (b.V2 = i) is the per-row count of value i).
sqldf("select a.Date, a.V1, a.V2,
sum((b.V1 = 1) + (b.V2 = 1)) C1,
avg((b.V1 = 1) + (b.V2 = 1)) PC1,
sum((b.V1 = 2) + (b.V2 = 2)) C2,
avg((b.V1 = 2) + (b.V2 = 2)) PC2,
sum((b.V1 = 3) + (b.V2 = 3)) C3,
avg((b.V1 = 3) + (b.V2 = 3)) PC3,
sum((b.V1 = 4) + (b.V2 = 4)) C4,
avg((b.V1 = 4) + (b.V2 = 4)) PC4,
sum((b.V1 = 5) + (b.V2 = 5)) C5,
avg((b.V1 = 5) + (b.V2 = 5)) PC5
from t a, t b where a.Date > b.Date
group by a.Date")
Again, it would be possible to create the SQL string to avoid repetition in the same manner as shown in the prior solution.
UPDATE: added PC columns and some simplifications
UPDATE 2: added additional solutions
回答2:
Here is a start. I don't see a reason to do this "all in one shot". It might be possible. Try yourself.
library(data.table)
# d is the list-matrix from the question's test-data section
DT = as.data.table(d)
# numeric copy of the date, used as the sort key and per-row group below
DT[,i:=as.numeric(Date)]
setkey(DT,"i")
# uv: all possible values, 1 .. max observed in the measurement columns
# NOTE(review): in modern data.table DT[,2:3] selects columns 2:3, but
# in the 2013-era version this answer targets j was evaluated as an
# expression -- confirm behavior on the installed version.
uv <- 1:max(unlist(DT[,2:3]))
# C<x> is TRUE when value x occurs in the current row's measurements
# (by=i makes each row its own group)
DT[,paste0("C",uv):=lapply(uv,function(x) x %in% unlist(.SD)),.SDcols=2:3,by=i]
# cumulative occurrence counts, shifted down one row so that each row
# only counts occurrences on strictly PRIOR dates (first row becomes NA)
DT[,paste0("C",uv):=lapply(.SD,function(x) c(NA,head(cumsum(x),-1))),.SDcols=paste0("C",uv)]
# 2*.I-2 = number of measurements before row .I (2 per prior row)
DT[,paste0("PC",uv):=lapply(.SD,function(x) x/(2*.I-2)),.SDcols=paste0("C",uv)]
# Date V1 V2 i C1 C2 C3 C4 C5 PC1 PC2 PC3 PC4 PC5
# 1: 2013-05-04 4 2 15829 NA NA NA NA NA NA NA NA NA NA
# 2: 2013-05-09 2 5 15834 0 1 0 1 0 0 0.5 0.0000000 0.5000000 0.0000000
# 3: 2013-05-16 3 2 15841 0 2 0 1 1 0 0.5 0.0000000 0.2500000 0.2500000
# 4: 2013-05-30 1 4 15855 0 3 1 1 1 0 0.5 0.1666667 0.1666667 0.1666667
回答3:
You probably want the %in% operator.
> foo<-sample(1:10,4)
> bar<-sample(1:10,3)
> foo
[1] 5 3 9 6
> bar
[1] 1 7 2
> bar2<-sample(1:10,5)
> bar2
[1] 2 9 4 8 5
> which(bar2%in%foo)
[1] 2 5  # those are the indices of the values in bar2 that appear in foo
> which(bar%in%foo)
integer(0)
来源:https://stackoverflow.com/questions/17125656/r-data-table-count-occurrences-prior-to-current-measurement