I have data like this:
library(data.table)
# Example ids, stored as character strings (three rows per id).
id <- c("1232", "1232", "1232", "4211", "4211", "4211")
# One conversion value per row, aligned with `id`.
conversion <- c(0, 0, 0, 1, 1, 1)
DT <- data.ta
Timings for reference:
library(data.table)
#data.table 1.12.3 IN DEVELOPMENT built 2019-05-12 17:04:48 UTC; root using 4 threads (see ?getDTthreads). Latest news: r-datatable.com
# Reproducible benchmark input: `nid` ids with three rows each.
set.seed(0L)
nid <- 3e6L
DT <- data.table(id = rep(seq_len(nid), each = 3L))
# Draw a single 0/1 value per id; `:=` recycles it over that id's rows.
DT[, conversion := sample(c(0L, 1L), 1L, replace = TRUE), by = id]
# One independent copy per method, since `:=` modifies a table by reference.
DT0 <- copy(DT)
DT1 <- copy(DT)
DT2 <- copy(DT)
DT3 <- copy(DT)
# Method 0: copy `conversion` into `lastconv` on the last row of each id,
# then replace the NA entries of `lastconv` with 0L.
# Updates the global table DT0 by reference.
mtd0 <- function() {
  # Row numbers (within the whole table) of the last row of every id.
  last_rows <- DT0[, .I[.N], by = id]$V1
  DT0[last_rows, lastconv := conversion]
  DT0[is.na(lastconv), lastconv := 0L]
}
# Method 1: same first step as method 0 (assign `conversion` to `lastconv`
# on the last row of each id), but fill the NAs with setnafill().
# Updates the global table DT1 by reference.
mtd1 <- function() {
  # Row numbers (within the whole table) of the last row of every id.
  last_rows <- DT1[, .I[.N], by = id]$V1
  DT1[last_rows, lastconv := conversion]
  setnafill(DT1, cols = "lastconv", fill = 0L)
}
# Method 2: initialise `v` to 0 everywhere, then join on the ids that have a
# conversion of 1 and set `v` to 1 on the last matching row of each such id.
# Updates the global table DT2 by reference.
mtd2 <- function() {
  DT2[, v := 0]
  # Distinct ids with at least one row where conversion == 1.
  converted_ids <- DT2[conversion == 1, unique(id)]
  # mult = "last" restricts the update to the last row matched per id.
  DT2[.(converted_ids), on = .(id), mult = "last", v := 1]
  #or also
  #DT2[, v := 0L][
  # DT2[,.(cv=last(conversion)), id], on=.(id), mult="last", v := cv]
}
# Method 3: within each id, set `lastconv` to 1L on a row that is both the
# group's last row and has conversion equal to 1, and to 0L otherwise.
# Updates the global table DT3 by reference.
mtd3 <- function() {
  # `.I == .I[.N]` is TRUE only for the final row of the current group.
  DT3[, lastconv := as.integer(conversion == 1 & .I == .I[.N]), by = id]
}
library(microbenchmark)
# Evaluate each method three times, matching the `neval` of 3 in the reported
# timings; a single evaluation could not yield differing min/max values.
# The methods update their tables by reference, so evaluations after the
# first run on tables that already contain the result column.
microbenchmark(mtd0(), mtd1(), mtd2(), mtd3(), times = 3L)
timings:
Unit: milliseconds
expr min lq mean median uq max neval cld
mtd0() 1363.1783 1416.1867 1468.9256 1469.1952 1521.7992 1574.4033 3 b
mtd1() 1349.5333 1365.4653 1378.9350 1381.3974 1393.6358 1405.8743 3 b
mtd2() 511.5615 515.4728 552.9133 519.3841 573.5892 627.7944 3 a
mtd3() 3966.8867 4009.1128 4048.9607 4051.3389 4089.9977 4128.6564 3 c
For each id, check if row number is the last row number in the group, and if 'conversion' is 1. Convert logical result to integer.
# Per id: `.I == .I[.N]` marks the group's last row; combined with
# conversion == 1 and converted to integer this gives 1 or 0 in `lastconv`.
DT[ , lastconv := as.integer(.I == .I[.N] & conversion == 1), by = id]
Modifying the OP's code to join on the last row of each group:
# Start with `v` set to 0 on every row.
DT[, v := 0]
# Join on the distinct ids that have conversion == 1; mult="last" limits the
# update to the last matching row of each id, which gets v = 1.
DT[.(DT[conversion == 1, unique(id)]), on=.(id), mult="last", v := 1]
id conversion v
1: 1232 0 0
2: 1232 0 0
3: 1232 0 0
4: 4211 1 0
5: 4211 1 0
6: 4211 1 1
This is only different in that it selects which ids to edit based on the desired condition.
Filter for the last row per group and set lastconv equal to conversion.
# The inner query returns the row number of the last row of each id (column V1);
# only those rows receive `lastconv`, taken from `conversion`.
DT[DT[, .I[.N], by=id]$V1, lastconv := conversion]
Then replace NAs with 0
# Every row whose `lastconv` is NA is set to integer 0, by reference.
DT[is.na(lastconv), lastconv := 0L]
Result
DT
# id conversion lastconv
#1: 1232 0 0
#2: 1232 0 0
#3: 1232 0 0
#4: 4211 1 0
#5: 4211 1 0
#6: 4211 1 1
If data.table v1.12.3 (the development version at the time of writing) or later is installed, we could also use the new function `setnafill` (released in v1.12.4) to replace the NAs in the second step:
# Assign `conversion` to `lastconv` on the last row of each id only.
DT[DT[, .I[.N], by=id]$V1, lastconv := conversion]
# Replace the NA entries of `lastconv` with 0L, modifying DT by reference.
setnafill(DT, cols = "lastconv", fill = 0L)
Have you tried something like the following?
library(tidyverse)
# Per id, flag with 1 the row that carries the latest date when its
# conversion is 1; every other row gets 0.
# NOTE(review): assumes DT has a `date` column that as.Date() can parse; it is
# not among the columns shown in the example output, so confirm against the
# real data.
final_conversion_dat <- DT %>%
  group_by(id) %>%
  mutate(
    date = as.Date(date),
    # TRUE is spelled out because `T` is an ordinary, reassignable variable.
    final_conversion = ifelse(
      date == max(date, na.rm = TRUE) & conversion == 1, 1, 0
    )
  ) %>%
  # Drop the grouping so later verbs do not silently operate per id.
  ungroup()