How do you go about subsetting a data.table by a numeric range, with the intention of using binary search?
For example:
require(data.table)
set.seed
As requested by Matt Dowle, I re-ran the code and timings to include a comparison to the `between` function now included in the data.table package. It seems that vector scanning a floating-point column is still the most efficient approach.
# OP's example data: two independent uniform(0, 10) columns of ten million
# values each, held both as a base data.frame and as a data.table so the same
# subset can be timed against each container.
# library() is used instead of require() so that a missing package stops the
# script here instead of failing later at the data.table() call.
library(data.table)
set.seed(1) # fixed seed so the generated columns are reproducible
n_rows <- 1e7
x <- runif(n_rows, min = 0, max = 10)
y <- runif(n_rows, min = 0, max = 10)
DF <- data.frame(x, y)
DT <- data.table(x, y)
Subset as a data.frame
# Baseline: logical-index row subset on the base data.frame
# (rows with x > 5 and y < 7).
system.time(
  DFsub <- DF[DF$x > 5 & DF$y < 7, ]
)
# user system elapsed
# 0.506 0.062 0.576
Subset as data.table with vector scanning
# Same filter expressed in data.table's `i` argument; both conditions are
# evaluated as plain vector comparisons on the columns.
system.time(
  DTsub <- DT[x > 5 & y < 7]
)
# user system elapsed
# 0.213 0.024 0.238
Subset DT with between (for both x and y)
# Range filter on both columns via data.table::between().
# NOTE: between() includes its bounds by default, so this selects
# 5 <= x <= max(x) and 0 <= y <= 7 rather than the strict x > 5 & y < 7.
# NOTE: max(x) is computed inside the timed expression, so the extra pass
# over x is part of the measurement.
system.time(
  DTsub <- DT[between(x, 5, max(x)) & between(y, 0, 7), ]
)
# user system elapsed
# 0.242 0.036 0.279
Alternative mixed vector scanning and between
# Mixed form: plain comparison for x, between() for y (bounds 0 and 7,
# inclusive by default).
system.time(
  DTsub <- DT[x > 5 & between(y, 0, 7), ]
)
# user system elapsed
# 0.203 0.017 0.221
Alternative between syntax
# Infix spelling of the two-column range filter: `%between%` takes the lower
# and upper bound as a length-2 vector on its right-hand side.
# max(x) is again evaluated inside the timed expression.
system.time(
  DTsub <- DT[x %between% c(5, max(x)) & y %between% c(0, 7)]
)
# user system elapsed
# 0.227 0.016 0.244
Mixed vector scanning and between (with alternative syntax)
# Mixed form with the infix operator: plain comparison for x,
# `%between%` with bounds c(0, 7) for y.
system.time(
  DTsub <- DT[x > 5 & y %between% c(0, 7)]
)
# user system elapsed
# 0.203 0.017 0.221
Slightly more thorough evaluation
library(microbenchmark)

# Benchmark the six subsetting strategies against each other with repeated
# evaluations (the printed summary reports neval = 100 per expression).
# Each expression stores its result in its own variable (b1..b6); the last
# one previously reused `b5`, overwriting the DTsub4 result.
# NOTE: the between()/%between% variants include their bounds and evaluate
# max(x) inside the timed expression.
mbm <- microbenchmark(
  DFsub = {
    b1 <- DF[DF$x > 5 & DF$y < 7, ]
  },
  DTsub1 = {
    b2 <- DT[x > 5 & y < 7]
  },
  DTsub2 = {
    b3 <- DT[between(x, 5, max(x)) & between(y, 0, 7), ]
  },
  DTsub3 = {
    b4 <- DT[x > 5 & between(y, 0, 7), ]
  },
  DTsub4 = {
    b5 <- DT[x %between% c(5, max(x)) & y %between% c(0, 7)]
  },
  DTsub5 = {
    b6 <- DT[x > 5 & y %between% c(0, 7)]
  }
)
mbm
# Unit: milliseconds
# expr min lq mean median uq max neval
# DFsub 527.6842 582.3235 635.8846 622.1641 664.3243 1101.2365 100
# DTsub1 220.5086 245.7509 279.5451 263.5527 296.5736 411.5833 100
# DTsub2 249.2093 283.2025 325.4845 304.2361 333.6894 660.5021 100
# DTsub3 215.5454 243.3255 281.3596 270.1108 300.8462 491.8837 100
# DTsub4 250.9431 282.1896 330.0688 305.2094 352.9604 736.2690 100
# DTsub5 218.5458 238.8931 276.7932 262.6675 293.3524 467.5082 100
# Attach ggplot2 and plot the benchmark results held in `mbm`.
# NOTE(review): presumably this relies on the autoplot method that the
# microbenchmark package provides for its result objects — confirm.
library(ggplot2)
autoplot(mbm)