问题
Is there any function in C++ equivalent to the `%in%` operator in R? Consider the command below in R:
which(y %in% x)
I tried to find something equivalent in C++ (specifically in Armadillo) and I couldn't find anything. I then wrote my own function which is very slow compared to the R command above.
Here is what I wrote:
#include <RcppArmadillo.h>
// [[Rcpp::depends("RcppArmadillo")]]
// [[Rcpp::export]]
/**
 * Finds the elements of myBigVec that are equal to at least one element of
 * mySmallVec, in the spirit of R's `which(myBigVec %in% mySmallVec)`.
 *
 * @param myBigVec   values to test for membership.
 * @param mySmallVec values to look for.
 * @return ascending, duplicate-free indices into myBigVec, as produced by
 *         arma::find (Armadillo indices are zero-based, whereas R's which()
 *         is one-based). Empty when nothing matches or either input is empty.
 */
arma::uvec myInOperator(arma::vec myBigVec, arma::vec mySmallVec ){
    // One flag per element of myBigVec. Starting from all zeros means an
    // empty mySmallVec yields an empty result instead of reading element 0
    // of an empty vector.
    arma::uvec matched = arma::zeros<arma::uvec>(myBigVec.n_elem);

    for (arma::uword j = 0; j < mySmallVec.n_elem; ++j) {
        // Flag every position equal to the j-th lookup value. Setting a flag
        // twice is harmless, so no per-iteration join/unique pass is needed.
        matched.elem(arma::find(myBigVec == mySmallVec[j])).ones();
    }

    // find() reports the non-zero flags in increasing index order.
    return arma::find(matched);
}
Now after sourcing in the code above, we have:
# Lookup set: the values to search for.
x <- 1:4
# Vector whose elements are tested for membership in x.
y <- 1:10
# Time the C++ implementation against the base-R expression and order the
# rows by relative elapsed time.
# NOTE(review): benchmark() is not defined in this snippet; presumably it is
# rbenchmark::benchmark and the package is already attached -- confirm.
res <- benchmark(myInOperator(y, x), which(y %in% x), columns = c("test",
"replications", "elapsed", "relative", "user.self", "sys.self"),
order = "relative")
And here are the results:
test replications elapsed relative user.self sys.self
2 which(y %in% x) 100 0.001 1 0.001 0
1 myInOperator(y, x) 100 0.002 2 0.001 0
Could anyone guide me either on finding a C++ code corresponding to which(y %in% x) or on making my code more efficient? The elapsed time is already very small for both functions. I guess what I meant by efficiency is more from programming perspective and on whether the way I thought about the problem and the commands I used are efficient.
I appreciate your help.
回答1:
EDIT: Thanks to @MatthewLundberg and @Yakk for catching my silly errors.
If what you really want is just faster matching, you should check out Simon Urbanek's `fastmatch` package. However, `Rcpp` does in fact have a sugar `in` function which can be used here. `in` uses some of the ideas from the `fastmatch` package and incorporates them into `Rcpp`. I also compare @hadley's solution here.
// [[Rcpp::plugins("cpp11")]]
#include <Rcpp.h>
using namespace Rcpp;
// [[Rcpp::export]]
/**
 * 1-based positions of the elements of x for which Rcpp's sugar `in(x, y)`
 * reports a true value.
 *
 * @param x values to test.
 * @param y table of values passed as the second argument of `in`.
 * @return positions (i + 1 for each flagged index i) in increasing order.
 */
std::vector<int> sugar_in(IntegerVector x, IntegerVector y) {
    LogicalVector is_member = in(x, y);
    int count = is_member.size();

    std::vector<int> positions;
    positions.reserve(count);  // upper bound: every element flagged

    int idx = 0;
    while (idx < count) {
        if (is_member[idx]) {
            positions.push_back(idx + 1);
        }
        ++idx;
    }
    return positions;
}
// [[Rcpp::export]]
/**
 * 1-based positions of the elements of x that occur in y, using a hash set
 * built from y for the membership checks.
 *
 * @param x values to test.
 * @param y table of values to look for.
 * @return positions (i + 1 for each matching index i) in increasing order.
 */
std::vector<int> which_in(IntegerVector x, IntegerVector y) {
    std::unordered_set<int> lookup(y.begin(), y.end());

    int len = x.size();
    std::vector<int> hits;
    hits.reserve(len);  // upper bound: every element matches

    for (int idx = 0; idx < len; ++idx) {
        if (lookup.count(x[idx]) == 0) {
            continue;  // not in the table
        }
        hits.push_back(idx + 1);
    }
    return hits;
}
// [[Rcpp::export]]
/**
 * 1-based positions of the elements of x that occur in y, using a binary
 * search over a sorted copy of y.
 *
 * @param x values to test.
 * @param y table of values to look for.
 * @return positions (i + 1 for each matching index i) in increasing order.
 */
std::vector<int> which_in2(IntegerVector x, IntegerVector y) {
    // Destination is as long as the source, so this produces a fully sorted
    // copy of y.
    std::vector<int> y_sort(y.size());
    std::partial_sort_copy(y.begin(), y.end(), y_sort.begin(), y_sort.end());

    int nx = x.size();
    std::vector<int> out;
    for (int i = 0; i < nx; ++i) {
        const int value = x[i];
        std::vector<int>::iterator found =
            std::lower_bound(y_sort.begin(), y_sort.end(), value);
        // lower_bound returns the first element that is NOT LESS than value,
        // so a non-end iterator alone does not prove membership; the element
        // it points at must also equal the value.
        if (found != y_sort.end() && *found == value) {
            out.push_back(i + 1);
        }
    }
    return out;
}
/*** R
# Fix the RNG seed so the sampled inputs are reproducible.
set.seed(123)
library(microbenchmark)
# Lookup table: a permutation of 1..100.
x <- sample(1:100)
# Query vector: 1000 distinct values drawn from 1..10000.
y <- sample(1:10000, 1000)
# Compare each C++ variant against base R on this data.
# Because x holds every integer in 1..100, any query value <= 100 is present
# in the table; these checks never exercise a query that is absent from the
# table yet smaller than the table's maximum.
identical( sugar_in(y, x), which(y %in% x) )
identical( which_in(y, x), which(y %in% x) )
identical( which_in2(y, x), which(y %in% x) )
# Time the three C++ variants and the base-R expression.
microbenchmark(
sugar_in(y, x),
which_in(y, x),
which_in2(y, x),
which(y %in% x)
)
*/
Calling `sourceCpp` on this gives me, from the benchmark,
Unit: microseconds
expr min lq median uq max neval
sugar_in(y, x) 7.590 10.0795 11.4825 14.3630 32.753 100
which_in(y, x) 40.757 42.4460 43.4400 46.8240 63.690 100
which_in2(y, x) 14.325 15.2365 16.7005 17.2620 30.580 100
which(y %in% x) 17.070 21.6145 23.7070 29.0105 78.009 100
回答2:
For this set of inputs we can eke out a little more performance by using an approach that technically has a higher algorithmic complexity (O(log n) vs. O(1) for each lookup) but lower constant factors: a binary search.
// [[Rcpp::plugins("cpp11")]]
#include <Rcpp.h>
using namespace Rcpp;
// [[Rcpp::export]]
/**
 * 1-based positions of the elements of x that occur in y; membership is
 * checked against a hash set built from y.
 *
 * @param x values to test.
 * @param y table of values to look for.
 * @return positions (i + 1 for each matching index i) in increasing order.
 */
std::vector<int> which_in(IntegerVector x, IntegerVector y) {
    std::unordered_set<int> table(y.begin(), y.end());

    int total = x.size();
    std::vector<int> matches;
    matches.reserve(total);  // upper bound: every element matches

    int pos = 0;
    while (pos < total) {
        bool present = table.count(x[pos]) > 0;
        if (present) {
            matches.push_back(pos + 1);
        }
        ++pos;
    }
    return matches;
}
// [[Rcpp::export]]
/**
 * 1-based positions of the elements of x that occur in y, found by binary
 * search over a sorted copy of y.
 *
 * @param x values to test.
 * @param y table of values to look for.
 * @return positions (i + 1 for each matching index i) in increasing order.
 */
std::vector<int> which_in2(IntegerVector x, IntegerVector y) {
    // Destination is as long as the source, so this produces a fully sorted
    // copy of y.
    std::vector<int> y_sort(y.size());
    std::partial_sort_copy(y.begin(), y.end(), y_sort.begin(), y_sort.end());

    int nx = x.size();
    std::vector<int> out;
    for (int i = 0; i < nx; ++i) {
        const int value = x[i];
        std::vector<int>::iterator found =
            std::lower_bound(y_sort.begin(), y_sort.end(), value);
        // lower_bound returns the first element that is NOT LESS than value,
        // so a non-end iterator alone does not prove membership; the element
        // it points at must also equal the value.
        if (found != y_sort.end() && *found == value) {
            out.push_back(i + 1);
        }
    }
    return out;
}
/*** R
# Fix the RNG seed so the sampled inputs are reproducible.
set.seed(123)
library(microbenchmark)
# Lookup table: a permutation of 1..100.
x <- sample(1:100)
# Query vector: 1000 distinct values drawn from 1..10000.
y <- sample(1:10000, 1000)
# Compare each C++ variant against base R on this data.
# Because x holds every integer in 1..100, any query value <= 100 is present
# in the table; these checks never exercise a query that is absent from the
# table yet smaller than the table's maximum.
identical( which_in(y, x), which(y %in% x) )
identical( which_in2(y, x), which(y %in% x) )
# Time both C++ variants and the base-R expression.
microbenchmark(
which_in(y, x),
which_in2(y, x),
which(y %in% x)
)
*/
On my computer that yields
Unit: microseconds
expr min lq median uq max neval
which_in(y, x) 39.3 41.0 42.7 44.0 81.5 100
which_in2(y, x) 12.8 13.6 14.4 15.0 23.8 100
which(y %in% x) 16.8 20.2 21.0 21.9 31.1 100
so about 30% better than base R.
来源:https://stackoverflow.com/questions/21359432/a-c-version-of-the-in-operator-in-r