I am trying to use valarray since it is much like MATLAB while operating vector and matrices. I first did some performance check and found that valarray cannot achieve the performance declared as in the book C++ programming language by Stroustrup.
The test program actually did 5 million multiplication of doubles. I thought that c = a*b would at least be comparable to the for
loop double type element multiplication, but I am totally wrong. I tried on several computers and Microsoft Visual C++ 6.0 and Visual Studio 2008.
By the way, I tested on MATLAB using the following code:
len = 5*1024*1024;
a = rand(len, 1);
b = rand(len, 1);
c = zeros(len, 1);
tic;
c = a.*b;
toc;
And the result is 46 ms. This time is not high precision; it only works as a reference.
The code is:
#include <iostream>
#include <valarray>
#include <iostream>
#include "windows.h"
using namespace std;
SYSTEMTIME stime;
LARGE_INTEGER sys_freq;
double gettime_hp();
int main()
{
enum { N = 5*1024*1024 };
valarray<double> a(N), b(N), c(N);
QueryPerformanceFrequency(&sys_freq);
int i, j;
for (j=0 ; j<8 ; ++j)
{
for (i=0 ; i<N ; ++i)
{
a[i] = rand();
b[i] = rand();
}
double* a1 = &a[0], *b1 = &b[0], *c1 = &c[0];
double dtime = gettime_hp();
for (i=0 ; i<N ; ++i)
c1[i] = a1[i] * b1[i];
dtime = gettime_hp()-dtime;
cout << "double operator* " << dtime << " ms\n";
dtime = gettime_hp();
c = a*b ;
dtime = gettime_hp() - dtime;
cout << "valarray operator* " << dtime << " ms\n";
dtime = gettime_hp();
for (i=0 ; i<N ; ++i)
c[i] = a[i] * b[i];
dtime = gettime_hp() - dtime;
cout << "valarray[i] operator* " << dtime<< " ms\n";
cout << "------------------------------------------------------\n";
}
}
double gettime_hp()
{
LARGE_INTEGER tick;
extern LARGE_INTEGER sys_freq;
QueryPerformanceCounter(&tick);
return (double)tick.QuadPart * 1000.0 / sys_freq.QuadPart;
}
The running results: (release mode with maximal speed optimization)
double operator* 52.3019 ms
valarray operator* 128.338 ms
valarray[i] operator* 43.1801 ms
------------------------------------------------------
double operator* 43.4036 ms
valarray operator* 145.533 ms
valarray[i] operator* 44.9121 ms
------------------------------------------------------
double operator* 43.2619 ms
valarray operator* 158.681 ms
valarray[i] operator* 43.4871 ms
------------------------------------------------------
double operator* 42.7317 ms
valarray operator* 173.164 ms
valarray[i] operator* 80.1004 ms
------------------------------------------------------
double operator* 43.2236 ms
valarray operator* 158.004 ms
valarray[i] operator* 44.3813 ms
------------------------------------------------------
Debugging mode with same optimization:
double operator* 41.8123 ms
valarray operator* 201.484 ms
valarray[i] operator* 41.5452 ms
------------------------------------------------------
double operator* 40.2238 ms
valarray operator* 215.351 ms
valarray[i] operator* 40.2076 ms
------------------------------------------------------
double operator* 40.5859 ms
valarray operator* 232.007 ms
valarray[i] operator* 40.8803 ms
------------------------------------------------------
double operator* 40.9734 ms
valarray operator* 234.325 ms
valarray[i] operator* 40.9711 ms
------------------------------------------------------
double operator* 41.1977 ms
valarray operator* 234.409 ms
valarray[i] operator* 41.1429 ms
------------------------------------------------------
double operator* 39.7754 ms
valarray operator* 234.26 ms
valarray[i] operator* 39.6338 ms
------------------------------------------------------
I suspect that the reason c = a*b
is so much slower than performing the operations an element at a time is that the
template<class T> valarray<T> operator*
(const valarray<T>&, const valarray<T>&);
operator must allocate memory to put the result into, then returns that by value.
Even if a "swaptimization" is used to perform the copy, that function still has the overhead of
- allocating the new block for the resulting
valarray
- initializing the new
valarray
(it's possible that this might be optimized away) - putting the results into the new
valarray
- paging in the memory for the new
valarray
as it is initialized or set with result values - deallocating the old
valarray
that gets replaced by the result
I just tried it on a Linux x86-64 system (Sandy Bridge CPU):
gcc 4.5.0:
double operator* 9.64185 ms
valarray operator* 9.36987 ms
valarray[i] operator* 9.35815 ms
Intel ICC 12.0.2:
double operator* 7.76757 ms
valarray operator* 9.60208 ms
valarray[i] operator* 7.51409 ms
In both cases I just used -O3
and no other optimisation-related flags.
It looks like the MS C++ compiler and/or valarray implementation suck.
Here's the OP's code modified for Linux:
#include <iostream>
#include <valarray>
#include <iostream>
#include <ctime>
using namespace std ;
double gettime_hp();
int main()
{
enum { N = 5*1024*1024 };
valarray<double> a(N), b(N), c(N) ;
int i,j;
for( j=0 ; j<8 ; ++j )
{
for( i=0 ; i<N ; ++i )
{
a[i]=rand();
b[i]=rand();
}
double* a1 = &a[0], *b1 = &b[0], *c1 = &c[0] ;
double dtime=gettime_hp();
for( i=0 ; i<N ; ++i ) c1[i] = a1[i] * b1[i] ;
dtime=gettime_hp()-dtime;
cout << "double operator* " << dtime << " ms\n" ;
dtime=gettime_hp();
c = a*b ;
dtime=gettime_hp()-dtime;
cout << "valarray operator* " << dtime << " ms\n" ;
dtime=gettime_hp();
for( i=0 ; i<N ; ++i ) c[i] = a[i] * b[i] ;
dtime=gettime_hp()-dtime;
cout << "valarray[i] operator* " << dtime<< " ms\n" ;
cout << "------------------------------------------------------\n" ;
}
}
double gettime_hp()
{
struct timespec timestamp;
clock_gettime(CLOCK_REALTIME, ×tamp);
return timestamp.tv_sec * 1000.0 + timestamp.tv_nsec * 1.0e-6;
}
The whole point of valarray is to be fast on vector machines, which x86 machines just aren't.
A good implementation on a nonvector machine should be able to match the performance that you get with something like
for (i=0; i < N; ++i)
c1[i] = a1[i] * b1[i];
and a bad one of course won't. Unless there is something in the hardware to expedite parallel processing, that is going to be pretty close to the best that you can do.
I finally got this through using delayed evaluation. The code may be ugly since I am just starting learning these C++ advanced concepts.
Here is the code:
#include <iostream>
#include <valarray>
#include <iostream>
#include "windows.h"
using namespace std;
SYSTEMTIME stime;
LARGE_INTEGER sys_freq;
double gettime_hp();
// To improve the c = a*b (it will generate a temporary first, assigned to 'c' and delete the temporary.
// Which causes the program really slow
// The solution is the expression template and let the compiler to decide when all the expression is known.
// Delayed evaluation
//typedef valarray<double> Vector;
class Vector;
class VecMul
{
public:
const Vector& va;
const Vector& vb;
//Vector& vc;
VecMul(const Vector& v1, const Vector& v2): va(v1), vb(v2) {}
operator Vector();
};
class Vector:public valarray<double>
{
valarray<double> *p;
public:
explicit Vector(int n)
{
p = new valarray<double>(n);
}
Vector& operator = (const VecMul &m)
{
for(int i=0; i<m.va.size(); i++)
(*p)[i] = (m.va)[i]*(m.vb)[i]; // Ambiguous
return *this;
}
double& operator[](int i) const {return (*p)[i];} //const vector_type[i]
int size()const {return (*p).size();}
};
inline VecMul operator*(const Vector& v1, const Vector& v2)
{
return VecMul(v1, v2);
}
int main()
{
enum {N = 5*1024*1024};
Vector a(N), b(N), c(N);
QueryPerformanceFrequency(&sys_freq);
int i, j;
for (j=0 ; j<8 ; ++j)
{
for (i=0 ; i<N ; ++i)
{
a[i] = rand();
b[i] = rand();
}
double* a1 = &a[0], *b1 = &b[0], *c1 = &c[0];
double dtime = gettime_hp();
for (i=0 ; i<N ; ++i)
c1[i] = a1[i] * b1[i];
dtime = gettime_hp()-dtime;
cout << "double operator* " << dtime << " ms\n";
dtime = gettime_hp();
c = a*b;
dtime = gettime_hp()-dtime;
cout << "valarray operator* " << dtime << " ms\n";
dtime = gettime_hp();
for (i=0 ; i<N ; ++i)
c[i] = a[i] * b[i];
dtime = gettime_hp() - dtime;
cout << "valarray[i] operator* " << dtime << " ms\n";
cout << "------------------------------------------------------\n";
}
}
double gettime_hp()
{
LARGE_INTEGER tick;
extern LARGE_INTEGER sys_freq;
QueryPerformanceCounter(&tick);
return (double)tick.QuadPart*1000.0/sys_freq.QuadPart;
}
The running result on Visual studio is:
double operator* 41.2031 ms
valarray operator* 43.8407 ms
valarray[i] operator* 42.49 ms
I'm compiling in release x64, Visual Studio 2010. I changed your code very slightly:
double* a1 = &a[0], *b1 = &b[0], *c1 = &c[0];
double dtime = gettime_hp();
for (i=0 ; i<N ; ++i)
a1[i] *= b1[i];
dtime = gettime_hp() - dtime;
cout << "double operator* " << dtime << " ms\n";
dtime = gettime_hp();
a *= b;
dtime = gettime_hp() - dtime;
cout << "valarray operator* " << dtime << " ms\n";
dtime = gettime_hp();
for (i=0 ; i<N ; ++i)
a[i] *= b[i];
dtime = gettime_hp() - dtime;
cout << "valarray[i] operator* " << dtime<< " ms\n";
cout << "------------------------------------------------------\n" ;
Here you can see that I used *= instead of c = a * b
. In more modern mathematical libraries, very complex expression template mechanisms are used that eliminate this problem. In this case, I actually got very slightly faster results from valarray, although that's probably just because the contents were already in a cache. The overhead that you are seeing is simply redundant temporaries and nothing intrinsic to valarray, specifically- you'd see the same behaviour with something like std::string
.
I think Michael Burr's reply is right. And maybe you can create a virtual type as the type the return value of operator +
, and reload another operator=
for this virtual type like operator=(virtual type& v){&valarray=&v;v=NULL;}
(roughly speaking).
Of course, it is difficult to implement the idea on valarray. But when you create a new class, you can try this idea. And then, the efficiency for operator+
is almost the same as operator+=
.
Hmm..I tested Blitz++ and it's same as valarray... And moreover, the Blitz++ []
operator is very slow.
#include <blitz/array.h>
#include <iostream>
#ifdef WIN32
#include "windows.h"
LARGE_INTEGER sys_freq;
#endif
#ifdef LINUX
<ctime>
#endif
using namespace std;
SYSTEMTIME stime;
__forceinline double gettime_hp();
double gettime_hp()
{
#ifdef WIN32
LARGE_INTEGER tick;
extern LARGE_INTEGER sys_freq;
QueryPerformanceCounter(&tick);
return (double)tick.QuadPart * 1000.0 / sys_freq.QuadPart;
#endif
#ifdef LINUX
struct timespec timestamp;
clock_gettime(CLOCK_REALTIME, ×tamp);
return timestamp.tv_sec * 1000.0 + timestamp.tv_nsec * 1.0e-6;
#endif
}
BZ_USING_NAMESPACE(blitz)
int main()
{
int N = 5*1024*1024;
// Create three-dimensional arrays of double
Array<double, 1> a(N), b(N), c(N);
int i, j;
#ifdef WIN32
QueryPerformanceFrequency(&sys_freq);
#endif
for (j=0 ; j<8 ; ++j)
{
for (i=0 ; i<N ; ++i)
{
a[i] = rand();
b[i] = rand();
}
double* a1 = a.data(), *b1 = b.data(), *c1 = c.data();
double dtime = gettime_hp();
for (i=0 ; i<N ; ++i)
c1[i] = a1[i] * b1[i];
dtime = gettime_hp() - dtime;
cout << "double operator* " << dtime << " ms\n";
dtime = gettime_hp();
c = a*b;
dtime = gettime_hp() - dtime;
cout << "blitz operator* " << dtime << " ms\n";
dtime = gettime_hp();
for (i=0 ; i<N ; ++i)
c[i] = a[i] * b[i];
dtime = gettime_hp() - dtime;
cout << "blitz[i] operator* " << dtime<< " ms\n";
cout << "------------------------------------------------------\n";
}
}
来源:https://stackoverflow.com/questions/6850807/why-is-valarray-so-slow