A lot of literature talks about using inline functions to \"avoid the overhead of a function call\". However I haven\'t seen quantifiable data. What is the actual overhead o
There is not much overhead at all, especially with small (inline-able) functions or even classes.
The following example has three different tests that are each run many, many times and timed. The results are always equal to the order of a couple 1000ths of a unit of time.
#include
#include
#include
double sum;
double a = 42, b = 53;
//#define ITERATIONS 1000000 // 1 million - for testing
//#define ITERATIONS 10000000000 // 10 billion ~ 10s per run
//#define WORK_UNIT sum += a + b
/* output
8.609619s wall, 8.611255s user + 0.000000s system = 8.611255s CPU(100.0%)
8.604478s wall, 8.611255s user + 0.000000s system = 8.611255s CPU(100.1%)
8.610679s wall, 8.595655s user + 0.000000s system = 8.595655s CPU(99.8%)
9.5e+011 9.5e+011 9.5e+011
*/
#define ITERATIONS 100000000 // 100 million ~ 10s per run
#define WORK_UNIT sum += std::sqrt(a*a + b*b + sum) + std::sin(sum) + std::cos(sum)
/* output
8.485689s wall, 8.486454s user + 0.000000s system = 8.486454s CPU (100.0%)
8.494153s wall, 8.486454s user + 0.000000s system = 8.486454s CPU (99.9%)
8.467291s wall, 8.470854s user + 0.000000s system = 8.470854s CPU (100.0%)
2.50001e+015 2.50001e+015 2.50001e+015
*/
// ------------------------------
double simple()
{
sum = 0;
boost::timer::auto_cpu_timer t;
for (unsigned long long i = 0; i < ITERATIONS; i++)
{
WORK_UNIT;
}
return sum;
}
// ------------------------------
void call6()
{
WORK_UNIT;
}
void call5(){ call6(); }
void call4(){ call5(); }
void call3(){ call4(); }
void call2(){ call3(); }
void call1(){ call2(); }
double calls()
{
sum = 0;
boost::timer::auto_cpu_timer t;
for (unsigned long long i = 0; i < ITERATIONS; i++)
{
call1();
}
return sum;
}
// ------------------------------
class Obj3{
public:
void runIt(){
WORK_UNIT;
}
};
class Obj2{
public:
Obj2(){it = new Obj3();}
~Obj2(){delete it;}
void runIt(){it->runIt();}
Obj3* it;
};
class Obj1{
public:
void runIt(){it.runIt();}
Obj2 it;
};
double objects()
{
sum = 0;
Obj1 obj;
boost::timer::auto_cpu_timer t;
for (unsigned long long i = 0; i < ITERATIONS; i++)
{
obj.runIt();
}
return sum;
}
// ------------------------------
int main(int argc, char** argv)
{
double ssum = 0;
double csum = 0;
double osum = 0;
ssum = simple();
csum = calls();
osum = objects();
std::cout << ssum << " " << csum << " " << osum << std::endl;
}
The output for running 10,000,000 iterations (of each type: simple, six function calls, three object calls) was with this semi-convoluted work payload:
sum += std::sqrt(a*a + b*b + sum) + std::sin(sum) + std::cos(sum)
as follows:
8.485689s wall, 8.486454s user + 0.000000s system = 8.486454s CPU (100.0%)
8.494153s wall, 8.486454s user + 0.000000s system = 8.486454s CPU (99.9%)
8.467291s wall, 8.470854s user + 0.000000s system = 8.470854s CPU (100.0%)
2.50001e+015 2.50001e+015 2.50001e+015
Using a simple work payload of
sum += a + b
Gives the same results except a couple orders of magnitude faster for each case.