I've been profiling an application all day long and, having optimized a couple bits of code, I'm left with this on my todo list. It's the activation function for a neural network:
(Updated with performance measurements; updated again with real results :)
I think a lookup table solution would get you very far performance-wise, at a negligible cost in memory (the table below is 2048 floats, i.e. 8 KB) and precision (a maximum deviation of about 0.0017, as measured below).
The following snippet is an example implementation in C (I don't speak C# fluently enough to dry-code it). It runs and performs well enough, but I'm sure there's a bug in it :)
#include <math.h>
#include <stdio.h>
#include <time.h>
#define SCALE 320.0f
#define RESOLUTION 2047
#define MIN (-RESOLUTION / SCALE)
#define MAX (RESOLUTION / SCALE)

static float sigmoid_lut[RESOLUTION + 1]; /* sigmoid(i / SCALE) for i = 0..RESOLUTION */
/* Precompute sigmoid(i / SCALE) for the non-negative half of the range. */
void init_sigmoid_lut(void) {
    int i;
    for (i = 0; i < RESOLUTION + 1; i++) {
        sigmoid_lut[i] = (float)(1.0 / (1.0 + exp(-i / SCALE)));
    }
}
/* Reference implementation: the exact sigmoid. */
static float sigmoid1(const float value) {
    return (1.0f / (1.0f + expf(-value)));
}
/* LUT version: clamp outside [MIN, MAX], round to the nearest table entry,
 * and use the symmetry sigmoid(-x) = 1 - sigmoid(x) for negative input. */
static float sigmoid2(const float value) {
    if (value <= MIN) return 0.0f;
    if (value >= MAX) return 1.0f;
    if (value >= 0) return sigmoid_lut[(int)(value * SCALE + 0.5f)];
    return 1.0f - sigmoid_lut[(int)(-value * SCALE + 0.5f)];
}
/* Scan [-10, 10] and report the largest deviation between the two versions. */
float test_error(void) {
    float x;
    float emax = 0.0f;
    for (x = -10.0f; x < 10.0f; x += 0.00001f) {
        float v0 = sigmoid1(x);
        float v1 = sigmoid2(x);
        float error = fabsf(v1 - v0);
        if (error > emax) { emax = error; }
    }
    return emax;
}
int sigmoid1_perf(void) {
    clock_t t0, t1;
    int i;
    float x, y = 0.0f;
    t0 = clock();
    for (i = 0; i < 10; i++) {
        for (x = -5.0f; x <= 5.0f; x += 0.00001f) {
            y = sigmoid1(x);
        }
    }
    t1 = clock();
    printf("%f\n", y); /* To avoid sigmoidX() calls being optimized away */
    return (int)((t1 - t0) / (CLOCKS_PER_SEC / 1000));
}
int sigmoid2_perf(void) {
    clock_t t0, t1;
    int i;
    float x, y = 0.0f;
    t0 = clock();
    for (i = 0; i < 10; i++) {
        for (x = -5.0f; x <= 5.0f; x += 0.00001f) {
            y = sigmoid2(x);
        }
    }
    t1 = clock();
    printf("%f\n", y); /* To avoid sigmoidX() calls being optimized away */
    return (int)((t1 - t0) / (CLOCKS_PER_SEC / 1000));
}
int main(void) {
    init_sigmoid_lut();
    printf("Max deviation is %0.6f\n", test_error());
    printf("10^7 iterations using sigmoid1: %d ms\n", sigmoid1_perf());
    printf("10^7 iterations using sigmoid2: %d ms\n", sigmoid2_perf());
    return 0;
}
The results I posted previously were due to the optimizer doing its job and optimizing the calculations away. Making it actually execute the code yields slightly different and much more interesting results (on my way-slow MacBook Air):
$ gcc -O2 test.c -o test && ./test
Max deviation is 0.001664
10^7 iterations using sigmoid1: 571 ms
10^7 iterations using sigmoid2: 113 ms
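As an aside, the printf() at the end of each perf function is what keeps the optimizer from eliding the loops entirely. An alternative guard (not what I used above, just a sketch reusing sigmoid2() and the same timing code) is to store each result into a volatile variable:

static volatile float sink;

int sigmoid2_perf_volatile(void) {
    clock_t t0, t1;
    int i;
    float x;
    t0 = clock();
    for (i = 0; i < 10; i++) {
        for (x = -5.0f; x <= 5.0f; x += 0.00001f) {
            sink = sigmoid2(x); /* a volatile store cannot be optimized away */
        }
    }
    t1 = clock();
    return (int)((t1 - t0) / (CLOCKS_PER_SEC / 1000));
}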
TODO:
There are still things to improve and weaknesses to iron out; how to do so is left as an exercise for the reader :)
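For instance, one obvious refinement would be to linearly interpolate between adjacent table entries instead of rounding to the nearest one, trading a couple of extra operations for lower error. A sketch (the name sigmoid3 is mine; it reuses sigmoid_lut, SCALE, MIN and MAX from above):

static float sigmoid3(const float value) {
    float x, frac;
    int idx;
    if (value <= MIN) return 0.0f;
    if (value >= MAX) return 1.0f;
    x = (value >= 0) ? value : -value;  /* exploit sigmoid(-x) = 1 - sigmoid(x) */
    idx = (int)(x * SCALE);             /* table entry at or below x */
    frac = x * SCALE - idx;             /* distance to the next entry, in [0, 1) */
    x = sigmoid_lut[idx] + frac * (sigmoid_lut[idx + 1] - sigmoid_lut[idx]);
    return (value >= 0) ? x : 1.0f - x;
}

With interpolation the same table should yield a noticeably smaller maximum deviation; conversely, the table could be shrunk for the same error budget.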