I'm looking for an efficient (fast) approximation of the exponential function operating on AVX elements (single-precision floating point). Namely - __m256 _mm256_exp_ps(__m256 x).
I played with this a lot and came up with the version below, which has a relative accuracy of about ~1e-7 and is simple to convert to vector instructions. With only 4 constants, 5 multiplications and 1 division it is twice as fast as the built-in exp() function.
float fast_exp(float x)
{
    const float c1 = 0.007972914726F;
    const float c2 = 0.1385283768F;
    const float c3 = 2.885390043F;
    const float c4 = 1.442695022F;
    x *= c4;                        // convert exp(x) to 2^(x*log2(e))
    int intPart = (int)x;           // integer part (truncated toward zero)
    x -= intPart;                   // fractional part for the rational approximation
    float xx = x * x;
    float a = x + c1 * xx * x;
    float b = c3 + c2 * xx;
    float res = (b + a) / (b - a);  // rational approximation of 2^x on the fractional part
    reinterpret_cast<int&>(res) += intPart << 23;  // res *= 2^(intPart): add intPart to the float exponent bits
    return res;
}
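For reference, a small harness along these lines can be used to check the relative-error claim against the standard library (the test range [-10, 10] and the step size are arbitrary choices of mine):

#include <cmath>
#include <cstdio>

// Assumes fast_exp() from above is visible in this translation unit.
int main()
{
    float maxRelErr = 0.0f;
    for (float x = -10.0f; x <= 10.0f; x += 0.001f)
    {
        float ref = std::exp(x);
        float relErr = std::fabs(fast_exp(x) - ref) / ref;
        if (relErr > maxRelErr)
            maxRelErr = relErr;
    }
    std::printf("max relative error: %g\n", maxRelErr);  // expected on the order of 1e-7
    return 0;
}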
Converting to AVX (updated)
__m256 _mm256_exp_ps(__m256 _x)
{
    __m256 c1 = _mm256_set1_ps(0.007972914726F);
    __m256 c2 = _mm256_set1_ps(0.1385283768F);
    __m256 c3 = _mm256_set1_ps(2.885390043F);
    __m256 c4 = _mm256_set1_ps(1.442695022F);
    __m256 x = _mm256_mul_ps(_x, c4);   // convert exp(x) to 2^(x*log2(e))
    __m256 intPartf = _mm256_round_ps(x, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
    x = _mm256_sub_ps(x, intPartf);
    __m256 xx = _mm256_mul_ps(x, x);
    __m256 a = _mm256_add_ps(x, _mm256_mul_ps(c1, _mm256_mul_ps(xx, x)));  // a = x + c1*x^3; can be improved with FMA
    __m256 b = _mm256_add_ps(c3, _mm256_mul_ps(c2, xx));                   // b = c3 + c2*x^2
    __m256 res = _mm256_div_ps(_mm256_add_ps(b, a), _mm256_sub_ps(b, a));
    // res *= 2^(intPart): add intPart to the exponent bits. AVX1 has no 256-bit integer
    // shift/add, so the work is split into two 128-bit halves; can be improved with AVX2!
    __m256i intPart = _mm256_cvtps_epi32(intPartf);
    __m128i ii0 = _mm_slli_epi32(_mm256_castsi256_si128(intPart), 23);
    __m128i ii1 = _mm_slli_epi32(_mm256_extractf128_si256(intPart, 1), 23);
    __m128i res_0 = _mm_add_epi32(ii0, _mm256_castsi256_si128(_mm256_castps_si256(res)));
    __m128i res_1 = _mm_add_epi32(ii1, _mm256_extractf128_si256(_mm256_castps_si256(res), 1));
    return _mm256_insertf128_ps(_mm256_castsi256_ps(_mm256_castsi128_si256(res_0)), _mm_castsi128_ps(res_1), 1);
}
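In case it helps, here is what the two improvements noted in the comments could look like together on AVX2/FMA hardware: the 2^(intPart) scaling stays in a single 256-bit register and the two polynomial sums use fused multiply-add. The function name is mine, and this is an untested sketch assuming AVX2 and FMA3 are available (compile with -mavx2 -mfma or equivalent):

#include <immintrin.h>

// Same algorithm as above, assuming AVX2 + FMA3.
__m256 _mm256_exp_ps_avx2(__m256 _x)
{
    const __m256 c1 = _mm256_set1_ps(0.007972914726F);
    const __m256 c2 = _mm256_set1_ps(0.1385283768F);
    const __m256 c3 = _mm256_set1_ps(2.885390043F);
    const __m256 c4 = _mm256_set1_ps(1.442695022F);
    __m256 x = _mm256_mul_ps(_x, c4);   // convert exp(x) to 2^(x*log2(e))
    __m256 intPartf = _mm256_round_ps(x, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
    x = _mm256_sub_ps(x, intPartf);
    __m256 xx = _mm256_mul_ps(x, x);
    __m256 a = _mm256_fmadd_ps(_mm256_mul_ps(c1, xx), x, x);   // a = x + c1*x^3
    __m256 b = _mm256_fmadd_ps(c2, xx, c3);                    // b = c3 + c2*x^2
    __m256 res = _mm256_div_ps(_mm256_add_ps(b, a), _mm256_sub_ps(b, a));
    // res *= 2^(intPart): add intPart to the exponent bits, now in one 256-bit shift + add.
    __m256i intPart = _mm256_cvtps_epi32(intPartf);
    __m256i ires = _mm256_add_epi32(_mm256_slli_epi32(intPart, 23), _mm256_castps_si256(res));
    return _mm256_castsi256_ps(ires);
}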