Fastest Implementation of Exponential Function Using AVX

前端 未结 4 1190
忘掉有多难
忘掉有多难 2020-11-29 08:42

I'm looking for an efficient (fast) approximation of the exponential function operating on AVX elements (single-precision floating point). Namely - __m256 _mm256_exp_ps(__m256 x)

4条回答
  •  广开言路
    2020-11-29 09:25

    I played with this a lot and discovered this one, which has a relative accuracy of about ~1e-07 and is simple to convert to vector instructions. With only 4 constants, 5 multiplications and 1 division, it is twice as fast as the built-in exp() function.

    // Fast approximation of e^x for a single-precision argument.
    //
    // The argument is rescaled so that e^x becomes 2^t, t is split into an
    // integer part (truncated toward zero) and a fractional part, 2^frac is
    // evaluated with a small rational approximation (b + a) / (b - a), and the
    // integer part is then added directly to the exponent field of the result.
    //
    // x:       exponent argument.
    // returns: approximation of e^x.
    //
    // No range checking is performed: the exponent-field trick assumes a 32-bit
    // IEEE-754 float and a 32-bit unsigned int, and the result is only
    // meaningful while the adjusted exponent stays within the normal range.
    float fast_exp(float x)
    {
        const float c1 = 0.007972914726F;
        const float c2 = 0.1385283768F;
        const float c3 = 2.885390043F;
        const float c4 = 1.442695022F;      // log2(e)
        union { float f; unsigned int u; } pun;

        x *= c4; // convert to 2^(x)
        int intPart = (int)x;               // truncates toward zero
        x -= intPart;                       // fractional part, same sign as x
        float xx = x * x;
        float a = x + c1 * xx * x;          // odd-power terms
        float b = c3 + c2 * xx;             // even-power terms
        pun.f = (b + a) / (b - a);          // approximates 2^(fractional part)

        // res *= 2^(intPart): add intPart to the biased exponent, which starts
        // at bit 23. Done in unsigned arithmetic so that a negative intPart
        // wraps modulo 2^32 instead of left-shifting a negative value (UB), and
        // through a union instead of a pointer/reference cast, which would
        // break strict aliasing.
        pun.u += (unsigned int)intPart << 23;
        return pun.f;
    }
    

    Converting to AVX (updated)

    // Eight-lane AVX version of the scalar fast_exp approximation: each lane
    // of _x is rescaled to a base-2 exponent, split into integer and fractional
    // parts, the fractional power is evaluated as (even + odd) / (even - odd),
    // and the integer part is added to the exponent bits of the result.
    //
    // _x:      eight single-precision inputs.
    // returns: per-lane approximation of e^_x (no range checking).
    __m256 _mm256_exp_ps(__m256 _x)
    {
        const __m256 log2e    = _mm256_set1_ps(1.442695022F);
        const __m256 odd_k    = _mm256_set1_ps(0.007972914726F);
        const __m256 even_k   = _mm256_set1_ps(0.1385283768F);
        const __m256 even_off = _mm256_set1_ps(2.885390043F);

        // e^x == 2^(x * log2(e)); split the scaled value at the decimal point,
        // truncating toward zero.
        const __m256 scaled = _mm256_mul_ps(_x, log2e);
        const __m256 whole  = _mm256_round_ps(scaled, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
        const __m256 frac   = _mm256_sub_ps(scaled, whole);

        // Rational approximation of 2^frac (the multiply/add pairs are
        // candidates for FMA).
        const __m256 frac_sq   = _mm256_mul_ps(frac, frac);
        const __m256 frac_cube = _mm256_mul_ps(frac_sq, frac);
        const __m256 odd   = _mm256_add_ps(frac, _mm256_mul_ps(odd_k, frac_cube));
        const __m256 even  = _mm256_add_ps(even_off, _mm256_mul_ps(even_k, frac_sq));
        const __m256 ratio = _mm256_div_ps(_mm256_add_ps(even, odd), _mm256_sub_ps(even, odd));

        // Scale by 2^whole: shift the integer part up to bit 23 and add it to
        // the raw float bits. The integer shift and add are carried out on the
        // two 128-bit halves separately (AVX2 could do this in one 256-bit step).
        const __m256i whole_i    = _mm256_cvtps_epi32(whole);
        const __m256i ratio_bits = _mm256_castps_si256(ratio);

        const __m128i shift_lo = _mm_slli_epi32(_mm256_castsi256_si128(whole_i), 23);
        const __m128i shift_hi = _mm_slli_epi32(_mm256_extractf128_si256(whole_i, 1), 23);

        const __m128i out_lo = _mm_add_epi32(shift_lo, _mm256_castsi256_si128(ratio_bits));
        const __m128i out_hi = _mm_add_epi32(shift_hi, _mm256_extractf128_si256(ratio_bits, 1));

        // Reassemble the two halves into one 256-bit float vector.
        const __m256 low_only = _mm256_castsi256_ps(_mm256_castsi128_si256(out_lo));
        return _mm256_insertf128_ps(low_only, _mm_castsi128_ps(out_hi), 1);
    }
    

提交回复
热议问题