Fastest implementation of sine, cosine and square root in C++ (doesn't need to be very accurate)

前端 未结 15 2034
执笔经年
执笔经年 2020-12-04 10:55

I have been googling this question for the past hour, but the results only point to Taylor series or to sample code that is either too slow or does not compile at all. Well, most answe…

15条回答
  •  余生分开走
    2020-12-04 11:44

    So let me rephrase that: the idea is to approximate the cosine and sine functions on the interval [-pi/4, +pi/4] with a bounded error, using the Remez algorithm. The argument is then range-reduced into an integer quotient and a float remainder; the polynomials are evaluated on the remainder, and a LUT supplies the cosine and sine of the angle offset selected by the integer quotient. Combining the two extends the approximation to any angular argument.

    It's just a unique approach, and I thought it could be expanded on to make a more efficient algorithm in terms of a bounded error.

    void sincos_fast(float x, float *pS, float *pC){
        float cosOff4LUT[] = { 0x1.000000p+00,  0x1.6A09E6p-01,  0x0.000000p+00, -0x1.6A09E6p-01, -0x1.000000p+00, -0x1.6A09E6p-01,  0x0.000000p+00,  0x1.6A09E6p-01 };
    
        int     m, ms, mc;
        float   xI, xR, xR2;
        float   c, s, cy, sy;
    
        // Cody & Waite's range reduction Algorithm, [-pi/4, pi/4]
        xI  = floorf(x * 0x1.45F306p+00 + 0.5);              // This is 4/pi.
        xR  = (x - xI * 0x1.920000p-01) - xI*0x1.FB5444p-13; // This is pi/4 in two parts per C&W.
        m   = (int) xI;
        xR2 = xR*xR;
    
        // Find cosine & sine index for angle offsets indices
        mc = (  m  ) & 0x7;     // two's complement permits upper modulus for negative numbers =P
        ms = (m + 6) & 0x7;     // phase correction for sine.
    
        // Find cosine & sine
        cy = cosOff4LUT[mc];     // Load angle offset neighborhood cosine value 
        sy = cosOff4LUT[ms];     // Load angle offset neighborhood sine value 
    
        c = 0xf.ff79fp-4 + xR2 * (-0x7.e58e9p-4);               // TOL = 1.2786e-4
        // c = 0xf.ffffdp-4 + xR2 * (-0x7.ffebep-4 + xR2 * 0xa.956a9p-8);  // TOL = 1.7882e-7
    
        s = xR * (0xf.ffbf7p-4 + xR2 * (-0x2.a41d0cp-4));   // TOL = 4.835251e-6
        // s = xR * (0xf.fffffp-4 + xR2 * (-0x2.aaa65cp-4 + xR2 * 0x2.1ea25p-8));  // TOL = 1.1841e-8
    
        *pC = c*cy - s*sy;      
        *pS = c*sy + s*cy;
    }
    
    /*
     * Fast polynomial approximation of the square root of a single-precision float.
     *
     *   x        value to take the root of
     *   returns  approximately sqrt(x) for x > 0; 0 for x <= 0 (negative inputs
     *            included); x itself for +Inf and NaN.
     *
     * Method: x = 2^E * M with M in [1, 2). The result exponent is floor(E/2),
     * a polynomial in M approximates sqrt(M), and an extra factor sqrt(2) is
     * applied when E is odd.
     */
    float sqrt_fast(float x){
        union {float f; uint32_t i; } X, Y;   // unsigned bit view: shifts and masks are well defined
        float ScOff;
        float unscale = 1.0f;
        uint32_t e;

        if(x <= 0) return(0.0f);

        X.f = x;
        e = (X.i >> 23) & 0xFFu;          // biased exponent

        // Exponent field all ones: +Inf or NaN. Pass it through rather than
        // running the bit manipulation below, which would produce a finite value.
        if(e == 0xFFu) return(x);

        // Exponent field zero with x > 0: subnormal, which has no implicit leading
        // 1, so the exponent/mantissa split below would misread it. Scale by 2^24
        // to make it normal; 24 is even, so the root of the factor is exactly 2^12
        // and is undone on return.
        if(e == 0u){
            X.f = x * 0x1p+24f;
            e = (X.i >> 23) & 0xFFu;
            unscale = 0x1p-12f;
        }

        // Biased exponent odd  -> unbiased (e - 127) even -> no extra factor.
        // Biased exponent even -> unbiased (e - 127) odd  -> extra factor sqrt(2).
        ScOff = ((e & 1u) != 0u) ? 1.0f : 0x1.6a09e6p0f;

        e = ((e + 127u) >> 1);                        // re-biased floor((e - 127) / 2)
        X.i = (X.i & 0x007FFFFFu) | 0x3F800000u;      // keep mantissa, set unbiased exponent to 0: X.f in [1, 2)
        Y.i = (e << 23);                              // Y.f = 2^floor((e - 127) / 2)

        // Error grows with square root of the exponent. Unfortunately no work around like inverse square root... :(
        // Alternative polynomials of increasing degree, with the author's stated error bounds:
        // Y.f *= ScOff * (0x9.5f61ap-4 + X.f*(0x6.a09e68p-4));        // Error = +-1.78e-2 * 2^(flr(log2(x)/2))
        // Y.f *= ScOff * (0x7.2181d8p-4 + X.f*(0xa.05406p-4 + X.f*(-0x1.23a14cp-4)));      // Error = +-7.64e-5 * 2^(flr(log2(x)/2))
        // Y.f *= ScOff * (0x5.f10e7p-4 + X.f*(0xc.8f2p-4 +X.f*(-0x2.e41a4cp-4 + X.f*(0x6.441e6p-8))));     // Error =  8.21e-5 * 2^(flr(log2(x)/2))
        // Y.f *= ScOff * (0x5.32eb88p-4 + X.f*(0xe.abbf5p-4 + X.f*(-0x5.18ee2p-4 + X.f*(0x1.655efp-4 + X.f*(-0x2.b11518p-8)))));   // Error = +-9.92e-6 * 2^(flr(log2(x)/2))
        // Y.f *= ScOff * (0x4.adde5p-4 + X.f*(0x1.08448cp0 + X.f*(-0x7.ae1248p-4 + X.f*(0x3.2cf7a8p-4 + X.f*(-0xc.5c1e2p-8 + X.f*(0x1.4b6dp-8))))));   // Error = +-1.38e-6 * 2^(flr(log2(x)/2))
        // Y.f *= ScOff * (0x4.4a17fp-4 + X.f*(0x1.22d44p0 + X.f*(-0xa.972e8p-4 + X.f*(0x5.dd53fp-4 + X.f*(-0x2.273c08p-4 + X.f*(0x7.466cb8p-8 + X.f*(-0xa.ac00ep-12)))))));    // Error = +-2.9e-7 * 2^(flr(log2(x)/2))
        Y.f *= ScOff * (0x3.fbb3e8p-4 + X.f*(0x1.3b2a3cp0 + X.f*(-0xd.cbb39p-4 + X.f*(0x9.9444ep-4 + X.f*(-0x4.b5ea38p-4 + X.f*(0x1.802f9ep-4 + X.f*(-0x4.6f0adp-8 + X.f*(0x5.c24a28p-12 ))))))));   // Error = +-2.7e-6 * 2^(flr(log2(x)/2))

        return(Y.f * unscale);   // unscale is 1 unless the input was subnormal
    }
    

    The longer polynomial expressions are slower but more precise. All polynomials are written using Horner's rule.

提交回复
热议问题