Convert extended precision float (80-bit) to double (64-bit) in MSVC

前端 未结 4 2215
暖寄归人
暖寄归人 2020-12-15 13:25

What is the most portable and \"right\" way to do conversion from extended precision float (80-bit value, also known as long double in some compilers) to double

相关标签:
4条回答
  • 2020-12-15 13:59

    I've just written this one. It constructs an IEEE double number from IEEE extended precision number using bit operations. It takes the 10 byte extended precision number in little endian format:

    typedef unsigned long long uint64;
    
    double makeDoubleFromExtended(const unsigned char x[10])
    {
        int exponent = (((x[9] << 8) | x[8]) & 0x7FFF);
        uint64 mantissa =
            ((uint64)x[7] << 56) | ((uint64)x[6] << 48) | ((uint64)x[5] << 40) | ((uint64)x[4] << 32) | 
            ((uint64)x[3] << 24) | ((uint64)x[2] << 16) | ((uint64)x[1] << 8) | (uint64)x[0];
        unsigned char d[8] = {0};
        double result;
    
        d[7] = x[9] & 0x80; /* Set sign. */
    
        if ((exponent == 0x7FFF) || (exponent == 0))
        {
            /* Infinite, NaN or denormal */
            if (exponent == 0x7FFF)
            {
                /* Infinite or NaN */
                d[7] |= 0x7F;
                d[6] = 0xF0;
            }
            else
            {
                /* Otherwise it's denormal. It cannot be represented as double. Translate as singed zero. */
                memcpy(&result, d, 8);
                return result;
            }
        }
        else
        {
            /* Normal number. */
            exponent = exponent - 0x3FFF + 0x03FF; /*< exponent for double precision. */
    
            if (exponent <= -52)  /*< Too small to represent. Translate as (signed) zero. */
            {
                memcpy(&result, d, 8);
                return result;
            }
            else if (exponent < 0)
            {
                /* Denormal, exponent bits are already zero here. */
            }
            else if (exponent >= 0x7FF) /*< Too large to represent. Translate as infinite. */
            {
                d[7] |= 0x7F;
                d[6] = 0xF0;
                memset(d, 0x00, 6);
                memcpy(&result, d, 8);
                return result;
            }
            else
            {
                /* Representable number */
                d[7] |= (exponent & 0x7F0) >> 4;
                d[6] |= (exponent & 0xF) << 4;
            }
        }
        /* Translate mantissa. */
    
        mantissa >>= 11;
    
        if (exponent < 0)
        {
            /* Denormal, further shifting is required here. */
            mantissa >>= (-exponent + 1);
        }
    
        d[0] = mantissa & 0xFF;
        d[1] = (mantissa >> 8) & 0xFF;
        d[2] = (mantissa >> 16) & 0xFF;
        d[3] = (mantissa >> 24) & 0xFF;
        d[4] = (mantissa >> 32) & 0xFF;
        d[5] = (mantissa >> 40) & 0xFF;
        d[6] |= (mantissa >> 48) & 0x0F;
    
        memcpy(&result, d, 8);
    
        printf("Result: 0x%016llx", *(uint64*)(&result) );
    
        return result;
    }
    
    0 讨论(0)
  • 2020-12-15 14:03

    Played with the given answers and ended up with this.

    #include <cmath>
    #include <limits>
    #include <cassert>
    
    #ifndef _M_X64
    
    __inline __declspec(naked) double _cvt80to64(void* ) {
      __asm {
      //  PUBLIC _cvt80to64 PROC
    
        mov eax, dword ptr [esp+4]
        fld TBYTE PTR [eax]
    
        ret 0
      //    _cvt80to64 ENDP
      }
    }
    
    #endif
    
    #pragma pack(push)
    #pragma pack(2)
    typedef unsigned char tDouble80[10];
    #pragma pack(pop)
    
    
    typedef struct {
      unsigned __int64 mantissa:64;
      unsigned int exponent:15;
      unsigned int sign:1;
    } tDouble80Struct;
    
    inline double convertDouble80(const tDouble80& val)
    {
      assert(10 == sizeof(tDouble80));
    
      const tDouble80Struct* valStruct = reinterpret_cast<const tDouble80Struct*>(&val);
    
      const unsigned int mask_exponent = (1 << 15) - 1;
      const unsigned __int64 mantissa_high_highestbit = unsigned __int64(1) << 63;
      const unsigned __int64 mask_mantissa = (unsigned __int64(1) << 63) - 1;
    
      if (mask_exponent == valStruct->exponent) {
    
        if(0 == valStruct->mantissa) {
          return (0 != valStruct->sign) ? -std::numeric_limits<double>::infinity() : std::numeric_limits<double>::infinity();
        }
    
        // highest mantissa bit set means quiet NaN
        return (0 != (mantissa_high_highestbit & valStruct->mantissa)) ? std::numeric_limits<double>::quiet_NaN() :  std::numeric_limits<double>::signaling_NaN();
      }   
    
      // 80 bit floating point value according to the IEEE-754 specification and 
      // the Standard Apple Numeric Environment specification:
      // 1 bit sign, 15 bit exponent, 1 bit normalization indication, 63 bit mantissa
    
      const double sign(valStruct->sign ? -1 : 1);
    
    
      //If the highest bit of the mantissa is set, then this is a normalized number.
      unsigned __int64 mantissa = valStruct->mantissa;
      double normalizeCorrection = (mantissa & mantissa_high_highestbit) != 0 ? 1 : 0;
      mantissa &= mask_mantissa;
    
      //value = (-1) ^ s * (normalizeCorrection + m / 2 ^ 63) * 2 ^ (e - 16383)
      return (sign * (normalizeCorrection + double(mantissa) / mantissa_high_highestbit) * pow(2.0, int(valStruct->exponent) - 16383));
    }
    
    0 讨论(0)
  • 2020-12-15 14:09

    Just did this in x86 code...

        .686P
        .XMM
    
    _TEXT   SEGMENT
    
    EXTRN   __fltused:DWORD
    
    PUBLIC  _cvt80to64
    PUBLIC  _cvt64to80
    
    _cvt80to64 PROC
    
        mov eax, dword ptr [esp+4]
        fld TBYTE PTR [eax]
    
        ret 0
    _cvt80to64 ENDP
    
    
    _cvt64to80 PROC
        mov eax, DWORD PTR [esp+12]
        fld QWORD PTR [esp+4]
        fstp    TBYTE PTR [eax]
        ret 0
    _cvt64to80 ENDP
    
    ENDIF
    
    _TEXT   ENDS
        END
    
    0 讨论(0)
  • 2020-12-15 14:23

    If your compiler / platform doesn't have native support for 80 bit floating point values, you have to decode the value yourself.

    Assuming that the 80 bit float is stored within a byte buffer, located at a specific offset, you can do it like this:

    float64 C_IOHandler::readFloat80(IColl<uint8> buffer, uint32 *ref_offset)
    {
        uint32 &offset = *ref_offset;
    
        //80 bit floating point value according to the IEEE-754 specification and the Standard Apple Numeric Environment specification:
        //1 bit sign, 15 bit exponent, 1 bit normalization indication, 63 bit mantissa
    
        float64 sign;
        if ((buffer[offset] & 0x80) == 0x00)
            sign = 1;
        else
            sign = -1;
        uint32 exponent = (((uint32)buffer[offset] & 0x7F) << 8) | (uint32)buffer[offset + 1];
        uint64 mantissa = readUInt64BE(buffer, offset + 2);
    
        //If the highest bit of the mantissa is set, then this is a normalized number.
        float64 normalizeCorrection;
        if ((mantissa & 0x8000000000000000) != 0x00)
            normalizeCorrection = 1;
        else
            normalizeCorrection = 0;
        mantissa &= 0x7FFFFFFFFFFFFFFF;
    
        offset += 10;
    
        //value = (-1) ^ s * (normalizeCorrection + m / 2 ^ 63) * 2 ^ (e - 16383)
        return (sign * (normalizeCorrection + (float64)mantissa / ((uint64)1 << 63)) * g_Math->toPower(2, (int32)exponent - 16383));
    }
    

    This is how I did it, and it compiles fine with g++ 4.5.0. It of course isn't a very fast solution, but at least a functional one. This code should also be portable to different platforms, though I didn't try.

    0 讨论(0)
提交回复
热议问题