Speed difference between using int and unsigned int when mixed with doubles

后端 未结 4 2012
鱼传尺愫
鱼传尺愫 2021-01-01 14:22

I have an application where part of the inner loop was basically:

double sum = 0;
for (int i = 0; i != N; ++i, ++data, ++x) sum += *data * x;
4条回答
  •  情话喂你
    2021-01-01 14:38

    Output with Visual Studio 2010 on an Intel Q6600 (note: I increased the loop count from 128*1024 to 512*1024):

    release mode...

    With int: 4.23944e+009 in 9secs
    With unsigned int: 4.23944e+009 in 18secs
    With int: 4.23944e+009 in 9secs
    

    debug mode...

    With int: 4.23944e+009 in 34secs
    With unsigned int: 4.23944e+009 in 58secs
    With int: 4.23944e+009 in 34secs
    

    The ASM in release mode... (unsigned)

        for (int i = 0; i != Nr_Samples; ++i) { 
    011714A1  fldz  
    011714A3  mov         edx,dword ptr [esi+4]  
    011714A6  add         esp,4  
    011714A9  xor         edi,edi  
    011714AB  sub         edx,dword ptr [esi]  
            moments_results[i] = moments(dataptr, data.size(), 128); 
    011714AD  mov         ecx,dword ptr [ebp-1388Ch]  
    011714B3  fld         st(0)  
    011714B5  xor         eax,eax  
    011714B7  test        edx,edx  
    011714B9  je          measure+79h (11714E9h)  
    011714BB  mov         esi,edx  
    011714BD  movzx       ebx,byte ptr [ecx]  
    011714C0  imul        ebx,eax  
    011714C3  mov         dword ptr [ebp-138A4h],ebx  
    011714C9  fild        dword ptr [ebp-138A4h]  //in both versions; here it comes before inc eax
    011714CF  test        ebx,ebx  //only in unsigned
    011714D1  jns         measure+69h (11714D9h)  //only in unsigned
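    011714D3  fadd        qword ptr [__real@41f0000000000000 (11731C8h)]  //only in unsigned: fild loads the value as signed, so 2^32 (0x41F0000000000000 as a double) is added when it came out negative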
    011714D9  inc         eax  
    011714DA  faddp       st(1),st  
    011714DC  cmp         eax,80h  
    011714E1  jne         measure+75h (11714E5h)  
    011714E3  xor         eax,eax  
    011714E5  inc         ecx  
    011714E6  dec         esi  
    011714E7  jne         measure+4Dh (11714BDh)  
    011714E9  fstp        qword ptr [ebp+edi*8-13888h]  
    011714F0  inc         edi  
    011714F1  cmp         edi,2710h  
    011714F7  jne         measure+3Dh (11714ADh)  
        } 
    

    The ASM in release mode... (signed)

        for (int i = 0; i != Nr_Samples; ++i) { 
    012A1351  fldz  
    012A1353  mov         edx,dword ptr [esi+4]  
    012A1356  add         esp,4  
    012A1359  xor         edi,edi  
    012A135B  sub         edx,dword ptr [esi]  
            moments_results[i] = moments(dataptr, data.size(), 128); 
    012A135D  mov         ecx,dword ptr [ebp-13890h]  
    012A1363  fld         st(0)  
    012A1365  xor         eax,eax  
    012A1367  test        edx,edx  
    012A1369  je          measure+6Fh (12A138Fh)  
    012A136B  mov         esi,edx  
    012A136D  movzx       ebx,byte ptr [ecx]  
    012A1370  imul        ebx,eax  
    012A1373  mov         dword ptr [ebp-1388Ch],ebx  
    012A1379  inc         eax  
    012A137A  fild        dword ptr [ebp-1388Ch]  //in both versions; here it comes after inc eax
    012A1380  faddp       st(1),st  
    012A1382  cmp         eax,80h  
    012A1387  jne         measure+6Bh (12A138Bh)  
    012A1389  xor         eax,eax  
    012A138B  inc         ecx  
    012A138C  dec         esi  
    012A138D  jne         measure+4Dh (12A136Dh)  
    012A138F  fstp        qword ptr [ebp+edi*8-13888h]  
    012A1396  inc         edi  
    012A1397  cmp         edi,2710h  
    012A139D  jne         measure+3Dh (12A135Dh)  
        } 
    

    Interesting... with release mode and SSE enabled (the fldz and fld st(0) instructions are removed, but the five instructions marked in the unsigned listing below are added):

    With int: 4.23944e+009 in 8secs
    With unsigned int: 4.23944e+009 in 10secs
    With int: 4.23944e+009 in 8secs
    
    
        for (int i = 0; i != Nr_Samples; ++i) { 
    00F614C1  mov         edx,dword ptr [esi+4]  
    00F614C4  xorps       xmm0,xmm0  //added in sse version
    00F614C7  add         esp,4  
    00F614CA  xor         edi,edi  
    00F614CC  sub         edx,dword ptr [esi]  
            moments_results[i] = moments(dataptr, data.size(), 128); 
    00F614CE  mov         ecx,dword ptr [ebp-13894h]  
    00F614D4  xor         eax,eax  
    00F614D6  movsd       mmword ptr [ebp-13890h],xmm0  //added in sse version
    00F614DE  test        edx,edx  
    00F614E0  je          measure+8Ch (0F6151Ch)  
    00F614E2  fld         qword ptr [ebp-13890h]  //added in sse version
    00F614E8  mov         esi,edx  
    00F614EA  movzx       ebx,byte ptr [ecx]  
    00F614ED  imul        ebx,eax  
    00F614F0  mov         dword ptr [ebp-1388Ch],ebx  
    00F614F6  fild        dword ptr [ebp-1388Ch]  
    00F614FC  test        ebx,ebx  
    00F614FE  jns         measure+76h (0F61506h)  
    00F61500  fadd        qword ptr [__real@41f0000000000000 (0F631C8h)]  
    00F61506  inc         eax  
    00F61507  faddp       st(1),st  
    00F61509  cmp         eax,80h  
    00F6150E  jne         measure+82h (0F61512h)  
    00F61510  xor         eax,eax  
    00F61512  inc         ecx  
    00F61513  dec         esi  
    00F61514  jne         measure+5Ah (0F614EAh)  
    00F61516  fstp        qword ptr [ebp-13890h]  
    00F6151C  movsd       xmm1,mmword ptr [ebp-13890h]  //added in sse version
    00F61524  movsd       mmword ptr [ebp+edi*8-13888h],xmm1  //added in sse version
    00F6152D  inc         edi  
    00F6152E  cmp         edi,2710h  
    00F61534  jne         measure+3Eh (0F614CEh)  
        } 
    

提交回复
热议问题