I have an application where part of the inner loop was basically:
double sum = 0;
for (int i = 0; i != N; ++i, ++data, ++x) sum += *data * x;
Output with Visual Studio 2010 on an Intel Q6600 (note: I increased the loop count from 128*1024 to 512*1024):
release mode...
With int: 4.23944e+009 in 9secs
With unsigned int: 4.23944e+009 in 18secs
With int: 4.23944e+009 in 9secs
debug mode...
With int: 4.23944e+009 in 34secs
With unsigned int: 4.23944e+009 in 58secs
With int: 4.23944e+009 in 34secs
The ASM in release mode... (unsigned)
for (int i = 0; i != Nr_Samples; ++i) {
011714A1 fldz
011714A3 mov edx,dword ptr [esi+4]
011714A6 add esp,4
011714A9 xor edi,edi
011714AB sub edx,dword ptr [esi]
moments_results[i] = moments(dataptr, data.size(), 128);
011714AD mov ecx,dword ptr [ebp-1388Ch]
011714B3 fld st(0)
011714B5 xor eax,eax
011714B7 test edx,edx
011714B9 je measure+79h (11714E9h)
011714BB mov esi,edx
011714BD movzx ebx,byte ptr [ecx]
011714C0 imul ebx,eax
011714C3 mov dword ptr [ebp-138A4h],ebx
011714C9 fild dword ptr [ebp-138A4h] //in both versions (the signed build issues it after inc eax)
011714CF test ebx,ebx //only in unsigned
011714D1 jns measure+69h (11714D9h) //only in unsigned
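011714D3 fadd qword ptr [__real@41f0000000000000 (11731C8h)] //only in unsigned: adds 2^32 (0x41F0000000000000 as a double) because fild loaded the value as signed and it came out negative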
011714D9 inc eax
011714DA faddp st(1),st
011714DC cmp eax,80h
011714E1 jne measure+75h (11714E5h)
011714E3 xor eax,eax
011714E5 inc ecx
011714E6 dec esi
011714E7 jne measure+4Dh (11714BDh)
011714E9 fstp qword ptr [ebp+edi*8-13888h]
011714F0 inc edi
011714F1 cmp edi,2710h
011714F7 jne measure+3Dh (11714ADh)
}
The ASM in release mode... (signed)
for (int i = 0; i != Nr_Samples; ++i) {
012A1351 fldz
012A1353 mov edx,dword ptr [esi+4]
012A1356 add esp,4
012A1359 xor edi,edi
012A135B sub edx,dword ptr [esi]
moments_results[i] = moments(dataptr, data.size(), 128);
012A135D mov ecx,dword ptr [ebp-13890h]
012A1363 fld st(0)
012A1365 xor eax,eax
012A1367 test edx,edx
012A1369 je measure+6Fh (12A138Fh)
012A136B mov esi,edx
012A136D movzx ebx,byte ptr [ecx]
012A1370 imul ebx,eax
012A1373 mov dword ptr [ebp-1388Ch],ebx
012A1379 inc eax
012A137A fild dword ptr [ebp-1388Ch] //also in unsigned, but there it comes before inc eax
012A1380 faddp st(1),st
012A1382 cmp eax,80h
012A1387 jne measure+6Bh (12A138Bh)
012A1389 xor eax,eax
012A138B inc ecx
012A138C dec esi
012A138D jne measure+4Dh (12A136Dh)
012A138F fstp qword ptr [ebp+edi*8-13888h]
012A1396 inc edi
012A1397 cmp edi,2710h
012A139D jne measure+3Dh (12A135Dh)
}
Interesting... with release mode and SSE enabled (the listing below is the unsigned version; the fldz and fld st(0) instructions are removed, but the five instructions marked "added in sse version" appear):
With int: 4.23944e+009 in 8secs
With unsigned int: 4.23944e+009 in 10secs
With int: 4.23944e+009 in 8secs
for (int i = 0; i != Nr_Samples; ++i) {
00F614C1 mov edx,dword ptr [esi+4]
00F614C4 xorps xmm0,xmm0 //added in sse version
00F614C7 add esp,4
00F614CA xor edi,edi
00F614CC sub edx,dword ptr [esi]
moments_results[i] = moments(dataptr, data.size(), 128);
00F614CE mov ecx,dword ptr [ebp-13894h]
00F614D4 xor eax,eax
00F614D6 movsd mmword ptr [ebp-13890h],xmm0 //added in sse version
00F614DE test edx,edx
00F614E0 je measure+8Ch (0F6151Ch)
00F614E2 fld qword ptr [ebp-13890h] //added in sse version
00F614E8 mov esi,edx
00F614EA movzx ebx,byte ptr [ecx]
00F614ED imul ebx,eax
00F614F0 mov dword ptr [ebp-1388Ch],ebx
00F614F6 fild dword ptr [ebp-1388Ch]
00F614FC test ebx,ebx
00F614FE jns measure+76h (0F61506h)
00F61500 fadd qword ptr [__real@41f0000000000000 (0F631C8h)]
00F61506 inc eax
00F61507 faddp st(1),st
00F61509 cmp eax,80h
00F6150E jne measure+82h (0F61512h)
00F61510 xor eax,eax
00F61512 inc ecx
00F61513 dec esi
00F61514 jne measure+5Ah (0F614EAh)
00F61516 fstp qword ptr [ebp-13890h]
00F6151C movsd xmm1,mmword ptr [ebp-13890h] //added in sse version
00F61524 movsd mmword ptr [ebp+edi*8-13888h],xmm1 //added in sse version
00F6152D inc edi
00F6152E cmp edi,2710h
00F61534 jne measure+3Eh (0F614CEh)
}