SSE matrix-matrix multiplication
I'm having trouble doing matrix-matrix multiplication with SSE in C. Here is what I have so far: #define N 1000 void matmulSSE(int mat1[N][N], int mat2[N][N], int result[N][N]) { int i, j, k; __m128i vA, vB, vR; for(i = 0; i < N; ++i) { for(j = 0; j < N; ++j) { vR = _mm_setzero_si128(); for(k = 0; k < N; k += 4) { //result[i][j] += mat1[i][k] * mat2[k][j]; vA = _mm_loadu_si128((__m128i*)&mat1[i][k]); vB = _mm_loadu_si128((__m128i*)&mat2[k][j]); //how well does the k += 4 work here? Should it be unrolled? vR = _mm_add_epi32(vR, _mm_mul_epi32(vA, vB)); } vR = _mm_hadd_epi32(vR, vR); vR = _mm_hadd