Matrix Multiplication of size 100*100 using SSE Intrinsics
问题 int MAX_DIM = 100; float a[MAX_DIM][MAX_DIM]__attribute__ ((aligned(16))); float b[MAX_DIM][MAX_DIM]__attribute__ ((aligned(16))); float d[MAX_DIM][MAX_DIM]__attribute__ ((aligned(16))); /* * I fill these arrays with some values */ for(int i=0;i<MAX_DIM;i+=1){ for(int j=0;j<MAX_DIM;j+=4){ for(int k=0;k<MAX_DIM;k+=4){ __m128 result = _mm_load_ps(&d[i][j]); __m128 a_line = _mm_load_ps(&a[i][k]); __m128 b_line0 = _mm_load_ps(&b[k][j+0]); __m128 b_line1 = _mm_loadu_ps(&b[k][j+1]); __m128 b_line2