In a piece of C++ code that does something similar to (but not exactly) matrix multiplication, I load 4 contiguous doubles into 4 YMM registers like this:
#
In my matrix multiplication code I only have to use the broadcast once per kernel, but if you really want to load four doubles in one instruction and then broadcast them to four registers, you can do it like this:
#include <immintrin.h>
#include <stdio.h>
/*
 * Demo: read four doubles with a single unaligned 256-bit load, then build
 * four vectors that each hold one of those doubles replicated in all lanes,
 * and print every vector on its own line.
 */
int main() {
    double in[] = {1,2,3,4};
    double out[4];

    /* One load brings all four inputs into a single register. */
    __m256d packed = _mm256_loadu_pd(in);

    /* Copy the low (0x0) or the high (0x11) 128-bit half into both halves. */
    __m256d low_pair  = _mm256_permute2f128_pd(packed, packed, 0x0);
    __m256d high_pair = _mm256_permute2f128_pd(packed, packed, 0x11);

    /* Within each 128-bit half, control 0 picks the lower double for every
       lane and 0xf picks the upper one, giving one splat per input. */
    __m256d splat[4] = {
        _mm256_permute_pd(low_pair, 0),
        _mm256_permute_pd(low_pair, 0xf),
        _mm256_permute_pd(high_pair, 0),
        _mm256_permute_pd(high_pair, 0xf)
    };

    /* Spill each vector to memory so its lanes can be printed. */
    for (int i = 0; i < 4; i++) {
        _mm256_storeu_pd(out, splat[i]);
        printf("%f %f %f %f\n", out[0], out[1], out[2], out[3]);
    }
}
Edit: Here is another solution based on Paul R's suggestion.
/* Each _mm256_broadcast_pd reads 128 bits (two doubles) starting at the given
   address and replicates that pair into both halves of a 256-bit register.
   The result type is __m256d, which is also what _mm256_permute_pd expects,
   so the temporaries must be __m256d rather than the packed-float __m256.
   NOTE(review): b and k come from the surrounding kernel; b is presumably an
   array of double indexed in groups of four -- confirm against the caller. */
__m256d t1 = _mm256_broadcast_pd((__m128d*)&b[4*k+0]);
__m256d t2 = _mm256_broadcast_pd((__m128d*)&b[4*k+2]);
/* Control 0 selects the lower double of each 128-bit half for every lane;
   0xf selects the upper one. That yields one fully splatted vector per
   source element. */
__m256d broad1 = _mm256_permute_pd(t1,0);
__m256d broad2 = _mm256_permute_pd(t1,0xf);
__m256d broad3 = _mm256_permute_pd(t2,0);
__m256d broad4 = _mm256_permute_pd(t2,0xf);