Question
The title is for lack of a better name, and I am not sure I managed to explain myself clearly enough. I am looking for a way to access a "data type" via an index without forcing the compiler to keep it in an array. The problem occurs when writing low-level code based on SSE/AVX intrinsics.
For ease of programming I would like to write code like the following, with fixed-length loops over "registers" (data type __m512):
inline void load(__m512 *vector, const float *in)
{
    for(int i=0; i<24; i++)
        vector[i] = _mm512_load_ps(in + i*SIMD_WIDTH);
}

// similarly: inline add(...) and inline store(...)

void add(float *in_out, const float *in)
{
    __m512 vector1[24];
    __m512 vector2[24];
    load(vector1, in_out);
    load(vector2, in);
    add(vector1, vector2);
    store(in_out, vector1);
}
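For completeness, the add() and store() helpers alluded to in the comment above might look something like this (a sketch, not the original code):

inline void add(__m512 *vector1, const __m512 *vector2)
{
    // componentwise add of two register "arrays"
    for(int i=0; i<24; i++)
        vector1[i] = _mm512_add_ps(vector1[i], vector2[i]);
}

inline void store(float *out, const __m512 *vector)
{
    // write the registers back to memory
    for(int i=0; i<24; i++)
        _mm512_store_ps(out + i*SIMD_WIDTH, vector[i]);
}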
The fact that vector1 and vector2 are defined as arrays appears to be troublesome for the compiler (icc in my case): it seems to be forced to make them "addressable", keeps them on the stack, and thus generates a lot of load and store instructions that I do not need. As far as I understand, this is to allow for pointer arithmetic with vector1 or vector2.
I would like the compiler to keep everything only in registers. I know that this is possible: if I write the code without the __m512 arrays, but with many individual __m512 variables instead, the compiler generates better code.
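For illustration, a minimal sketch of that "individual variables" version, with made-up variable names and only the first two components written out:

void add_unrolled(float *in_out, const float *in)
{
    // every value lives in its own named __m512, so nothing is forced onto the stack
    __m512 a0 = _mm512_load_ps(in_out + 0*SIMD_WIDTH);
    __m512 a1 = _mm512_load_ps(in_out + 1*SIMD_WIDTH);
    __m512 b0 = _mm512_load_ps(in + 0*SIMD_WIDTH);
    __m512 b1 = _mm512_load_ps(in + 1*SIMD_WIDTH);
    a0 = _mm512_add_ps(a0, b0);
    a1 = _mm512_add_ps(a1, b1);
    _mm512_store_ps(in_out + 0*SIMD_WIDTH, a0);
    _mm512_store_ps(in_out + 1*SIMD_WIDTH, a1);
    // ... repeated by hand for the remaining 22 components
}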
Attempted solutions:
I (unsuccessfully) tried using a class like the following (I had hoped that the switch-statement and everything else would get optimized out):
class Vector
{
public:
    inline __m512 & operator[](int i)
    {
        switch(i)
        {
            case 0: return component0;
            // ...
            case 23: return component23;
        }
    }

private:
    __m512 component0;
    // ...
    __m512 component23;
};
I have also considered macros, but could not find a good solution.
Any suggestions?
Thanks,
Simon
Following a comment in an answer below, here is a more detailed example of what I want to do (this is still a simplification, though):
inline void project(__m512 *projected_vector, __m512 *vector)
{
    for(int i=0; i<3; i++)
        projected_vector[i] = _mm512_add_ps(vector[i], vector[i+3]);
}
inline void matrix_multiply(__m512 *out, const float *matrix, __m512 *in)
{
    for(int i=0; i<3; i++)
    {
        // matrix holds scalar floats, so each element is broadcast across the SIMD lanes
        out[i] = _mm512_mul_ps(  _mm512_set1_ps(matrix[3*i+0]), in[0]);
        out[i] = _mm512_fmadd_ps(_mm512_set1_ps(matrix[3*i+1]), in[1], out[i]);
        out[i] = _mm512_fmadd_ps(_mm512_set1_ps(matrix[3*i+2]), in[2], out[i]);
    }
}
inline void reconstruct(__m512 *vector, __m512 *projected_vector)
{
    for(int i=0; i<3; i++)
        vector[i] = _mm512_add_ps(vector[i], projected_vector[i]);
    for(int i=0; i<3; i++)
        vector[i+3] = _mm512_sub_ps(vector[i], projected_vector[i]);
}
inline void hopping_term(float *in_out, const float *matrix_3x3, const float *in)
{
    // load/store here are understood to work on 6 registers
    // (analogous to the 24-register versions above)
    __m512 vector_in[6];
    __m512 vector_out[6];
    __m512 half_vector1[3];
    __m512 half_vector2[3];
    load(vector_in, in);
    project(half_vector1, vector_in);
    matrix_multiply(half_vector2, matrix_3x3, half_vector1);
    load(vector_out, in_out);
    reconstruct(vector_out, half_vector2);
    store(in_out, vector_out);
}
Answer 1:
I did something very similar to this recently using template metaprogramming. In the following code I think you just need to replace _mm256 with _mm512 and change SIMD_WIDTH to 16. This should unroll your loop 24 times.
#include <x86intrin.h>

#define SIMD_WIDTH 8
#define LEN 24*SIMD_WIDTH

template<int START, int N>
struct Repeat {
    static void add (float * x, float * y, float * z) {
        _mm256_store_ps(&z[START], _mm256_add_ps(_mm256_load_ps(&x[START]), _mm256_load_ps(&y[START])));
        Repeat<START+SIMD_WIDTH, N>::add(x,y,z);
    }
};

template<int N>
struct Repeat<LEN, N> {
    static void add (float * x, float * y, float * z) {}
};

void sum_unroll(float *x, float *y, float *z, const int n) {
    Repeat<0,LEN>::add(x,y,z);
}
When I compile with g++ -mavx -O3 -S -masm=intel the assembly looks like this:
vmovaps ymm0, YMMWORD PTR [rdi]
vaddps ymm0, ymm0, YMMWORD PTR [rsi]
vmovaps YMMWORD PTR [rdx], ymm0
vmovaps ymm0, YMMWORD PTR [rdi+32]
vaddps ymm0, ymm0, YMMWORD PTR [rsi+32]
vmovaps YMMWORD PTR [rdx+32], ymm0
vmovaps ymm0, YMMWORD PTR [rdi+64]
vaddps ymm0, ymm0, YMMWORD PTR [rsi+64]
vmovaps YMMWORD PTR [rdx+64], ymm0
vmovaps ymm0, YMMWORD PTR [rdi+96]
vaddps ymm0, ymm0, YMMWORD PTR [rsi+96]
vmovaps YMMWORD PTR [rdx+96], ymm0
vmovaps ymm0, YMMWORD PTR [rdi+128]
vaddps ymm0, ymm0, YMMWORD PTR [rsi+128]
vmovaps YMMWORD PTR [rdx+128], ymm0
...
vmovaps ymm0, YMMWORD PTR [rdi+736]
vaddps ymm0, ymm0, YMMWORD PTR [rsi+736]
vmovaps YMMWORD PTR [rdx+736], ymm0
I have used this method successfully recently when trying to push through 4 micro-ops at once (more with fusing), for example doing two loads, one store, and two FMAs in one clock cycle. I checked the results to make sure there were no errors, and I was able to increase the efficiency a bit in a few of my tests this way.
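As an untested sketch (not from the original answer), the __m512 version under the substitution mentioned at the start of this answer, with SIMD_WIDTH of 16 and the corresponding AVX-512 intrinsics (compiled with -mavx512f), would look roughly like this:

#include <x86intrin.h>

#define SIMD_WIDTH 16
#define LEN 24*SIMD_WIDTH

template<int START, int N>
struct Repeat {
    static void add(float *x, float *y, float *z) {
        // one full-width AVX-512 add per recursion step
        _mm512_store_ps(&z[START], _mm512_add_ps(_mm512_load_ps(&x[START]), _mm512_load_ps(&y[START])));
        Repeat<START+SIMD_WIDTH, N>::add(x, y, z);
    }
};

template<int N>
struct Repeat<LEN, N> {
    static void add(float *x, float *y, float *z) {}
};

void sum_unroll(float *x, float *y, float *z, const int n) {
    Repeat<0,LEN>::add(x, y, z);
}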
Edit: Here is a solution based on the OP's updated question. I have had problems using arrays for SIMD variables in the past as well, so I don't usually use arrays with them. Also, due to register renaming I rarely need many SIMD registers (I think the most I have used is 11). In this example only five are necessary.
#include <x86intrin.h>

#define SIMD_WIDTH 8

static inline __m256 load(const float *in) {
    return _mm256_loadu_ps(in);
}

inline void store(float *out, __m256 const &vector) {
    _mm256_storeu_ps(out, vector);
}

inline __m256 project(__m256 const &a, __m256 const &b) {
    return _mm256_add_ps(a, b);
}

inline void reconstruct(__m256 &vector1, __m256 &vector2, __m256 &projected_vector) {
    vector1 = _mm256_add_ps(vector1, projected_vector);
    vector2 = _mm256_sub_ps(vector1, projected_vector);
}

class So {
public:
    __m256 half_vector[3];

    So(const float *in) {
        for(int i=0; i<3; i++)
            half_vector[i] = project(load(&in[i*SIMD_WIDTH]), load(&in[(3+i)*SIMD_WIDTH]));
    }

    __m256 matrix_multiply(const float *matrix) {
        __m256 out;
        out = _mm256_mul_ps(_mm256_loadu_ps(&matrix[0]), half_vector[0]);
        out = _mm256_fmadd_ps(_mm256_loadu_ps(&matrix[1]), half_vector[1], out);
        out = _mm256_fmadd_ps(_mm256_loadu_ps(&matrix[2]), half_vector[2], out);
        return out;
    }
};

void hopping_term(float *in_out, const float *matrix_3x3, const float *in)
{
    So so(in);
    for(int i=0; i<3; i++) {
        __m256 vector_out1, vector_out2;
        __m256 half_vector2 = so.matrix_multiply(&matrix_3x3[3*i]);
        vector_out1 = load(&in_out[i*SIMD_WIDTH]);
        reconstruct(vector_out1, vector_out2, half_vector2);
        store(&in_out[(0+i)*SIMD_WIDTH], vector_out1);
        store(&in_out[(3+i)*SIMD_WIDTH], vector_out2);
    }
}
This only uses five AVX registers. Here is the assembly:
vmovups ymm3, YMMWORD PTR [rdx]
vmovups ymm2, YMMWORD PTR [rdx+32]
vaddps ymm3, ymm3, YMMWORD PTR [rdx+96]
vmovups ymm0, YMMWORD PTR [rdx+64]
vaddps ymm2, ymm2, YMMWORD PTR [rdx+128]
vaddps ymm0, ymm0, YMMWORD PTR [rdx+160]
vmulps ymm1, ymm3, YMMWORD PTR [rsi]
vfmadd231ps ymm1, ymm2, YMMWORD PTR [rsi+4]
vfmadd231ps ymm1, ymm0, YMMWORD PTR [rsi+8]
vaddps ymm4, ymm1, YMMWORD PTR [rdi]
vsubps ymm1, ymm4, ymm1
vmovups YMMWORD PTR [rdi], ymm4
vmovups YMMWORD PTR [rdi+96], ymm1
vmulps ymm1, ymm3, YMMWORD PTR [rsi+12]
vfmadd231ps ymm1, ymm2, YMMWORD PTR [rsi+16]
vfmadd231ps ymm1, ymm0, YMMWORD PTR [rsi+20]
vaddps ymm4, ymm1, YMMWORD PTR [rdi+32]
vsubps ymm1, ymm4, ymm1
vmovups YMMWORD PTR [rdi+32], ymm4
vmovups YMMWORD PTR [rdi+128], ymm1
vmulps ymm3, ymm3, YMMWORD PTR [rsi+24]
vfmadd132ps ymm2, ymm3, YMMWORD PTR [rsi+28]
vfmadd132ps ymm0, ymm2, YMMWORD PTR [rsi+32]
vaddps ymm1, ymm0, YMMWORD PTR [rdi+64]
vsubps ymm0, ymm1, ymm0
vmovups YMMWORD PTR [rdi+64], ymm1
vmovups YMMWORD PTR [rdi+160], ymm0
Answer 2:
To give an answer to my own question: I have come up with a solution based on a combination of templates and macros.
- a struct holds the "vector" of __m512 variables
- a template function is used to access elements of the struct
- the index into the vector is passed as a template parameter, so the compiler manages to optimize away the switch statement in the access function
- we cannot use a C loop over the template parameter, so a variadic macro is used to mimic loops
Example:
struct RegisterVector
{
    __m512 c0;
    __m512 c1;
    __m512 c2;
    __m512 c3;
    __m512 c4;
    __m512 c5;
};

template <int vector_index> __m512 &element(RegisterVector &vector)
{
    switch(vector_index)
    {
        case 0: return vector.c0;
        case 1: return vector.c1;
        case 2: return vector.c2;
        case 3: return vector.c3;
        case 4: return vector.c4;
        case 5: return vector.c5;
    }
}
#define LOOP3(loop_variable, start, ...) \
do { \
{ const int loop_variable = start + 0; __VA_ARGS__; } \
{ const int loop_variable = start + 1; __VA_ARGS__; } \
{ const int loop_variable = start + 2; __VA_ARGS__; } \
} while(0)
// simple usage example
LOOP3(c, 0, _mm512_add_ps(element<2*c+0>(vector1), element<2*c+1>(vector2)));
// more complex example: more than one instruction in the loop; cmul and cfmadd are themselves inline functions for a complex mul or fmadd
LOOP3(r, 0,
cmul (su3[2*(3*0+r)+0], su3[2*(3*0+r)+1], element<2*0+0>(in), element<2*0+1>(in), element<2*r+0>(out), element<2*r+1>(out));
cfmadd(su3[2*(3*1+r)+0], su3[2*(3*1+r)+1], element<2*1+0>(in), element<2*1+1>(in), element<2*r+0>(out), element<2*r+1>(out));
cfmadd(su3[2*(3*2+r)+0], su3[2*(3*2+r)+1], element<2*2+0>(in), element<2*2+1>(in), element<2*r+0>(out), element<2*r+1>(out)));
As you can see, you can do integer arithmetic in the template arguments as usual, and the loop index can also be used to index other arrays.
The LOOP macro could probably still be improved, but for my purposes it is sufficient.
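(Not part of the original answer, but as one possible improvement: with C++14 the variadic macro could be replaced by a small compile-time loop built on std::integer_sequence, along these lines. This is an untested sketch that assumes the element<>() accessor above.)

#include <type_traits>
#include <utility>

// Calls f(std::integral_constant<int, 0>{}), ..., f(std::integral_constant<int, N-1>{}).
template <typename F, int... Is>
inline void static_for_impl(F &&f, std::integer_sequence<int, Is...>)
{
    int dummy[] = { 0, (f(std::integral_constant<int, Is>{}), 0)... };
    (void)dummy;
}

template <int N, typename F>
inline void static_for(F &&f)
{
    static_for_impl(std::forward<F>(f), std::make_integer_sequence<int, N>{});
}

// Hypothetical usage, mirroring the simple LOOP3 example above:
// static_for<3>([&](auto c) {
//     constexpr int i = decltype(c)::value;
//     _mm512_add_ps(element<2*i+0>(vector1), element<2*i+1>(vector2));
// });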
Simon
Source: https://stackoverflow.com/questions/25812252/temporary-non-addressable-fixed-size-array