Peak FLOPs per cycle for ARM11 and Cortex-A7 cores in Raspberry Pi 1 and 2

前端 未结 1 2025
陌清茗
陌清茗 2020-12-21 04:43

I would like to know the peak FLOPs per cycle for the ARM1176JZF-S core in the Raspberry Pi 1 and for the Cortex-A7 cores in the Raspberry Pi 2.


From the ARM1176JZ

相关标签:
1条回答
  • 2020-12-21 04:52

    Example 1 - Compiled code: MP-MFLOPSPiNeon obtains >647 MFLOPS (data words 3.2k to 3.2M) on a 900 MHz RPi 2. The disassembly seems to be the same without threading. The compile/link command used and the C code for 32 operations per data word are below [someone might suggest faster compile options].

       MP-MFLOPS Compiled NEON v1.0
    
       gcc mpmflops.c cpuidc.c -lrt -lc -lm -O3 -mcpu=cortex-a7
        -mfloat-abi=hard -mfpu=neon-vfpv4 -funsafe-math-optimizations -lpthread -o MP-MFLOPSPiNeon
    
       32 OPs/Word 1 CPU 692 MFLOPS
    
     /*
      * In-place update of the first n elements of x. Each element is replaced
      * by an alternating sum of eleven (x[i] + offset) * scale terms:
      * 11 additions, 11 multiplications and 10 combining add/subtracts,
      * i.e. 32 floating-point operations per data word.
      * Nothing is written when n <= 0.
      */
     void triadplus2(int n, float a, float b, float c, float d,
                     float e, float f, float g, float h, float j,
                     float k, float l, float m, float o, float p,
                     float q, float r, float s, float t, float u,
                     float v, float w, float y, float *x)
     {
         int idx = 0;

         while (idx < n)
         {
             /* Read the element once; every term below uses this same value. */
             const float xv = x[idx];

             x[idx] = (xv+a)*b-(xv+c)*d+(xv+e)*f-(xv+g)*h+(xv+j)*k
                      -(xv+l)*m+(xv+o)*p-(xv+q)*r+(xv+s)*t-(xv+u)*v+(xv+w)*y;
             ++idx;
         }
     }
    

    The following is the (fairly complex) disassembly. Note the highlighted fused multiply-accumulate (vfma) and fused multiply-subtract (vfms) instructions, and the excessive number of load instructions.

     triadplus2:
        @ GCC output for: void triadplus2(int n, float a..y, float *x)
        @ Built with -mfloat-abi=hard (see compile line). The code below is
        @ consistent with this register assignment:
        @   r0 = n, r1 = x
        @   s0..s15 = a,b,c,d,e,f,g,h,j,k,l,m,o,p,q,r
        @   s,t,u,v,w,y = the 24 bytes of stack arguments ("args = 24")
    
        @ args = 24, pretend = 0, frame = 272
        @ frame_needed = 0, uses_anonymous_args = 0
        @ link register save eliminated.
        @ Save the core and VFP registers this function overwrites, then
        @ reserve the 272-byte frame used later for spilled NEON vectors.
        stmfd   sp!, {r4, r5, r6, r7}
        @ Flags from this compare are consumed by "ble .L57" below; the
        @ intervening stores/loads and the flag-less sub leave them intact.
        cmp     r0, #0
        fstmfdd sp!, {d8, d9, d10, d11, d12, d13, d14, d15}
        sub     sp, sp, #272
        @ Stack arguments now start at sp + 352 (16 + 64 + 272 bytes pushed).
        @ Load them in declaration order: s, t, u, v, w, y.
        flds    s21, [sp, #352]
        flds    s18, [sp, #356]
        flds    s19, [sp, #360]
        flds    s16, [sp, #364]
        flds    s20, [sp, #368]
        flds    s17, [sp, #372]
        @ n <= 0: nothing to do, go straight to the epilogue.
        ble     .L57
        @ r3 = number of leading elements to process with scalar code.
        @ sbfx sign-extends bit 2 of the address of x (giving 0 or -1), and
        @ the mask turns that into 0 or 3.
        sbfx    r3, r1, #2, #1
        and     r3, r3, #3
        @ Clamp to n (unsigned): r3 = min(r3, n).
        cmp     r3, r0
        movcs   r3, r0
        @ For n <= 4 everything is done by the scalar code (r3 = n) and
        @ execution falls through to .L59; larger n branches to .L80.
        cmp     r0, #4
        movls   r3, r0
        bhi     .L80
     LOOP HERE  
    .L59:
        @ Straight-line scalar code (no back-edge) for up to four leading
        @ elements x[0]..x[3]. r3 is the number of elements to handle here.
        @ r2 records how many have been done when control leaves for .L61.
        @ Each stanza computes one element: eleven (x + offset) sums, one
        @ fmuls to start the accumulator s22, then five vfma and five vfms.
        @ The terms are accumulated in a different order than the C source
        @ writes them. Each "cmp r3, #k" sets the flags used by the matching
        @ movls and bls; VFP arithmetic in between does not change them.

        @ ---- element 0 ----
        flds    s23, [r1]
        cmp     r3, #1
        fadds   s22, s23, s4
        movls   r2, #1
        fadds   s24, s23, s0
        fadds   s31, s23, s8
        fadds   s30, s23, s12
        fmuls   s22, s22, s5
        fadds   s29, s23, s21
        fadds   s28, s23, s20
        fadds   s27, s23, s6
        vfma.f32        s22, s24, s1
        fadds   s26, s23, s2
        fadds   s25, s23, s10
        fadds   s24, s23, s14
        fadds   s23, s23, s19
        vfma.f32        s22, s31, s9
        vfma.f32        s22, s30, s13
        vfma.f32        s22, s29, s18
        vfma.f32        s22, s28, s17
        vfms.f32        s22, s27, s7
        vfms.f32        s22, s26, s3
        vfms.f32        s22, s25, s11
        vfms.f32        s22, s24, s15
        vfms.f32        s22, s23, s16
        fsts    s22, [r1]
        @ Done if only one leading element was requested (r3 <= 1).
        bls     .L61
        @ ---- element 1 ----
        flds    s23, [r1, #4]
        cmp     r3, #2
        fadds   s22, s23, s4
        movls   r2, #2
        fadds   s24, s23, s0
        fadds   s31, s23, s8
        fadds   s30, s23, s12
        fmuls   s22, s22, s5
        fadds   s29, s23, s21
        fadds   s28, s23, s20
        fadds   s27, s23, s6
        vfma.f32        s22, s24, s1
        fadds   s26, s23, s2
        fadds   s25, s23, s10
        fadds   s24, s23, s14
        fadds   s23, s23, s19
        vfma.f32        s22, s31, s9
        vfma.f32        s22, s30, s13
        vfma.f32        s22, s29, s18
        vfma.f32        s22, s28, s17
        vfms.f32        s22, s27, s7
        vfms.f32        s22, s26, s3
        vfms.f32        s22, s25, s11
        vfms.f32        s22, s24, s15
        vfms.f32        s22, s23, s16
        fsts    s22, [r1, #4]
        @ Done if r3 <= 2.
        bls     .L61
        @ ---- element 2 ----
        flds    s23, [r1, #8]
        cmp     r3, #3
        fadds   s22, s23, s4
        movls   r2, #3
        fadds   s24, s23, s0
        fadds   s31, s23, s8
        fadds   s30, s23, s12
        fmuls   s22, s22, s5
        fadds   s29, s23, s21
        fadds   s28, s23, s20
        fadds   s27, s23, s6
        vfma.f32        s22, s24, s1
        fadds   s26, s23, s2
        fadds   s25, s23, s10
        fadds   s24, s23, s14
        fadds   s23, s23, s19
        vfma.f32        s22, s31, s9
        vfma.f32        s22, s30, s13
        vfma.f32        s22, s29, s18
        vfma.f32        s22, s28, s17
        vfms.f32        s22, s27, s7
        vfms.f32        s22, s26, s3
        vfms.f32        s22, s25, s11
        vfms.f32        s22, s24, s15
        vfms.f32        s22, s23, s16
        fsts    s22, [r1, #8]
        @ Done if r3 <= 3.
        bls     .L61
        @ ---- element 3 ---- (last stanza: r2 is set to 4 unconditionally
        @ and control falls through into .L61)
        flds    s23, [r1, #12]
        mov     r2, #4
        fadds   s22, s23, s20
        fadds   s24, s23, s21
        fadds   s31, s23, s12
        fadds   s30, s23, s8
        fmuls   s22, s22, s17
        fadds   s29, s23, s4
        fadds   s28, s23, s0
        fadds   s27, s23, s6
        vfma.f32        s22, s24, s18
        fadds   s26, s23, s2
        fadds   s25, s23, s10
        fadds   s24, s23, s14
        fadds   s23, s23, s19
        vfma.f32        s22, s31, s13
        vfma.f32        s22, s30, s9
        vfma.f32        s22, s29, s5
        vfma.f32        s22, s28, s1
        vfms.f32        s22, s27, s7
        vfms.f32        s22, s26, s3
        vfms.f32        s22, s25, s11
        vfms.f32        s22, s24, s15
        vfms.f32        s22, s23, s16
        fsts    s22, [r1, #12]
     .L61:
        @ Reached after the leading scalar elements; r3 = how many were done.
        @ If that already covers all n elements, return.
        cmp     r3, r0
        beq     .L57
        @ r6 = elements remaining, r4 = number of 4-element vector iterations,
        @ r7 = elements covered by those iterations (r4 * 4).
        rsb     r6, r3, r0
        mov     r4, r6, lsr #2
        @ movs sets Z when fewer than four elements remain: skip the vector
        @ loop and finish with the scalar tail at .L63.
        movs    r7, r4, asl #2
        beq     .L63
     .L81:
        @ Vector-loop setup: broadcast each scalar coefficient to all four
        @ lanes of a q register (vdup.32 from the d-register alias of the
        @ s-register), then spill most of them to the stack frame because
        @ they do not all fit in q registers.
        @ With a..r = s0..s15, s=s21, t=s18, u=s19, v=s16, w=s20, y=s17, the
        @ resulting layout is:
        @   [sp]=y   +16=a   +32=b   +48=c   +64=d   +80=e   +96=f  +112=g
        @   +128=h  +144=j  +160=k  +176=l  +192=m  +208=o  +224=u  +240=v
        @   +256=w
        @ kept in registers: q6=q, q7=p, q13=t, q14=s, q15=r
        vdup.32 q12, d1[1]
        vdup.32 q8, d0[0]
        vdup.32 q10, d0[1]
        vdup.32 q11, d1[0]
        vstr    d24, [sp, #64]
        vstr    d25, [sp, #72]
        vdup.32 q12, d3[1]
        vstr    d16, [sp, #16]
        vstr    d17, [sp, #24]
        vstr    d20, [sp, #32]
        vstr    d21, [sp, #40]
        vdup.32 q8, d2[0]
        vdup.32 q10, d2[1]
        vstr    d22, [sp, #48]
        vstr    d23, [sp, #56]
        vstr    d24, [sp, #128]
        vstr    d25, [sp, #136]
        vdup.32 q11, d3[0]
        vdup.32 q12, d5[1]
        vstr    d16, [sp, #80]
        vstr    d17, [sp, #88]
        vstr    d20, [sp, #96]
        vstr    d21, [sp, #104]
        vdup.32 q8, d4[0]
        vdup.32 q10, d4[1]
        vstr    d22, [sp, #112]
        vstr    d23, [sp, #120]
        vstr    d24, [sp, #192]
        vstr    d25, [sp, #200]
        vdup.32 q11, d5[0]
        vdup.32 q12, d10[0]
        vstr    d16, [sp, #144]
        vstr    d17, [sp, #152]
        vstr    d20, [sp, #160]
        vstr    d21, [sp, #168]
        vstr    d22, [sp, #176]
        vstr    d23, [sp, #184]
        vdup.32 q8, d6[0]
        vdup.32 q10, d9[1]
        vdup.32 q11, d8[0]
        vstr    d24, [sp, #256]
        vstr    d25, [sp, #264]
        vdup.32 q12, d8[1]
        vstr    d16, [sp, #208]
        vstr    d17, [sp, #216]
        @ These five broadcasts stay resident in registers for the loop.
        @ q6/q7 alias s24..s31, which the scalar code uses only as scratch,
        @ so the coefficients in s0..s21 are preserved.
        vdup.32 q7, d6[1]
        vdup.32 q6, d7[0]
        vdup.32 q15, d7[1]
        vdup.32 q14, d10[1]
        vdup.32 q13, d9[0]
        vstr    d20, [sp, #224]
        vstr    d21, [sp, #232]
        vstr    d22, [sp, #240]
        vstr    d23, [sp, #248]
        vst1.64 {d24-d25}, [sp:64]
        @ r3 = address of the first element not yet processed (load pointer),
        @ r5 = the same address (store pointer), ip = iteration counter.
        add     r3, r1, r3, asl #2
        mov     ip, #0
        mov     r5, r3
     .L69:
    
        vfma FUSED MULTIPLY ACCUMULATE or vfms SUBTRACT QUAD WORDS
    
        @ Vector loop body: four elements of x per iteration.
        @ q9 = x[i..i+3], q8 = accumulator, q10/q11 = (x + offset) temporaries,
        @ q12 (and q10 at the end) = coefficient reloaded from the stack.
        @ Per iteration: 11 vadd, 1 vmul, 5 vfma, 5 vfms = 32 floating-point
        @ operations per lane. Only q6, q7, q13, q14, q15 hold coefficients
        @ permanently; every other coefficient costs a pair of vldr (or one
        @ vld1) on each pass, which is the "excessive number of loads".
        vld1.64 {d18-d19}, [r3:64]!
        vldr    d20, [sp, #80]
        vldr    d21, [sp, #88]
        vldr    d22, [sp, #16]
        vldr    d23, [sp, #24]
        vadd.f32        q8, q9, q10
        vldr    d24, [sp, #96]
        vldr    d25, [sp, #104]
        vadd.f32        q10, q9, q11
        @ The accumulator starts as a plain product; all later terms are fused.
        vmul.f32        q8, q8, q12
        vldr    d22, [sp, #32]
        vldr    d23, [sp, #40]
        vldr    d24, [sp, #144]
        vldr    d25, [sp, #152]
        vfma.f32        q8, q10, q11
        @ Loop bookkeeping is interleaved with the arithmetic: ip counts
        @ completed iterations and is compared against r4 further down.
        add     ip, ip, #1
        vadd.f32        q11, q9, q12
        vldr    d24, [sp, #208]
        vldr    d25, [sp, #216]
        @ Flags set here are consumed by "bhi .L69" at the bottom; the NEON
        @ instructions in between do not modify them.
        cmp     r4, ip
        vadd.f32        q10, q9, q12
        vldr    d24, [sp, #160]
        vldr    d25, [sp, #168]
        vfma.f32        q8, q11, q12
        vadd.f32        q11, q9, q14
        vldr    d24, [sp, #256]
        vldr    d25, [sp, #264]
        vfma.f32        q8, q10, q7
        vadd.f32        q10, q9, q12
        vldr    d24, [sp, #112]
        vldr    d25, [sp, #120]
        vfma.f32        q8, q11, q13
        vadd.f32        q11, q9, q12
        vld1.64 {d24-d25}, [sp:64]
        vfma.f32        q8, q10, q12
        vldr    d24, [sp, #48]
        vldr    d25, [sp, #56]
        vadd.f32        q10, q9, q12
        vldr    d24, [sp, #128]
        vldr    d25, [sp, #136]
        @ From here on the remaining five terms are subtracted.
        vfms.f32        q8, q11, q12
        vldr    d24, [sp, #176]
        vldr    d25, [sp, #184]
        vadd.f32        q11, q9, q12
        vldr    d24, [sp, #64]
        vldr    d25, [sp, #72]
        vfms.f32        q8, q10, q12
        vldr    d24, [sp, #224]
        vldr    d25, [sp, #232]
        vadd.f32        q10, q9, q6
        vadd.f32        q9, q9, q12
        vldr    d24, [sp, #192]
        vldr    d25, [sp, #200]
        vfms.f32        q8, q11, q12
        vfms.f32        q8, q10, q15
        vldr    d20, [sp, #240]
        vldr    d21, [sp, #248]
        vfms.f32        q8, q9, q10
        @ Store the four results and advance the store pointer by 16 bytes.
        vst1.64 {d16-d17}, [r5:64]!
        @ Repeat while r4 > ip (unsigned), i.e. vector iterations remain.
        bhi     .L69
    
        END vfma FUSED MULTIPLY ACCUMULATE or vfms SUBTRACT QUAD WORDS
    
        @ r7 = elements done by the vector loop, r6 = elements that remained
        @ before it. Advance the element index r2 by r7; if nothing is left
        @ over, return (the flag-less add does not disturb the compare).
        cmp     r7, r6
        add     r2, r2, r7
        beq     .L57
     .L63:
        @ Scalar tail: up to three trailing elements, starting at index r2.
        @ Same computation as the other scalar code: eleven (x + offset) sums,
        @ one fmuls, five vfma and five vfms per element.

        @ ---- tail element at index r2 ----
        add     ip, r1, r2, asl #2
        add     r3, r2, #1
        @ Flags for the "ble .L57" after the store: done when n <= r2 + 1.
        cmp     r0, r3
        flds    s23, [ip]
        fadds   s22, s23, s4
        fadds   s24, s23, s0
        fadds   s31, s23, s8
        fadds   s30, s23, s12
        fmuls   s22, s22, s5
        fadds   s29, s23, s21
        fadds   s28, s23, s20
        fadds   s27, s23, s2
        vfma.f32        s22, s24, s1
        fadds   s26, s23, s6
        fadds   s25, s23, s14
        fadds   s24, s23, s10
        fadds   s23, s23, s19
        vfma.f32        s22, s31, s9
        vfma.f32        s22, s30, s13
        vfma.f32        s22, s29, s18
        vfma.f32        s22, s28, s17
        vfms.f32        s22, s27, s3
        vfms.f32        s22, s26, s7
        vfms.f32        s22, s25, s15
        vfms.f32        s22, s24, s11
        vfms.f32        s22, s23, s16
        fsts    s22, [ip]
        ble     .L57
        @ ---- tail element at index r2 + 1 ----
        add     r3, r1, r3, asl #2
        add     r2, r2, #2
        @ Done after this element when n <= (original r2) + 2.
        cmp     r0, r2
        flds    s23, [r3]
        fadds   s22, s23, s4
        fadds   s24, s23, s0
        fadds   s31, s23, s8
        fadds   s30, s23, s12
        fmuls   s22, s22, s5
        fadds   s29, s23, s21
        fadds   s28, s23, s20
        fadds   s27, s23, s6
        vfma.f32        s22, s24, s1
        fadds   s26, s23, s2
        fadds   s25, s23, s10
        fadds   s24, s23, s14
        fadds   s23, s23, s19
        vfma.f32        s22, s31, s9
        vfma.f32        s22, s30, s13
        vfma.f32        s22, s29, s18
        vfma.f32        s22, s28, s17
        vfms.f32        s22, s27, s7
        vfms.f32        s22, s26, s3
        vfms.f32        s22, s25, s11
        vfms.f32        s22, s24, s15
        vfms.f32        s22, s23, s16
        fsts    s22, [r3]
        ble     .L57
        @ ---- final tail element ----
        @ Nothing follows this element, so the sums overwrite the offset
        @ registers in place (s4, s0, s8, ...) and s5 is the accumulator.
        add     r2, r1, r2, asl #2
        flds    s22, [r2]
        fadds   s4, s22, s4
        fadds   s0, s22, s0
        fadds   s8, s22, s8
        fadds   s12, s22, s12
        fmuls   s5, s4, s5
        fadds   s21, s22, s21
        fadds   s20, s22, s20
        fadds   s6, s22, s6
        vfma.f32        s5, s0, s1
        fadds   s2, s22, s2
        fadds   s10, s22, s10
        fadds   s14, s22, s14
        fadds   s19, s22, s19
        vfma.f32        s5, s8, s9
        vfma.f32        s5, s12, s13
        vfma.f32        s5, s21, s18
        vfma.f32        s5, s20, s17
        vfms.f32        s5, s6, s7
        vfms.f32        s5, s2, s3
        vfms.f32        s5, s10, s11
        vfms.f32        s5, s14, s15
        vfms.f32        s5, s19, s16
        fsts    s5, [r2]
     .L57:
        @ Common exit: release the 272-byte frame, then restore the saved
        @ VFP registers d8-d15 and core registers r4-r7 in reverse order of
        @ the pushes at function entry, and return to the caller.
        add     sp, sp, #272
        @ sp needed
        fldmfdd sp!, {d8-d15}
        ldmfd   sp!, {r4, r5, r6, r7}
        bx      lr
     .L80:
        @ Entry path for n > 4. r3 = number of leading scalar elements.
        @ If it is non-zero, run the scalar code at .L59 first; otherwise
        @ set the element index r2 to 0 and go directly to the vector setup.
        cmp     r3, #0
        moveq   r2, r3
        bne     .L59
    
        @ Same computation as at .L61: r6 = elements remaining,
        @ r4 = number of 4-element vector iterations, r7 = r4 * 4.
        rsb     r6, r3, r0
        mov     r4, r6, lsr #2
        @ Non-zero r7: run the vector loop; otherwise only the scalar tail.
        movs    r7, r4, asl #2
        bne     .L81
        b       .L63
        .size   triadplus2, .-triadplus2
    

    Example 2 - Using NEON intrinsic functions (from before I knew of fused instructions) > 700 MFLOPS. First C code:

     32 Operations per word
     C NEON Intrinsics 
     n = words 3.2k, 32k, 3.2M
     similar results > 700 MFLOPS.
    
     for(i=0; i<n; i=i+4)
     {
         x41 = vld1q_f32(ptrx1);
    
         z41 = vaddq_f32(x41, a41);
         z41 = vmulq_f32(z41, b41);
    
         z42 = vaddq_f32(x41, c41);
         z42 = vmulq_f32(z42, d41);
         z41 = vsubq_f32(z41, z42);
    
         z42 = vaddq_f32(x41, e41);
         z42 = vmulq_f32(z42, f41);
         z41 = vaddq_f32(z41, z42);
    
         z42 = vaddq_f32(x41, g41);
         z42 = vmulq_f32(z42, h41);
         z41 = vsubq_f32(z41, z42);
    
         z42 = vaddq_f32(x41, j41);
         z42 = vmulq_f32(z42, k41);
         z41 = vaddq_f32(z41, z42);
    
         z42 = vaddq_f32(x41, l41);
         z42 = vmulq_f32(z42, m41);
         z41 = vsubq_f32(z41, z42);
    
         z42 = vaddq_f32(x41, o41);
         z42 = vmulq_f32(z42, p41);
         z41 = vaddq_f32(z41, z42);
    
         z42 = vaddq_f32(x41, q41);
         z42 = vmulq_f32(z42, r41);
         z41 = vsubq_f32(z41, z42);
    
         z42 = vaddq_f32(x41, s41);
         z42 = vmulq_f32(z42, t41);
         z41 = vaddq_f32(z41, z42);
    
         z42 = vaddq_f32(x41, u41);
         z42 = vmulq_f32(z42, v41);
         z41 = vsubq_f32(z41, z42);
    
         z42 = vaddq_f32(x41, w41);
         z42 = vmulq_f32(z42, y41);
         z41 = vaddq_f32(z41, z42);
    
         vst1q_f32(ptrx1, z41);
    
         ptrx1 = ptrx1 + 4;
     }
    

    Next is disassembly, again with excessive load instructions.

    Assembly Code
    
    .L26:
        @ Loop body for the NEON-intrinsics version: four floats per pass.
        @ q8 = the four data words, q11 (finally q10) = running result,
        @ q9/q10 = (x + offset) * scale temporaries.
        @ Per pass: 11 vadd (x + offset), 11 vmul, and 10 vadd/vsub to
        @ combine terms = 32 operations per lane; no fused vfma/vfms is used.
        @ Coefficients held in q0-q7 and q13-q15 need no reload; the rest
        @ are reloaded from the stack ([sp] .. [sp, #168]) on every pass.
        vld1.32 {d16-d17}, [ip]
        vld1.64 {d20-d21}, [sp:64]
        vadd.f32        q9, q8, q14
        vadd.f32        q11, q8, q10
        vldr    d24, [sp, #16]
        vldr    d25, [sp, #24]
        vmul.f32        q11, q11, q13
        vmul.f32        q9, q9, q12
        vldr    d24, [sp, #32]
        vldr    d25, [sp, #40]
        vsub.f32        q11, q11, q9
        vadd.f32        q10, q8, q12
        vldr    d18, [sp, #48]
        vldr    d19, [sp, #56]
        vldr    d24, [sp, #64]
        vldr    d25, [sp, #72]
        vmul.f32        q10, q10, q9
        vadd.f32        q9, q8, q12
        vadd.f32        q11, q11, q10
        vldr    d20, [sp, #80]
        vldr    d21, [sp, #88]
        vldr    d24, [sp, #96]
        vldr    d25, [sp, #104]
        vmul.f32        q9, q9, q10
        vadd.f32        q10, q8, q12
        vsub.f32        q11, q11, q9
        vldr    d18, [sp, #112]
        vldr    d19, [sp, #120]
        vldr    d24, [sp, #128]
        vldr    d25, [sp, #136]
        vmul.f32        q10, q10, q9
        vadd.f32        q9, q8, q12
        vadd.f32        q11, q11, q10
        vldr    d24, [sp, #160]
        vldr    d25, [sp, #168]
        vldr    d20, [sp, #144]
        vldr    d21, [sp, #152]
        @ Loop bookkeeping: r3 advances by 4 and is compared with r0
        @ (presumably i and n of the C loop). The flags are consumed by
        @ "bgt .L26" at the bottom; NEON instructions do not change them.
        add     r3, r3, #4
        cmp     r0, r3
        vmul.f32        q9, q9, q10
        vadd.f32        q10, q8, q12
        vsub.f32        q11, q11, q9
        @ The remaining terms use coefficients that stay in registers.
        vmul.f32        q10, q10, q15
        vadd.f32        q9, q8, q3
        vadd.f32        q11, q11, q10
        vmul.f32        q9, q9, q2
        vadd.f32        q10, q8, q1
        vsub.f32        q11, q11, q9
        vmul.f32        q10, q10, q0
        vadd.f32        q9, q8, q4
        vadd.f32        q10, q11, q10
        vmul.f32        q9, q9, q5
        vadd.f32        q8, q8, q6
        vsub.f32        q10, q10, q9
        vmul.f32        q8, q8, q7
        vadd.f32        q10, q10, q8
        @ Store the four results over the input and post-increment ip to
        @ the next vector.
        vst1.32 {d20-d21}, [ip]!
        @ Continue while r0 > r3 (signed).
        bgt     .L26
    
    0 讨论(0)
提交回复
热议问题