Week 2

In the second week we implemented microbenchmarks to measure the execution throughput, in the form of floating point operations executed per second, of the following instructions:

  • FMADD (scalar), FP32 variant.

  • FMLA (vector) with arrangement specifier 4S.

  • FMLA (vector) with arrangement specifier 2S.

Furthermore, we implemented a kernel that performs a permutation operation on a tensor abc of the form abc -> cba. The a and b dimensions were fixed to 8 and 4 respectively, while the c dimension was allowed to vary.

Execution Throughput

For each of the given instructions, we generally execute the same instruction repeatedly and measure the total time taken. If we let i denote the total number of instructions executed, t the required execution time and f the floating point operations executed per instruction, then the floating point operations z executed per second of the benchmark may be determined as follows:

z = (i * f) / t

The functions fmadd_kernel, fmla_4s_kernel and fmla_2s_kernel execute their respective instruction repeatedly as described.

However, these functions do not execute their respective instructions at the maximal possible rate. This is because, in these functions, each instruction depends, through its arguments, on the result of the preceding instruction. This prevents the CPU from fully utilizing the instruction pipelines. The functions fmadd_kernel_v2, fmla_4s_kernel_v2 and fmla_2s_kernel_v2 avoid this issue.

We obtained the following results when running the benchmarks on the provided Raspberry Pi machines.

  • fmadd_kernel: 1.12 GFlops

  • fmla_4s_kernel: 9.59 GFlops

  • fmla_2s_kernel: 4.80 GFlops

  • fmadd_kernel_v2: 9.59 GFlops

  • fmla_4s_kernel_v2: 38.37 GFlops

  • fmla_2s_kernel_v2: 19.17 GFlops

Permutation

.text   
//    @brief Permutation operation abc->cba
//    @param size_c Size of dimension c.
//    @param abc    Pointer to row-major tensor abc.
//    @param cba    Pointer to row-major tensor cba.

//   void perm_neon_abc_cba(int64_t       size_c
//                          float const * abc,
//                          float       * cba);
    .global perm_neon_abc_cba_a4
perm_neon_abc_cba_a4:

    // c stride
    lsl x5, x0, #4
    // b stride
    lsl x6, x0, #2
    
    // loop counter
    mov x8, xzr
    
    // c*b stride
    mul x7, x5, x6 



c_loop:
    // local pointer
    mov x3, x1
    mov x4, x2

    //move in c dimension
    add x3, x3, x9
    add x4, x4, x9

    ldr q0, [x3]
    add x3, x3, x6   
    ldr q1, [x3]
    add x3, x3, x6   
    ldr q2, [x3]
    add x3, x3, x6   
    ldr q3, [x3]
    add x3, x3, x6   


    ldr q4, [x3]
    add x3, x3, x6   
    ldr q5, [x3]
    add x3, x3, x6   
    ldr q6, [x3]
    add x3, x3, x6   
    ldr q7, [x3]
    add x3, x3, x6   

    zip1 v16.4s, v0.4s, v4.4s 
    zip1 v17.4s, v1.4s, v5.4s 
    zip1 v18.4s, v2.4s, v6.4s 
    zip1 v19.4s, v3.4s, v7.4s 

    ldr q8, [x3]
    add x3, x3, x6   
    ldr q9, [x3]
    add x3, x3, x6   
    ldr q10, [x3]
    add x3, x3, x6   
    ldr q11, [x3]
    add x3, x3, x6   


    ldr q12, [x3]
    add x3, x3, x6   
    ldr q13, [x3]
    add x3, x3, x6   
    ldr q14, [x3]
    add x3, x3, x6   
    ldr q15, [x3]
    add x3, x3, x6   

    zip1 v20.4s, v8.4s, v12.4s 
    zip1 v21.4s, v9.4s, v13.4s 
    zip1 v22.4s, v10.4s, v14.4s 
    zip1 v23.4s, v11.4s, v15.4s 

    zip1 v24.2d, v16.2d, v20.2d
    zip1 v25.2d, v17.2d, v21.2d
    zip1 v26.2d, v18.2d, v22.2d
    zip1 v27.2d, v19.2d, v23.2d

    // stp q24, q25, [x4]
    // stp q26, q27, [x4, #32*2]

    str q24, [x4]
    str q25, [x4, #4*4*2]
    str q25, [x4, #4*4*4]
    str q25, [x4, #4*4*6]

    zip2 v24.2d, v16.2d, v20.2d
    zip2 v25.2d, v17.2d, v21.2d
    zip2 v26.2d, v18.2d, v22.2d
    zip2 v27.2d, v19.2d, v23.2d
    
    add x4, x4, x7
    // stp q24, q25, [x4]
    // stp q26, q27, [x4, #32]

    str q24, [x4]
    str q25, [x4, #4*4*2]
    str q25, [x4, #4*4*4]
    str q25, [x4, #4*4*6]

    // zweite paare der geladenen vektoren

    zip2 v16.4s, v0.4s, v4.4s 
    zip2 v17.4s, v1.4s, v5.4s 
    zip2 v18.4s, v2.4s, v6.4s 
    zip2 v19.4s, v3.4s, v7.4s 

    zip2 v20.4s, v8.4s, v12.4s 
    zip2 v21.4s, v9.4s, v13.4s 
    zip2 v22.4s, v10.4s, v14.4s 
    zip2 v23.4s, v11.4s, v15.4s 

    zip1 v24.2d, v16.2d, v20.2d
    zip1 v25.2d, v17.2d, v21.2d
    zip1 v26.2d, v18.2d, v22.2d
    zip1 v27.2d, v19.2d, v23.2d

    add x4, x4, x7
    stp q24, q25, [x4]
    stp q26, q27, [x4, #32]

    zip2 v24.2d, v16.2d, v20.2d
    zip2 v25.2d, v17.2d, v21.2d
    zip2 v26.2d, v18.2d, v22.2d
    zip2 v27.2d, v19.2d, v23.2d
    
    add x4, x4, x7
    stp q24, q25, [x4]
    stp q26, q27, [x4, #32]

    // jetzte wurde c size 4 verarbeitet damit kommt jetzt schleife für vielfache von 4 

    add x8, x8, #4
    mul x9, x8, x5
    cmp x8, x0
    b.ne c_loop

    ret


    .global perm_neon_abc_cba
perm_neon_abc_cba:

    // c stride
    lsl x5, x0, #4
    // b stride
    lsl x6, x0, #2
    mov x8, xzr
    mov x10, xzr
    
    // c*b stride
    mul x7, x5, x6 
    // loop counter



c_loop:
    // local pointer
    mov x3, x1
    mov x4, x2

    //move in c dimension
    
    add x3, x3, x9
    add x4, x4, x9

a_loop:

    ldr q0, [x3]
    add x3, x3, x6   
    ldr q1, [x3]
    add x3, x3, x6   
    ldr q2, [x3]
    add x3, x3, x6   
    ldr q3, [x3]
    add x3, x3, x6   


    ldr q4, [x3]
    add x3, x3, x6   
    ldr q5, [x3]
    add x3, x3, x6   
    ldr q6, [x3]
    add x3, x3, x6   
    ldr q7, [x3]
    add x3, x3, x6   

    zip1 v16.4s, v0.4s, v4.4s 
    zip1 v17.4s, v1.4s, v5.4s 
    zip1 v18.4s, v2.4s, v6.4s 
    zip1 v19.4s, v3.4s, v7.4s 

    ldr q8, [x3]
    add x3, x3, x6   
    ldr q9, [x3]
    add x3, x3, x6   
    ldr q10, [x3]
    add x3, x3, x6   
    ldr q11, [x3]
    add x3, x3, x6   


    ldr q12, [x3]
    add x3, x3, x6   
    ldr q13, [x3]
    add x3, x3, x6   
    ldr q14, [x3]
    add x3, x3, x6   
    ldr q15, [x3]
    add x3, x3, x6   

    zip1 v20.4s, v8.4s, v12.4s 
    zip1 v21.4s, v9.4s, v13.4s 
    zip1 v22.4s, v10.4s, v14.4s 
    zip1 v23.4s, v11.4s, v15.4s 

    zip1 v24.2d, v16.2d, v20.2d
    zip1 v25.2d, v17.2d, v21.2d
    zip1 v26.2d, v18.2d, v22.2d
    zip1 v27.2d, v19.2d, v23.2d

    stp q24, q25, [x4]
    stp q26, q27, [x4, #32]

    zip2 v24.2d, v16.2d, v20.2d
    zip2 v25.2d, v17.2d, v21.2d
    zip2 v26.2d, v18.2d, v22.2d
    zip2 v27.2d, v19.2d, v23.2d
    
    add x4, x4, x7
    stp q24, q25, [x4]
    stp q26, q27, [x4, #32]

    // zweite paare der geladenen vektoren

    zip2 v16.4s, v0.4s, v4.4s 
    zip2 v17.4s, v1.4s, v5.4s 
    zip2 v18.4s, v2.4s, v6.4s 
    zip2 v19.4s, v3.4s, v7.4s 

    zip2 v20.4s, v8.4s, v12.4s 
    zip2 v21.4s, v9.4s, v13.4s 
    zip2 v22.4s, v10.4s, v14.4s 
    zip2 v23.4s, v11.4s, v15.4s 

    zip1 v24.2d, v16.2d, v20.2d
    zip1 v25.2d, v17.2d, v21.2d
    zip1 v26.2d, v18.2d, v22.2d
    zip1 v27.2d, v19.2d, v23.2d

    add x4, x4, x7
    stp q24, q25, [x4]
    stp q26, q27, [x4, #32]

    zip2 v24.2d, v16.2d, v20.2d
    zip2 v25.2d, v17.2d, v21.2d
    zip2 v26.2d, v18.2d, v22.2d
    zip2 v27.2d, v19.2d, v23.2d
    
    add x4, x4, x7
    stp q24, q25, [x4]
    stp q26, q27, [x4, #32]

    // jetzt wurde a size 4 verarbeitet dann für die nächsten a
    add x10, x10, #4
    // bytes forward for a
    mul x11, x10, #4
    cmp x10, #8
    b.ne a_loop

    
    // jetzt wurde c size 4 verarbeitet damit kommt jetzt schleife für vielfache von 4 

    add x8, x8, #4
    // bytes forward for c
    mul x9, x8, x5
    cmp x8, x0
    b.ne c_loop

    ret
  • permutation_kernel, c=4: 31.5954 GiB/s

  • permutation_kernel, c=8: 32.2068 GiB/s