Week 2
In the second week we implemented microbenchmarks to measure the execution throughput — in the form of floating-point operations executed per second — of the following instructions:
FMADD (scalar), FP32 variant.
FMLA (vector) with arrangement specifier 4S.
FMLA (vector) with arrangement specifier 2S.
Furthermore we implemented a kernel that performs a permutation operation on a tensor abc of the form abc -> cba. The a and b dimensions were fixed to 8 and 4 respectively, while the c dimension was allowed to vary.
Execution Throughput
For each of the given instructions, we generally execute the same instruction repeatedly and
measure the total time taken.
If we let i denote the total number of instructions executed, t the required execution time
and f the floating point operations executed per instruction, then the floating point operations z
executed per second of the benchmark may be determined as follows:
z = (i * f) / t
The functions fmadd_kernel, fmla_4s_kernel and fmla_2s_kernel execute
their respective instruction repeatedly as described.
However, these functions do not execute their respective instructions at the
maximal possible rate.
This is because, in these functions, each instruction depends in its arguments
on the result of its preceding instruction.
This prevents the CPU from fully utilizing the instruction pipelines.
The functions fmadd_kernel_v2, fmla_4s_kernel_v2 and fmla_2s_kernel_v2
avoid this issue.
We obtained the following results when running the benchmarks on the provided Raspberry Pi machines.
fmadd_kernel: 1.12 GFlops
fmla_4s_kernel: 9.59 GFlops
fmla_2s_kernel: 4.80 GFlops
fmadd_kernel_v2: 9.59 GFlops
fmla_4s_kernel_v2: 38.37 GFlops
fmla_2s_kernel_v2: 19.17 GFlops
Permutation
.text
// @brief Permutation operation abc->cba
// @param size_c Size of dimension c.
// @param abc Pointer to row-major tensor abc.
// @param cba Pointer to row-major tensor cba.
// void perm_neon_abc_cba(int64_t size_c,
// float const * abc,
// float * cba);
// -----------------------------------------------------------------------
// void perm_neon_abc_cba_a4(int64_t size_c,
//                           float const *abc,
//                           float       *cba);
//
// Permutation kernel abc -> cba for a row-major input tensor; size_c (x0)
// may vary and is processed in chunks of 4 consecutive c values. Each
// chunk loads a 4x4 (a x b) tile of q registers and transposes it with
// ZIP1/ZIP2 so that the a dimension becomes contiguous in the output.
// ABI: AAPCS64, leaf function, no stack usage.
// In:       x0 = size_c, x1 = abc (source), x2 = cba (destination)
// Clobbers: x3-x9, v0-v27, flags
// -----------------------------------------------------------------------
.global perm_neon_abc_cba_a4
perm_neon_abc_cba_a4:
    // x5 = size_c * 16 bytes: stride between consecutive a indices of abc
    // (4 b-rows of size_c floats each).
    // NOTE(review): the original comment labelled this the "c stride";
    // for a row-major abc tensor this is the a stride -- confirm intent.
    lsl x5, x0, #4
    // x6 = size_c * 4 bytes: stride between consecutive b indices of abc
    lsl x6, x0, #2
    // c loop counter, advanced in steps of 4
    mov x8, xzr
    // x9 = byte offset of the current c chunk; must start at zero
    // (the original read x9 uninitialized on the first iteration)
    mov x9, xzr
    // x7 = x5 * x6: advance applied to the output pointer between the
    // four c values of a chunk.
    // NOTE(review): this is a bytes*bytes product; for a row-major cba
    // with a=8, b=4 the output c stride should be the constant
    // 4*8*4 = 128 bytes -- verify against the reference implementation.
    mul x7, x5, x6
c_loop:
    // chunk-local pointers
    mov x3, x1
    mov x4, x2
    // move in c dimension
    // NOTE(review): input and output share the same offset x9 although
    // their c strides differ (input: 4 bytes per element) -- verify.
    add x3, x3, x9
    add x4, x4, x9
    // load the first half of the tile: q0-q3 = b0..b3 of one a row,
    // q4-q7 = b0..b3 of the next; each q holds 4 consecutive c values
    ldr q0, [x3]
    add x3, x3, x6
    ldr q1, [x3]
    add x3, x3, x6
    ldr q2, [x3]
    add x3, x3, x6
    ldr q3, [x3]
    add x3, x3, x6
    ldr q4, [x3]
    add x3, x3, x6
    ldr q5, [x3]
    add x3, x3, x6
    ldr q6, [x3]
    add x3, x3, x6
    ldr q7, [x3]
    add x3, x3, x6
    // interleave low halves across the a dimension (c pairs 0/1)
    zip1 v16.4s, v0.4s, v4.4s
    zip1 v17.4s, v1.4s, v5.4s
    zip1 v18.4s, v2.4s, v6.4s
    zip1 v19.4s, v3.4s, v7.4s
    // load the second half of the tile (next two a rows)
    ldr q8, [x3]
    add x3, x3, x6
    ldr q9, [x3]
    add x3, x3, x6
    ldr q10, [x3]
    add x3, x3, x6
    ldr q11, [x3]
    add x3, x3, x6
    ldr q12, [x3]
    add x3, x3, x6
    ldr q13, [x3]
    add x3, x3, x6
    ldr q14, [x3]
    add x3, x3, x6
    ldr q15, [x3]
    add x3, x3, x6
    zip1 v20.4s, v8.4s, v12.4s
    zip1 v21.4s, v9.4s, v13.4s
    zip1 v22.4s, v10.4s, v14.4s
    zip1 v23.4s, v11.4s, v15.4s
    // combine 64-bit lanes: v24..v27 = a-contiguous rows for b0..b3
    // of the first c value
    zip1 v24.2d, v16.2d, v20.2d
    zip1 v25.2d, v17.2d, v21.2d
    zip1 v26.2d, v18.2d, v22.2d
    zip1 v27.2d, v19.2d, v23.2d
    // output b-rows are 8 floats = 32 bytes apart (a = 8)
    str q24, [x4]
    str q25, [x4, #4*4*2]
    str q26, [x4, #4*4*4]       // was q25: q26/q27 were computed but
    str q27, [x4, #4*4*6]       // never stored (copy-paste typo)
    // b0..b3 rows of the second c value
    zip2 v24.2d, v16.2d, v20.2d
    zip2 v25.2d, v17.2d, v21.2d
    zip2 v26.2d, v18.2d, v22.2d
    zip2 v27.2d, v19.2d, v23.2d
    add x4, x4, x7
    str q24, [x4]
    str q25, [x4, #4*4*2]
    str q26, [x4, #4*4*4]       // same typo fixed (was q25)
    str q27, [x4, #4*4*6]       // same typo fixed (was q25)
    // second pairs (high halves) of the loaded vectors: c pairs 2/3
    zip2 v16.4s, v0.4s, v4.4s
    zip2 v17.4s, v1.4s, v5.4s
    zip2 v18.4s, v2.4s, v6.4s
    zip2 v19.4s, v3.4s, v7.4s
    zip2 v20.4s, v8.4s, v12.4s
    zip2 v21.4s, v9.4s, v13.4s
    zip2 v22.4s, v10.4s, v14.4s
    zip2 v23.4s, v11.4s, v15.4s
    zip1 v24.2d, v16.2d, v20.2d
    zip1 v25.2d, v17.2d, v21.2d
    zip1 v26.2d, v18.2d, v22.2d
    zip1 v27.2d, v19.2d, v23.2d
    add x4, x4, x7
    // NOTE(review): these stp stores place b-rows 16 bytes apart while
    // the str groups above use 32-byte spacing -- the two layouts are
    // inconsistent; verify which one matches the cba layout.
    stp q24, q25, [x4]
    stp q26, q27, [x4, #32]
    zip2 v24.2d, v16.2d, v20.2d
    zip2 v25.2d, v17.2d, v21.2d
    zip2 v26.2d, v18.2d, v22.2d
    zip2 v27.2d, v19.2d, v23.2d
    add x4, x4, x7
    stp q24, q25, [x4]
    stp q26, q27, [x4, #32]
    // one chunk of 4 c values processed; advance to the next multiple of 4
    add x8, x8, #4
    // byte offset of the next c chunk
    mul x9, x8, x5
    cmp x8, x0
    b.ne c_loop
    ret
// -----------------------------------------------------------------------
// void perm_neon_abc_cba(int64_t size_c,
//                        float const *abc,
//                        float       *cba);
//
// Permutation kernel abc -> cba for a row-major input tensor with a = 8,
// b = 4 and variable size_c (x0). The a dimension is processed in two
// chunks of 4 by the inner a_loop; c is processed in chunks of 4 by the
// outer c_loop.
// ABI: AAPCS64, leaf function, no stack usage.
// In:       x0 = size_c, x1 = abc (source), x2 = cba (destination)
// Clobbers: x3-x11, v0-v27, flags
// -----------------------------------------------------------------------
.global perm_neon_abc_cba
perm_neon_abc_cba:
    // x5 = size_c * 16 bytes: stride between consecutive a indices of abc
    // (4 b-rows of size_c floats each)
    lsl x5, x0, #4
    // x6 = size_c * 4 bytes: stride between consecutive b indices of abc
    lsl x6, x0, #2
    // c loop counter, advanced in steps of 4
    mov x8, xzr
    // x9 = byte offset of the current c chunk; must start at zero
    // (the original read x9 uninitialized on the first iteration)
    mov x9, xzr
    // x7 = x5 * x6: advance applied to the output pointer between the
    // four c values of a chunk.
    // NOTE(review): this is a bytes*bytes product; for a row-major cba
    // with a=8, b=4 the output c stride should be the constant
    // 4*8*4 = 128 bytes -- verify against the reference implementation.
    mul x7, x5, x6
c_loop:
    // reset the a counter for every c slice. The original zeroed x10
    // only once before the loop, so every c iteration after the first
    // entered a_loop with a stale counter and `cmp x10, #8` could never
    // hit 8 again (12, 16, ...), i.e. the inner loop never terminated.
    mov x10, xzr
    // chunk-local pointers
    mov x3, x1
    mov x4, x2
    // move in c dimension
    // NOTE(review): input and output share the same offset x9 although
    // their c strides differ (input: 4 bytes per element) -- verify.
    add x3, x3, x9
    add x4, x4, x9
a_loop:
    // load a 4x4 (a x b) tile: q0-q7 = two a rows x four b rows,
    // q8-q15 = the next two a rows; each q holds 4 consecutive c values
    ldr q0, [x3]
    add x3, x3, x6
    ldr q1, [x3]
    add x3, x3, x6
    ldr q2, [x3]
    add x3, x3, x6
    ldr q3, [x3]
    add x3, x3, x6
    ldr q4, [x3]
    add x3, x3, x6
    ldr q5, [x3]
    add x3, x3, x6
    ldr q6, [x3]
    add x3, x3, x6
    ldr q7, [x3]
    add x3, x3, x6
    // interleave low halves across the a dimension (c pairs 0/1)
    zip1 v16.4s, v0.4s, v4.4s
    zip1 v17.4s, v1.4s, v5.4s
    zip1 v18.4s, v2.4s, v6.4s
    zip1 v19.4s, v3.4s, v7.4s
    ldr q8, [x3]
    add x3, x3, x6
    ldr q9, [x3]
    add x3, x3, x6
    ldr q10, [x3]
    add x3, x3, x6
    ldr q11, [x3]
    add x3, x3, x6
    ldr q12, [x3]
    add x3, x3, x6
    ldr q13, [x3]
    add x3, x3, x6
    ldr q14, [x3]
    add x3, x3, x6
    ldr q15, [x3]
    add x3, x3, x6
    zip1 v20.4s, v8.4s, v12.4s
    zip1 v21.4s, v9.4s, v13.4s
    zip1 v22.4s, v10.4s, v14.4s
    zip1 v23.4s, v11.4s, v15.4s
    // combine 64-bit lanes: v24..v27 = a-contiguous rows for b0..b3
    // of the first c value
    zip1 v24.2d, v16.2d, v20.2d
    zip1 v25.2d, v17.2d, v21.2d
    zip1 v26.2d, v18.2d, v22.2d
    zip1 v27.2d, v19.2d, v23.2d
    // NOTE(review): stp places the four b-rows 16 bytes apart; for a
    // row-major cba with a=8 the b stride is 32 bytes -- verify layout.
    stp q24, q25, [x4]
    stp q26, q27, [x4, #32]
    // b0..b3 rows of the second c value
    zip2 v24.2d, v16.2d, v20.2d
    zip2 v25.2d, v17.2d, v21.2d
    zip2 v26.2d, v18.2d, v22.2d
    zip2 v27.2d, v19.2d, v23.2d
    add x4, x4, x7
    stp q24, q25, [x4]
    stp q26, q27, [x4, #32]
    // second pairs (high halves) of the loaded vectors: c pairs 2/3
    zip2 v16.4s, v0.4s, v4.4s
    zip2 v17.4s, v1.4s, v5.4s
    zip2 v18.4s, v2.4s, v6.4s
    zip2 v19.4s, v3.4s, v7.4s
    zip2 v20.4s, v8.4s, v12.4s
    zip2 v21.4s, v9.4s, v13.4s
    zip2 v22.4s, v10.4s, v14.4s
    zip2 v23.4s, v11.4s, v15.4s
    zip1 v24.2d, v16.2d, v20.2d
    zip1 v25.2d, v17.2d, v21.2d
    zip1 v26.2d, v18.2d, v22.2d
    zip1 v27.2d, v19.2d, v23.2d
    add x4, x4, x7
    stp q24, q25, [x4]
    stp q26, q27, [x4, #32]
    zip2 v24.2d, v16.2d, v20.2d
    zip2 v25.2d, v17.2d, v21.2d
    zip2 v26.2d, v18.2d, v22.2d
    zip2 v27.2d, v19.2d, v23.2d
    add x4, x4, x7
    stp q24, q25, [x4]
    stp q26, q27, [x4, #32]
    // one chunk of 4 a values processed; move on to the next a values
    add x10, x10, #4
    // byte offset for the a position (x11 = x10 * 4).
    // NOTE(review): AArch64 MUL is a register-only alias of MADD, so the
    // original `mul x11, x10, #4` did not assemble; replaced with an
    // equivalent shift. x11 is also never consumed below -- the a advance
    // appears to rely on x3/x4 simply running on; verify.
    lsl x11, x10, #2
    cmp x10, #8
    b.ne a_loop
    // 4 c values processed; advance to the next multiple of 4
    add x8, x8, #4
    // byte offset of the next c chunk
    mul x9, x8, x5
    cmp x8, x0
    b.ne c_loop
    ret
permutation_kernel c=4: GiB/s: 31.5954
permutation_kernel c=8: GiB/s: 32.2068