#ifdef BASE64_NEON64_USE_ASM
// Encode one 48-byte input block into 64 output bytes with AArch64 inline
// assembly. This mirrors enc_loop_neon64_inner, which can be used as a guide
// to the logic here.
//
//   s        Address of the source cursor. 48 bytes are read at *s, and *s
//            is advanced by 48 (post-indexed ld3).
//   o        Address of the destination cursor. 64 bytes are written at *o,
//            and *o is advanced by 64 (post-indexed st4).
//   tbl_enc  Four 16-byte vectors forming the 64-entry table that `tbl`
//            indexes with each 6-bit value.
static inline void
enc_loop_neon64_inner_asm (const uint8_t **s, uint8_t **o, const uint8x16x4_t tbl_enc)
{
	// Temporary registers, used as scratch space for the 6-bit indices.
	uint8x16_t tmp0, tmp1, tmp2, tmp3;

	// Mask that keeps the low six bits of every byte lane.
	const uint8x16_t n63 = vdupq_n_u8(63);

	// The `tbl` instruction requires its table registers to be numbered
	// consecutively. A plain "w" constraint gives no such guarantee, so
	// bind the four table vectors to v8..v11 explicitly. These must not
	// overlap the hard-coded v12..v15 used below. They are declared
	// immediately before the asm statement so that nothing in between
	// can disturb the bound registers.
	register uint8x16_t lut0 __asm__ ("v8")  = tbl_enc.val[0];
	register uint8x16_t lut1 __asm__ ("v9")  = tbl_enc.val[1];
	register uint8x16_t lut2 __asm__ ("v10") = tbl_enc.val[2];
	register uint8x16_t lut3 __asm__ ("v11") = tbl_enc.val[3];

	// `volatile` because the statement's purpose is its store side effect,
	// not only its register outputs.
	__asm__ volatile (

		// Load 48 bytes and deinterleave. The bytes are loaded to
		// hard-coded registers v12, v13 and v14, to ensure that they
		// are contiguous. Increment the source pointer.
		"ld3 {v12.16b, v13.16b, v14.16b}, [%[src]], #48 \n\t"

		// Reshuffle the bytes using temporaries: split every three
		// input bytes into four 6-bit indices.
		"ushr %[t0].16b, v12.16b,   #2         \n\t"
		"ushr %[t1].16b, v13.16b,   #4         \n\t"
		"ushr %[t2].16b, v14.16b,   #6         \n\t"
		"sli  %[t1].16b, v12.16b,   #4         \n\t"
		"sli  %[t2].16b, v13.16b,   #2         \n\t"
		"and  %[t1].16b, %[t1].16b, %[n63].16b \n\t"
		"and  %[t2].16b, %[t2].16b, %[n63].16b \n\t"
		"and  %[t3].16b, v14.16b,   %[n63].16b \n\t"

		// Translate the values to the Base64 alphabet. The results go
		// to v12..v15 so that they are contiguous for the st4 below.
		"tbl v12.16b, {%[l0].16b, %[l1].16b, %[l2].16b, %[l3].16b}, %[t0].16b \n\t"
		"tbl v13.16b, {%[l0].16b, %[l1].16b, %[l2].16b, %[l3].16b}, %[t1].16b \n\t"
		"tbl v14.16b, {%[l0].16b, %[l1].16b, %[l2].16b, %[l3].16b}, %[t2].16b \n\t"
		"tbl v15.16b, {%[l0].16b, %[l1].16b, %[l2].16b, %[l3].16b}, %[t3].16b \n\t"

		// Store 64 bytes and interleave. Increment the dest pointer.
		"st4 {v12.16b, v13.16b, v14.16b, v15.16b}, [%[dst]], #64 \n\t"

		// Outputs (modified). The temporaries are early-clobber
		// because they are written before all inputs have been read.
		: [src] "+r"  (*s),
		  [dst] "+r"  (*o),
		  [t0]  "=&w" (tmp0),
		  [t1]  "=&w" (tmp1),
		  [t2]  "=&w" (tmp2),
		  [t3]  "=&w" (tmp3)

		// Inputs (not modified).
		: [n63] "w" (n63),
		  [l0]  "w" (lut0),
		  [l1]  "w" (lut1),
		  [l2]  "w" (lut2),
		  [l3]  "w" (lut3)

		// Clobbers. "memory" tells the compiler that the statement
		// reads the source buffer and writes the destination buffer,
		// neither of which is expressed through the operands.
		: "v12", "v13", "v14", "v15", "memory"
	);
}
#endif
63+
164static inline void
265enc_loop_neon64_inner (const uint8_t * * s , uint8_t * * o , const uint8x16x4_t tbl_enc )
366{
67+ #ifdef BASE64_NEON64_USE_ASM
68+ enc_loop_neon64_inner_asm (s , o , tbl_enc );
69+ #else
470 // Load 48 bytes and deinterleave:
571 uint8x16x3_t src = vld3q_u8 (* s );
672
@@ -20,6 +86,7 @@ enc_loop_neon64_inner (const uint8_t **s, uint8_t **o, const uint8x16x4_t tbl_en
2086
2187 * s += 48 ;
2288 * o += 64 ;
89+ #endif
2390}
2491
2592static inline void
0 commit comments