diff --git a/deps/base64/base64/defines.txt b/deps/base64/base64/defines.txt new file mode 100644 index 00000000000000..17e7303594dd43 --- /dev/null +++ b/deps/base64/base64/defines.txt @@ -0,0 +1,362 @@ +#define __DBL_MIN_EXP__ (-1021) +#define __LDBL_MANT_DIG__ 113 +#define STOR(A,B,C,D) "st4 {"A".16b, "B".16b, "C".16b, "D".16b}, [%[dst]], #64 \n\t" +#define ROUND_A() SHUF("v2", "v3", "v4") LOAD("v12", "v13", "v14") TRAN("v2", "v3", "v4", "v5") STOR("v2", "v3", "v4", "v5") +#define __UINT_LEAST16_MAX__ 0xffff +#define __ARM_SIZEOF_WCHAR_T 4 +#define __DBL_DECIMAL_DIG__ 17 +#define __ATOMIC_ACQUIRE 2 +#define __FLT128_MAX_10_EXP__ 4932 +#define __FLT_MIN__ 1.17549435082228750796873653722224568e-38F +#define __GCC_IEC_559_COMPLEX 2 +#define __UINT_LEAST8_TYPE__ unsigned char +#define __FLT128_DIG__ 33 +#define __INTMAX_C(c) c ## L +#define __CHAR_BIT__ 8 +#define __UINT8_MAX__ 0xff +#define __WCHAR_MAX__ 0xffffffffU +#define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_2 1 +#define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4 1 +#define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8 1 +#define __GCC_ATOMIC_CHAR_LOCK_FREE 2 +#define __GCC_IEC_559 2 +#define __FLT32X_DECIMAL_DIG__ 17 +#define __FLT_EVAL_METHOD__ 0 +#define HAVE_NEON64 1 +#define __FLT64_DECIMAL_DIG__ 17 +#define __GCC_ATOMIC_CHAR32_T_LOCK_FREE 2 +#define __UINT_FAST32_TYPE__ long unsigned int +#define __UINT_FAST64_MAX__ 0xffffffffffffffffUL +#define __SIG_ATOMIC_TYPE__ int +#define __DBL_MIN_10_EXP__ (-307) +#define __FINITE_MATH_ONLY__ 0 +#define __FLT32X_MAX_EXP__ 1024 +#define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_1 1 +#define __GNUC_PATCHLEVEL__ 1 +#define __FLT32_HAS_DENORM__ 1 +#define __UINT_FAST8_MAX__ 0xff +#define __INT8_C(c) c +#define __INT_LEAST8_WIDTH__ 8 +#define __INTMAX_TYPE__ long int +#define __UINT_LEAST64_MAX__ 0xffffffffffffffffUL +#define __SHRT_MAX__ 0x7fff +#define __STDC_ISO_10646__ 201706L +#define __LDBL_MAX__ 1.18973149535723176508575932662800702e+4932L +#define __ARM_FEATURE_IDIV 1 +#define 
__FLT64X_MAX_10_EXP__ 4932 +#define __ARM_FP 14 +#define __FLT64X_HAS_QUIET_NAN__ 1 +#define __WINT_TYPE__ unsigned int +#define __UINT_LEAST8_MAX__ 0xff +#define __FLT128_DENORM_MIN__ 6.47517511943802511092443895822764655e-4966F128 +#define __UINTMAX_TYPE__ long unsigned int +#define _STDC_PREDEF_H 1 +#define __linux 1 +#define __FLT_EVAL_METHOD_TS_18661_3__ 0 +#define __CHAR_UNSIGNED__ 1 +#define __UINT32_MAX__ 0xffffffffU +#define __DBL_DENORM_MIN__ ((double)4.94065645841246544176568792868221372e-324L) +#define __AARCH64_CMODEL_SMALL__ 1 +#define __LDBL_MAX_EXP__ 16384 +#define __INT_FAST32_WIDTH__ 64 +#define __FLT128_MIN_EXP__ (-16381) +#define __FLT128_MIN_10_EXP__ (-4931) +#define __INT_LEAST16_WIDTH__ 16 +#define __FLT64X_MIN_EXP__ (-16381) +#define __SCHAR_MAX__ 0x7f +#define __FLT128_MANT_DIG__ 113 +#define __DBL_MAX__ ((double)1.79769313486231570814527423731704357e+308L) +#define __WCHAR_MIN__ 0U +#define __INT64_C(c) c ## L +#define __GCC_ATOMIC_POINTER_LOCK_FREE 2 +#define __SIZEOF_INT__ 4 +#define __INT_FAST64_WIDTH__ 64 +#define __PRAGMA_REDEFINE_EXTNAME 1 +#define __FLT32X_MANT_DIG__ 53 +#define __USER_LABEL_PREFIX__ +#define __FLT32_MAX_10_EXP__ 38 +#define __FLT64X_EPSILON__ 1.92592994438723585305597794258492732e-34F64x +#define __STDC_HOSTED__ 1 +#define __DBL_DIG__ 15 +#define __FLT32_DIG__ 6 +#define __FLT_EPSILON__ 1.19209289550781250000000000000000000e-7F +#define __SHRT_WIDTH__ 16 +#define __LDBL_MIN__ 3.36210314311209350626267781732175260e-4932L +#define __STDC_UTF_16__ 1 +#define __FLT16_HAS_QUIET_NAN__ 1 +#define __ARM_SIZEOF_MINIMAL_ENUM 4 +#define __FLT64X_DENORM_MIN__ 6.47517511943802511092443895822764655e-4966F64x +#define __FP_FAST_FMA 1 +#define __FLT32X_HAS_INFINITY__ 1 +#define __INT32_MAX__ 0x7fffffff +#define __FLT16_DIG__ 3 +#define __INT_WIDTH__ 32 +#define __SIZEOF_LONG__ 8 +#define __STDC_IEC_559__ 1 +#define __UINT16_C(c) c +#define __DECIMAL_DIG__ 36 +#define __STDC_IEC_559_COMPLEX__ 1 +#define __FLT64_EPSILON__ 
2.22044604925031308084726333618164062e-16F64 +#define __gnu_linux__ 1 +#define __INT16_MAX__ 0x7fff +#define LOAD(A,B,C) "ld3 {"A".16b, "B".16b, "C".16b}, [%[src]], #48 \n\t" +#define __FLT64X_MIN_10_EXP__ (-4931) +#define __LDBL_HAS_QUIET_NAN__ 1 +#define __FLT16_MIN_EXP__ (-13) +#define __FLT64_MANT_DIG__ 53 +#define __FLT64X_MANT_DIG__ 113 +#define __GNUC__ 10 +#define __FLT_HAS_DENORM__ 1 +#define __SIZEOF_LONG_DOUBLE__ 16 +#define __LDBL_MIN_EXP__ (-16381) +#define __FLT64_MAX_10_EXP__ 308 +#define __FLT16_MAX_10_EXP__ 4 +#define __INT_FAST32_MAX__ 0x7fffffffffffffffL +#define __DBL_HAS_INFINITY__ 1 +#define __HAVE_SPECULATION_SAFE_VALUE 1 +#define __INTPTR_WIDTH__ 64 +#define __FLT32X_HAS_DENORM__ 1 +#define __INT_FAST16_TYPE__ long int +#define __LDBL_HAS_DENORM__ 1 +#define __FLT128_HAS_INFINITY__ 1 +#define __FLT32_DECIMAL_DIG__ 9 +#define __DBL_MAX_EXP__ 1024 +#define __WCHAR_WIDTH__ 32 +#define __FLT32_MAX__ 3.40282346638528859811704183484516925e+38F32 +#define __GCC_ATOMIC_LONG_LOCK_FREE 2 +#define __FLT16_DECIMAL_DIG__ 5 +#define __FLT32_HAS_QUIET_NAN__ 1 +#define __LONG_LONG_MAX__ 0x7fffffffffffffffLL +#define __SIZEOF_SIZE_T__ 8 +#define __SIG_ATOMIC_WIDTH__ 32 +#define __ARM_ALIGN_MAX_PWR 28 +#define __SIZEOF_WINT_T__ 4 +#define __LONG_LONG_WIDTH__ 64 +#define __FLT32_MAX_EXP__ 128 +#define __ARM_FP16_FORMAT_IEEE 1 +#define __FLT_MIN_EXP__ (-125) +#define __FLT64_NORM_MAX__ 1.79769313486231570814527423731704357e+308F64 +#define __GCC_HAVE_DWARF2_CFI_ASM 1 +#define __FLT32X_MIN_EXP__ (-1021) +#define __INT_FAST64_TYPE__ long int +#define __ARM_FP16_ARGS 1 +#define __FP_FAST_FMAF 1 +#define __FLT128_NORM_MAX__ 1.18973149535723176508575932662800702e+4932F128 +#define __FLT64_DENORM_MIN__ 4.94065645841246544176568792868221372e-324F64 +#define __DBL_MIN__ ((double)2.22507385850720138309023271733240406e-308L) +#define __ARM_FEATURE_CLZ 1 +#define __FLT16_DENORM_MIN__ 5.96046447753906250000000000000000000e-8F16 +#define __unix__ 1 +#define 
__FLT64X_NORM_MAX__ 1.18973149535723176508575932662800702e+4932F64x +#define __SIZEOF_POINTER__ 8 +#define __GXX_ABI_VERSION 1014 +#define __LP64__ 1 +#define __DBL_HAS_QUIET_NAN__ 1 +#define __FLT_EVAL_METHOD_C99__ 0 +#define __FLT32X_EPSILON__ 2.22044604925031308084726333618164062e-16F32x +#define __FLT64_MIN_EXP__ (-1021) +#define __UINT64_MAX__ 0xffffffffffffffffUL +#define __LDBL_DECIMAL_DIG__ 36 +#define __FLT_MAX__ 3.40282346638528859811704183484516925e+38F +#define __aarch64__ 1 +#define __FLT64_MIN_10_EXP__ (-307) +#define __FLT64X_DECIMAL_DIG__ 36 +#define __REGISTER_PREFIX__ +#define __UINT16_MAX__ 0xffff +#define __INTMAX_WIDTH__ 64 +#define __LDBL_HAS_INFINITY__ 1 +#define __FLT32_MIN__ 1.17549435082228750796873653722224568e-38F32 +#define __FLT_DIG__ 6 +#define __NO_INLINE__ 1 +#define __DEC_EVAL_METHOD__ 2 +#define __FLT_MANT_DIG__ 24 +#define __FLT16_MIN_10_EXP__ (-4) +#define __VERSION__ "10.3.1 20210621" +#define __UINT64_C(c) c ## UL +#define __WINT_MAX__ 0xffffffffU +#define __INT_LEAST32_MAX__ 0x7fffffff +#define __GCC_ATOMIC_INT_LOCK_FREE 2 +#define __FLT32X_MIN__ 2.22507385850720138309023271733240406e-308F32x +#define __FLT128_MAX_EXP__ 16384 +#define __FLT32_MANT_DIG__ 24 +#define __FLOAT_WORD_ORDER__ __ORDER_LITTLE_ENDIAN__ +#define __FLT16_MAX_EXP__ 16 +#define __BIGGEST_ALIGNMENT__ 16 +#define __INT32_C(c) c +#define __FLT128_HAS_DENORM__ 1 +#define __SCHAR_WIDTH__ 8 +#define __ORDER_PDP_ENDIAN__ 3412 +#define __ARM_64BIT_STATE 1 +#define __INT_FAST32_TYPE__ long int +#define ROUND() LOAD("v12", "v13", "v14") SHUF("v12", "v13", "v14") TRAN("v12", "v13", "v14", "v15") STOR("v12", "v13", "v14", "v15") +#define __UINT_LEAST16_TYPE__ short unsigned int +#define __SIZE_TYPE__ long unsigned int +#define __FLT64X_DIG__ 33 +#define __ARM_FEATURE_FMA 1 +#define __INT8_TYPE__ signed char +#define __ELF__ 1 +#define __GCC_ASM_FLAG_OUTPUTS__ 1 +#define __GCC_ATOMIC_TEST_AND_SET_TRUEVAL 1 +#define __FLT_RADIX__ 2 +#define __INT_LEAST16_TYPE__ short 
int +#define __ARM_ARCH_PROFILE 65 +#define __LDBL_EPSILON__ 1.92592994438723585305597794258492732e-34L +#define __UINTMAX_C(c) c ## UL +#define __ARM_PCS_AAPCS64 1 +#define __SIG_ATOMIC_MAX__ 0x7fffffff +#define __GCC_ATOMIC_WCHAR_T_LOCK_FREE 2 +#define __SIZEOF_PTRDIFF_T__ 8 +#define __ATOMIC_RELAXED 0 +#define __LDBL_DIG__ 33 +#define __AARCH64EL__ 1 +#define __INT_FAST16_MAX__ 0x7fffffffffffffffL +#define __FLT64_DIG__ 15 +#define __UINT_FAST32_MAX__ 0xffffffffffffffffUL +#define __UINT_LEAST64_TYPE__ long unsigned int +#define __FLT16_EPSILON__ 9.76562500000000000000000000000000000e-4F16 +#define __FLT_HAS_QUIET_NAN__ 1 +#define __FLT_MAX_10_EXP__ 38 +#define __LONG_MAX__ 0x7fffffffffffffffL +#define ROUND_A_FIRST() LOAD("v2", "v3", "v4") ROUND_A() +#define __FLT64X_HAS_DENORM__ 1 +#define __FLT_HAS_INFINITY__ 1 +#define ROUND_B() SHUF("v12", "v13", "v14") LOAD("v2", "v3", "v4") TRAN("v12", "v13", "v14", "v15") STOR("v12", "v13", "v14", "v15") +#define __unix 1 +#define __DBL_HAS_DENORM__ 1 +#define __UINT_FAST16_TYPE__ long unsigned int +#define __FLT32X_HAS_QUIET_NAN__ 1 +#define __CHAR16_TYPE__ short unsigned int +#define __FLT64X_MAX_EXP__ 16384 +#define __SIZE_WIDTH__ 64 +#define __INT_LEAST16_MAX__ 0x7fff +#define __FLT16_NORM_MAX__ 6.55040000000000000000000000000000000e+4F16 +#define __INT64_MAX__ 0x7fffffffffffffffL +#define __FLT32_DENORM_MIN__ 1.40129846432481707092372958328991613e-45F32 +#define __INT_LEAST64_TYPE__ long int +#define __INT16_TYPE__ short int +#define __INT_LEAST8_TYPE__ signed char +#define __FLT16_MAX__ 6.55040000000000000000000000000000000e+4F16 +#define __STDC_VERSION__ 201710L +#define __INT_FAST8_MAX__ 0x7f +#define __ARM_ARCH 8 +#define __FLT128_MAX__ 1.18973149535723176508575932662800702e+4932F128 +#define __INTPTR_MAX__ 0x7fffffffffffffffL +#define linux 1 +#define __ARM_FEATURE_UNALIGNED 1 +#define __FLT64_HAS_QUIET_NAN__ 1 +#define __FLT32_MIN_10_EXP__ (-37) +#define __FLT32X_DIG__ 15 +#define __UINT8_TYPE__ unsigned char 
+#define __PTRDIFF_WIDTH__ 64 +#define __FLT64_HAS_INFINITY__ 1 +#define __FLT64X_MAX__ 1.18973149535723176508575932662800702e+4932F64x +#define __FLT16_HAS_INFINITY__ 1 +#define __SIG_ATOMIC_MIN__ (-__SIG_ATOMIC_MAX__ - 1) +#define __PTRDIFF_MAX__ 0x7fffffffffffffffL +#define __FLT16_MANT_DIG__ 11 +#define __INTPTR_TYPE__ long int +#define __UINT16_TYPE__ short unsigned int +#define __WCHAR_TYPE__ unsigned int +#define __UINTPTR_MAX__ 0xffffffffffffffffUL +#define __ARM_ARCH_8A 1 +#define __INT_FAST64_MAX__ 0x7fffffffffffffffL +#define __FLT_NORM_MAX__ 3.40282346638528859811704183484516925e+38F +#define __FLT32_HAS_INFINITY__ 1 +#define __UINT_FAST64_TYPE__ long unsigned int +#define __INT_MAX__ 0x7fffffff +#define __INT64_TYPE__ long int +#define __FLT_MAX_EXP__ 128 +#define __ORDER_BIG_ENDIAN__ 4321 +#define __DBL_MANT_DIG__ 53 +#define __INT_LEAST64_MAX__ 0x7fffffffffffffffL +#define __GCC_ATOMIC_CHAR16_T_LOCK_FREE 2 +#define __FP_FAST_FMAF32 1 +#define __UINT_LEAST32_TYPE__ unsigned int +#define __SIZEOF_SHORT__ 2 +#define __FLT32_NORM_MAX__ 3.40282346638528859811704183484516925e+38F32 +#define __GCC_ATOMIC_BOOL_LOCK_FREE 2 +#define __FLT64_MAX__ 1.79769313486231570814527423731704357e+308F64 +#define __WINT_WIDTH__ 32 +#define __FP_FAST_FMAF64 1 +#define __INT_LEAST8_MAX__ 0x7f +#define __INT_LEAST64_WIDTH__ 64 +#define __FLT32X_MAX_10_EXP__ 308 +#define __SIZEOF_INT128__ 16 +#define __FLT16_MIN__ 6.10351562500000000000000000000000000e-5F16 +#define __LDBL_MAX_10_EXP__ 4932 +#define __DBL_EPSILON__ ((double)2.22044604925031308084726333618164062e-16L) +#define __FLT32_MIN_EXP__ (-125) +#define __FLT128_MIN__ 3.36210314311209350626267781732175260e-4932F128 +#define _LP64 1 +#define __UINT8_C(c) c +#define __FLT64_MAX_EXP__ 1024 +#define __INT_LEAST32_TYPE__ int +#define __UINT64_TYPE__ long unsigned int +#define __ARM_NEON 1 +#define __FLT128_HAS_QUIET_NAN__ 1 +#define __INTMAX_MAX__ 0x7fffffffffffffffL +#define __UINT_FAST8_TYPE__ unsigned char +#define 
__INT_FAST8_TYPE__ signed char +#define __FLT64X_MIN__ 3.36210314311209350626267781732175260e-4932F64x +#define __GNUC_STDC_INLINE__ 1 +#define __FLT64_HAS_DENORM__ 1 +#define __FLT32_EPSILON__ 1.19209289550781250000000000000000000e-7F32 +#define __FP_FAST_FMAF32x 1 +#define __FLT16_HAS_DENORM__ 1 +#define __STDC_UTF_32__ 1 +#define __INT_FAST8_WIDTH__ 8 +#define __FLT32X_MAX__ 1.79769313486231570814527423731704357e+308F32x +#define __DBL_NORM_MAX__ ((double)1.79769313486231570814527423731704357e+308L) +#define __FLT64X_HAS_INFINITY__ 1 +#define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__ +#define __ARM_ALIGN_MAX_STACK_PWR 16 +#define __LDBL_DENORM_MIN__ 6.47517511943802511092443895822764655e-4966L +#define __SIZEOF_WCHAR_T__ 4 +#define unix 1 +#define __UINT32_C(c) c ## U +#define __FLT_DENORM_MIN__ 1.40129846432481707092372958328991613e-45F +#define __WINT_MIN__ 0U +#define __INT8_MAX__ 0x7f +#define __LONG_WIDTH__ 64 +#define __FLT32X_NORM_MAX__ 1.79769313486231570814527423731704357e+308F32x +#define __CHAR32_TYPE__ unsigned int +#define __ARM_FEATURE_NUMERIC_MAXMIN 1 +#define __INT32_TYPE__ int +#define __SIZEOF_DOUBLE__ 8 +#define __FLT_MIN_10_EXP__ (-37) +#define __FLT64_MIN__ 2.22507385850720138309023271733240406e-308F64 +#define __INT_LEAST32_WIDTH__ 32 +#define __SIZEOF_FLOAT__ 4 +#define __ATOMIC_CONSUME 1 +#define __GNUC_MINOR__ 3 +#define __INT_FAST16_WIDTH__ 64 +#define __UINTMAX_MAX__ 0xffffffffffffffffUL +#define __FLT32X_DENORM_MIN__ 4.94065645841246544176568792868221372e-324F32x +#define SHUF(A,B,C) "ushr %[t0].16b, "A".16b, #2 \n\t" "ushr %[t1].16b, "B".16b, #4 \n\t" "ushr %[t2].16b, "C".16b, #6 \n\t" "sli %[t1].16b, "A".16b, #4 \n\t" "sli %[t2].16b, "B".16b, #2 \n\t" "and %[t1].16b, %[t1].16b, %[n63].16b \n\t" "and %[t2].16b, %[t2].16b, %[n63].16b \n\t" "and %[t3].16b, "C".16b, %[n63].16b \n\t" +#define __DBL_MAX_10_EXP__ 308 +#define __INT16_C(c) c +#define __ARM_ARCH_ISA_A64 1 +#define __STDC__ 1 +#define __PTRDIFF_TYPE__ long int +#define 
TRAN(A,B,C,D) "tbl "A".16b, {v8.16b-v11.16b}, %[t0].16b \n\t" "tbl "B".16b, {v8.16b-v11.16b}, %[t1].16b \n\t" "tbl "C".16b, {v8.16b-v11.16b}, %[t2].16b \n\t" "tbl "D".16b, {v8.16b-v11.16b}, %[t3].16b \n\t" +#define __ATOMIC_SEQ_CST 5 +#define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_16 1 +#define ROUND_B_LAST() SHUF("v12", "v13", "v14") TRAN("v12", "v13", "v14", "v15") STOR("v12", "v13", "v14", "v15") +#define __UINT32_TYPE__ unsigned int +#define __FLT32X_MIN_10_EXP__ (-307) +#define __UINTPTR_TYPE__ long unsigned int +#define __linux__ 1 +#define __LDBL_MIN_10_EXP__ (-4931) +#define __FLT128_EPSILON__ 1.92592994438723585305597794258492732e-34F128 +#define __SIZEOF_LONG_LONG__ 8 +#define __FLT128_DECIMAL_DIG__ 36 +#define __GCC_ATOMIC_LLONG_LOCK_FREE 2 +#define __FLT_DECIMAL_DIG__ 9 +#define __UINT_FAST16_MAX__ 0xffffffffffffffffUL +#define __LDBL_NORM_MAX__ 1.18973149535723176508575932662800702e+4932L +#define __GCC_ATOMIC_SHORT_LOCK_FREE 2 +#define __ORDER_LITTLE_ENDIAN__ 1234 +#define __SIZE_MAX__ 0xffffffffffffffffUL +#define __UINT_LEAST32_MAX__ 0xffffffffU +#define __ATOMIC_ACQ_REL 4 +#define __ATOMIC_RELEASE 3 diff --git a/deps/base64/base64/lib/arch/neon64/codec.c b/deps/base64/base64/lib/arch/neon64/codec.c index fc953b23e7f9b2..79789bb7d0ecbc 100644 --- a/deps/base64/base64/lib/arch/neon64/codec.c +++ b/deps/base64/base64/lib/arch/neon64/codec.c @@ -58,8 +58,13 @@ load_64byte_table (const uint8_t *p) #include "../generic/32/dec_loop.c" #include "../generic/64/enc_loop.c" #include "dec_loop.c" -#include "enc_reshuffle.c" -#include "enc_loop.c" + +#ifdef BASE64_NEON64_USE_ASM +# include "enc_loop_asm.c" +#else +# include "enc_reshuffle.c" +# include "enc_loop.c" +#endif #endif // BASE64_USE_NEON64 diff --git a/deps/base64/base64/lib/arch/neon64/enc_loop.c b/deps/base64/base64/lib/arch/neon64/enc_loop.c index d1862f7a3aadf2..59a1c59728a139 100644 --- a/deps/base64/base64/lib/arch/neon64/enc_loop.c +++ b/deps/base64/base64/lib/arch/neon64/enc_loop.c @@ -1,72 +1,6 @@ 
-#ifdef BASE64_NEON64_USE_ASM -static inline void -enc_loop_neon64_inner_asm (const uint8_t **s, uint8_t **o, const uint8x16x4_t tbl_enc) -{ - // This function duplicates the functionality of enc_loop_neon64_inner, - // but entirely with inline assembly. This gives a significant speedup - // over using NEON intrinsics, which do not always generate very good - // code. The logic of the assembly is directly lifted from the - // intrinsics version, so it can be used as a guide to this code. - - // Temporary registers, used as scratch space. - uint8x16_t tmp0, tmp1, tmp2, tmp3; - - // Numeric constant. - const uint8x16_t n63 = vdupq_n_u8(63); - - __asm__ ( - - // Load 48 bytes and deinterleave. The bytes are loaded to - // hard-coded registers v12, v13 and v14, to ensure that they - // are contiguous. Increment the source pointer. - "ld3 {v12.16b, v13.16b, v14.16b}, [%[src]], #48 \n\t" - - // Reshuffle the bytes using temporaries. - "ushr %[t0].16b, v12.16b, #2 \n\t" - "ushr %[t1].16b, v13.16b, #4 \n\t" - "ushr %[t2].16b, v14.16b, #6 \n\t" - "sli %[t1].16b, v12.16b, #4 \n\t" - "sli %[t2].16b, v13.16b, #2 \n\t" - "and %[t1].16b, %[t1].16b, %[n63].16b \n\t" - "and %[t2].16b, %[t2].16b, %[n63].16b \n\t" - "and %[t3].16b, v14.16b, %[n63].16b \n\t" - - // Translate the values to the Base64 alphabet. - "tbl v12.16b, {%[l0].16b, %[l1].16b, %[l2].16b, %[l3].16b}, %[t0].16b \n\t" - "tbl v13.16b, {%[l0].16b, %[l1].16b, %[l2].16b, %[l3].16b}, %[t1].16b \n\t" - "tbl v14.16b, {%[l0].16b, %[l1].16b, %[l2].16b, %[l3].16b}, %[t2].16b \n\t" - "tbl v15.16b, {%[l0].16b, %[l1].16b, %[l2].16b, %[l3].16b}, %[t3].16b \n\t" - - // Store 64 bytes and interleave. Increment the dest pointer. - "st4 {v12.16b, v13.16b, v14.16b, v15.16b}, [%[dst]], #64 \n\t" - - // Outputs (modified). - : [src] "+r" (*s), - [dst] "+r" (*o), - [t0] "=&w" (tmp0), - [t1] "=&w" (tmp1), - [t2] "=&w" (tmp2), - [t3] "=&w" (tmp3) - - // Inputs (not modified). 
- : [n63] "w" (n63), - [l0] "w" (tbl_enc.val[0]), - [l1] "w" (tbl_enc.val[1]), - [l2] "w" (tbl_enc.val[2]), - [l3] "w" (tbl_enc.val[3]) - - // Clobbers. - : "v12", "v13", "v14", "v15" - ); -} -#endif - static inline void enc_loop_neon64_inner (const uint8_t **s, uint8_t **o, const uint8x16x4_t tbl_enc) { -#ifdef BASE64_NEON64_USE_ASM - enc_loop_neon64_inner_asm(s, o, tbl_enc); -#else // Load 48 bytes and deinterleave: uint8x16x3_t src = vld3q_u8(*s); @@ -86,7 +20,6 @@ enc_loop_neon64_inner (const uint8_t **s, uint8_t **o, const uint8x16x4_t tbl_en *s += 48; *o += 64; -#endif } static inline void diff --git a/deps/base64/base64/lib/arch/neon64/enc_loop_asm.c b/deps/base64/base64/lib/arch/neon64/enc_loop_asm.c new file mode 100644 index 00000000000000..cf2fd27e80d2ca --- /dev/null +++ b/deps/base64/base64/lib/arch/neon64/enc_loop_asm.c @@ -0,0 +1,167 @@ +// Apologies in advance for combining the preprocessor with inline assembly, +// two notoriously gnarly parts of C, but it was necessary to avoid a lot of +// code repetition. The preprocessor is used to template large sections of +// inline assembly that differ only in the registers used. If the code was +// written out by hand, it would become very large and hard to audit. + +// Generate a block of inline assembly that loads three user-defined registers +// A, B, C from memory and deinterleaves them, post-incrementing the src +// pointer. The register set should be sequential. +#define LOAD(A, B, C) \ + "ld3 {"A".16b, "B".16b, "C".16b}, [%[src]], #48 \n\t" + +// Generate a block of inline assembly that takes three deinterleaved registers +// and shuffles the bytes. The output is in temporary registers t0..t3. 
+#define SHUF(A, B, C) \
+	"ushr %[t0].16b, "A".16b, #2 \n\t" \
+	"ushr %[t1].16b, "B".16b, #4 \n\t" \
+	"ushr %[t2].16b, "C".16b, #6 \n\t" \
+	"sli %[t1].16b, "A".16b, #4 \n\t" \
+	"sli %[t2].16b, "B".16b, #2 \n\t" \
+	"and %[t1].16b, %[t1].16b, %[n63].16b \n\t" \
+	"and %[t2].16b, %[t2].16b, %[n63].16b \n\t" \
+	"and %[t3].16b, "C".16b, %[n63].16b \n\t"
+
+// Generate a block of inline assembly that takes temporary registers t0..t3
+// and translates them to the base64 alphabet, using a table loaded into
+// v8..v11. The output is in user-defined registers A..D. The table registers
+// are hard-coded here, so the enclosing asm statement has to load v8..v11
+// before the first TRAN is executed.
+#define TRAN(A, B, C, D) \
+	"tbl "A".16b, {v8.16b-v11.16b}, %[t0].16b \n\t" \
+	"tbl "B".16b, {v8.16b-v11.16b}, %[t1].16b \n\t" \
+	"tbl "C".16b, {v8.16b-v11.16b}, %[t2].16b \n\t" \
+	"tbl "D".16b, {v8.16b-v11.16b}, %[t3].16b \n\t"
+
+// Generate a block of inline assembly that interleaves four registers A..D
+// and stores them, post-incrementing the destination pointer by 64.
+#define STOR(A, B, C, D) \
+	"st4 {"A".16b, "B".16b, "C".16b, "D".16b}, [%[dst]], #64 \n\t"
+
+// Generate a block of inline assembly that generates a single self-contained
+// encoder round: fetch the data, process it, and store the result. Only
+// v12..v15 and the temporaries are used and every input is loaded by the
+// round itself, so a round can be entered directly through a label.
+#define ROUND() \
+	LOAD("v12", "v13", "v14") \
+	SHUF("v12", "v13", "v14") \
+	TRAN("v12", "v13", "v14", "v15") \
+	STOR("v12", "v13", "v14", "v15")
+
+// Generate a block of assembly that generates a type A interleaved encoder
+// round. It shuffles v2..v4, which were loaded by the previous type B round
+// (or by ROUND_A_FIRST), and loads v12..v14 for the next type B round before
+// translating and storing its own result through v2..v5.
+#define ROUND_A() \
+	SHUF("v2", "v3", "v4") \
+	LOAD("v12", "v13", "v14") \
+	TRAN("v2", "v3", "v4", "v5") \
+	STOR("v2", "v3", "v4", "v5")
+
+// Type B interleaved encoder round. Same as type A, but register sets swapped:
+// it works on v12..v15 and loads v2..v4 for the next type A round.
+#define ROUND_B() \
+	SHUF("v12", "v13", "v14") \
+	LOAD("v2", "v3", "v4") \
+	TRAN("v12", "v13", "v14", "v15") \
+	STOR("v12", "v13", "v14", "v15")
+
+// The first type A round needs to load its own registers.
+#define ROUND_A_FIRST() \
+	LOAD("v2", "v3", "v4") \
+	ROUND_A()
+
+// The last type B round omits the load for the next step.
+#define ROUND_B_LAST() \
+	SHUF("v12", "v13", "v14") \
+	TRAN("v12", "v13", "v14", "v15") \
+	STOR("v12", "v13", "v14", "v15")
+
+// Suppress clang's warning that the literal string in the asm statement is
+// overlong (longer than the ISO-mandated minimum size of 4095 bytes for C99
+// compilers). It may be true, but the goal here is not C99 portability.
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Woverlength-strings"
+
+// Encode the input in whole rounds of 48 bytes, each producing 64 bytes of
+// output, using one inline assembly statement.
+//
+//   s     Pointer to the source pointer; advanced by 48 bytes per round.
+//   slen  Input bytes available; reduced by 48 per round. With fewer than 48
+//         bytes the function returns without changing anything.
+//   o     Pointer to the destination pointer; advanced by 64 bytes per round.
+//   olen  Running output length; increased by 64 per round.
+//
+// The capacity of the destination buffer is not checked here.
+static inline void
+enc_loop_neon64 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+{
+	size_t rounds = *slen / 48;
+
+	if (rounds == 0) {
+		return;
+	}
+
+	*slen -= rounds * 48;	// 48 bytes consumed per round.
+	*olen += rounds * 64;	// 64 bytes produced per round.
+
+	// Number of times to go through the 8x loop.
+	size_t loops = rounds / 8;
+
+	// Number of rounds remaining after the 8x loop.
+	rounds %= 8;
+
+	// Temporary registers, used as scratch space.
+	uint8x16_t tmp0, tmp1, tmp2, tmp3;
+
+	__asm__ volatile (
+
+		// Load the encoding table into v8..v11.
+		" ld1 {v8.16b-v11.16b}, [%[tbl]] \n\t"
+
+		// If there are eight rounds or more, enter an 8x unrolled loop
+		// of interleaved encoding rounds. The rounds interleave memory
+		// operations (load/store) with data operations to maximize
+		// pipeline throughput.
+		" cbz %[loops], 4f \n\t"
+
+		// The SIMD instructions do not touch the flags, so the result
+		// of this subs is still valid for the b.ne at the loop's end.
+		"88: subs %[loops], %[loops], #1 \n\t"
+		" " ROUND_A_FIRST()
+		" " ROUND_B()
+		" " ROUND_A()
+		" " ROUND_B()
+		" " ROUND_A()
+		" " ROUND_B()
+		" " ROUND_A()
+		" " ROUND_B_LAST()
+		" b.ne 88b \n\t"
+
+		// Enter a 4x unrolled loop for rounds of 4 or more.
+		"4: cmp %[rounds], #4 \n\t"
+		" b.lt 30f \n\t"
+		" " ROUND_A_FIRST()
+		" " ROUND_B()
+		" " ROUND_A()
+		" " ROUND_B_LAST()
+		" sub %[rounds], %[rounds], #4 \n\t"
+
+		// Dispatch the remaining rounds 0..3.
+		"30: cbz %[rounds], 0f \n\t"
+		" cmp %[rounds], #2 \n\t"
+		" b.eq 2f \n\t"
+		" b.lt 1f \n\t"
+
+		// Block of non-interlaced encoding rounds, which can each
+		// individually be jumped to. Rounds fall through to the next.
+		"3: " ROUND()
+		"2: " ROUND()
+		"1: " ROUND()
+		"0: \n\t"
+
+	// Outputs (modified). %[rounds] is listed here because the 4x block
+	// subtracts 4 from it; an asm statement must not write to an
+	// input-only operand.
+	: [loops] "+r" (loops),
+	  [rounds] "+r" (rounds),
+	  [src] "+r" (*s),
+	  [dst] "+r" (*o),
+	  [t0] "=&w" (tmp0),
+	  [t1] "=&w" (tmp1),
+	  [t2] "=&w" (tmp2),
+	  [t3] "=&w" (tmp3)
+
+	// Inputs (not modified).
+	: [tbl] "r" (base64_table_enc_6bit),
+	  [n63] "w" (vdupq_n_u8(63))
+
+	// Clobbers. Besides the hard-coded SIMD registers: "cc" because subs
+	// and cmp write the condition flags, and "memory" because the loads
+	// through %[src] and the stores through %[dst] are not visible to the
+	// compiler from the operand list.
+	: "v2", "v3", "v4", "v5",
+	  "v8", "v9", "v10", "v11",
+	  "v12", "v13", "v14", "v15",
+	  "cc", "memory"
+	);
+}
+
+#pragma GCC diagnostic pop