Commit 13a9e81

nodejs-github-bot authored and richardlau committed on Mar 20, 2024
deps: update base64 to 0.5.1
PR-URL: #50629
Fixes: #50561
Fixes: #45091
Reviewed-By: Luigi Pinca <luigipinca@gmail.com>
Reviewed-By: Mohammed Keyvanzadeh <mohammadkeyvanzade94@gmail.com>
Reviewed-By: Yagiz Nizipli <yagiz@nizipli.com>
Reviewed-By: Richard Lau <rlau@redhat.com>
1 parent b4502d3 commit 13a9e81

31 files changed: +1684, -135 lines
 

deps/base64/base64.gyp (+27)
@@ -46,13 +46,15 @@
4646
'HAVE_SSE42=1',
4747
'HAVE_AVX=1',
4848
'HAVE_AVX2=1',
49+
'HAVE_AVX512=1',
4950
],
5051
'dependencies': [
5152
'base64_ssse3',
5253
'base64_sse41',
5354
'base64_sse42',
5455
'base64_avx',
5556
'base64_avx2',
57+
'base64_avx512',
5658
],
5759
}, {
5860
'sources': [
@@ -61,6 +63,7 @@
6163
'base64/lib/arch/sse42/codec.c',
6264
'base64/lib/arch/avx/codec.c',
6365
'base64/lib/arch/avx2/codec.c',
66+
'base64/lib/arch/avx512/codec.c',
6467
],
6568
}],
6669
],
@@ -162,6 +165,30 @@
162165
],
163166
},
164167

168+
{
169+
'target_name': 'base64_avx512',
170+
'type': 'static_library',
171+
'include_dirs': [ 'base64/include', 'base64/lib' ],
172+
'sources': [ 'base64/lib/arch/avx512/codec.c' ],
173+
'defines': [ 'BASE64_STATIC_DEFINE', 'HAVE_AVX512=1' ],
174+
'conditions': [
175+
[ 'OS!="win"', {
176+
'cflags': [ '-mavx512vl', '-mavx512vbmi' ],
177+
'xcode_settings': {
178+
'OTHER_CFLAGS': [ '-mavx512vl', '-mavx512vbmi' ]
179+
},
180+
}, {
181+
'msvs_settings': {
182+
'VCCLCompilerTool': {
183+
'AdditionalOptions': [
184+
'/arch:AVX512'
185+
],
186+
},
187+
},
188+
}],
189+
],
190+
},
191+
165192
{
166193
'target_name': 'base64_neon32',
167194
'type': 'static_library',

deps/base64/base64/.gitignore (+1, -12)
@@ -1,12 +1 @@
1-
*.o
2-
bin/base64
3-
lib/config.h
4-
test/benchmark
5-
test/test_base64
6-
7-
# visual studio symbol db, etc.
8-
.vs/
9-
# build directory used by CMakePresets
10-
out/
11-
# private cmake presets
12-
CMakeUserPresets.json
1+
# Intentionally empty

deps/base64/base64/CMakeLists.txt (+5, -1)
@@ -17,7 +17,7 @@ if (POLICY CMP0127)
1717
cmake_policy(SET CMP0127 NEW)
1818
endif()
1919

20-
project(base64 LANGUAGES C VERSION 0.5.0)
20+
project(base64 LANGUAGES C VERSION 0.5.1)
2121

2222
include(GNUInstallDirs)
2323
include(CMakeDependentOption)
@@ -62,6 +62,8 @@ cmake_dependent_option(BASE64_WITH_AVX "add AVX codepath" ON ${_IS_X86} OFF)
6262
add_feature_info(AVX BASE64_WITH_AVX "add AVX codepath")
6363
cmake_dependent_option(BASE64_WITH_AVX2 "add AVX 2 codepath" ON ${_IS_X86} OFF)
6464
add_feature_info(AVX2 BASE64_WITH_AVX2 "add AVX2 codepath")
65+
cmake_dependent_option(BASE64_WITH_AVX512 "add AVX 512 codepath" ON ${_IS_X86} OFF)
66+
add_feature_info(AVX512 BASE64_WITH_AVX512 "add AVX512 codepath")
6567

6668
cmake_dependent_option(BASE64_WITH_NEON32 "add NEON32 codepath" OFF _TARGET_ARCH_arm OFF)
6769
add_feature_info(NEON32 BASE64_WITH_NEON32 "add NEON32 codepath")
@@ -118,6 +120,7 @@ add_library(base64
118120
lib/arch/sse42/codec.c
119121
lib/arch/avx/codec.c
120122
lib/arch/avx2/codec.c
123+
lib/arch/avx512/codec.c
121124

122125
lib/arch/neon32/codec.c
123126
lib/arch/neon64/codec.c
@@ -206,6 +209,7 @@ if (_TARGET_ARCH STREQUAL "x86" OR _TARGET_ARCH STREQUAL "x64")
206209
configure_codec(SSE42 __SSSE4_2__)
207210
configure_codec(AVX)
208211
configure_codec(AVX2)
212+
configure_codec(AVX512)
209213

210214
elseif (_TARGET_ARCH STREQUAL "arm")
211215
set(BASE64_NEON32_CFLAGS "${COMPILE_FLAGS_NEON32}" CACHE STRING "the NEON32 compile flags (for 'lib/arch/neon32/codec.c')")

deps/base64/base64/LICENSE (+2, -2)
@@ -1,7 +1,7 @@
11
Copyright (c) 2005-2007, Nick Galbreath
2-
Copyright (c) 2013-2019, Alfred Klomp
3-
Copyright (c) 2015-2017, Wojciech Mula
2+
Copyright (c) 2015-2018, Wojciech Muła
43
Copyright (c) 2016-2017, Matthieu Darbois
4+
Copyright (c) 2013-2022, Alfred Klomp
55
All rights reserved.
66

77
Redistribution and use in source and binary forms, with or without

deps/base64/base64/Makefile (+8, -1)
@@ -4,6 +4,7 @@ CFLAGS += -std=c99 -O3 -Wall -Wextra -pedantic
44
OBJCOPY ?= objcopy
55

66
OBJS = \
7+
lib/arch/avx512/codec.o \
78
lib/arch/avx2/codec.o \
89
lib/arch/generic/codec.o \
910
lib/arch/neon32/codec.o \
@@ -16,6 +17,7 @@ OBJS = \
1617
lib/codec_choose.o \
1718
lib/tables/tables.o
1819

20+
HAVE_AVX512 = 0
1921
HAVE_AVX2 = 0
2022
HAVE_NEON32 = 0
2123
HAVE_NEON64 = 0
@@ -26,6 +28,9 @@ HAVE_AVX = 0
2628

2729
# The user should supply compiler flags for the codecs they want to build.
2830
# Check which codecs we're going to include:
31+
ifdef AVX512_CFLAGS
32+
HAVE_AVX512 = 1
33+
endif
2934
ifdef AVX2_CFLAGS
3035
HAVE_AVX2 = 1
3136
endif
@@ -64,7 +69,8 @@ lib/libbase64.o: $(OBJS)
6469
$(OBJCOPY) --keep-global-symbols=lib/exports.txt $@
6570

6671
lib/config.h:
67-
@echo "#define HAVE_AVX2 $(HAVE_AVX2)" > $@
72+
@echo "#define HAVE_AVX512 $(HAVE_AVX512)" > $@
73+
@echo "#define HAVE_AVX2 $(HAVE_AVX2)" >> $@
6874
@echo "#define HAVE_NEON32 $(HAVE_NEON32)" >> $@
6975
@echo "#define HAVE_NEON64 $(HAVE_NEON64)" >> $@
7076
@echo "#define HAVE_SSSE3 $(HAVE_SSSE3)" >> $@
@@ -75,6 +81,7 @@ lib/config.h:
7581
$(OBJS): lib/config.h
7682
$(OBJS): CFLAGS += -Ilib
7783

84+
lib/arch/avx512/codec.o: CFLAGS += $(AVX512_CFLAGS)
7885
lib/arch/avx2/codec.o: CFLAGS += $(AVX2_CFLAGS)
7986
lib/arch/neon32/codec.o: CFLAGS += $(NEON32_CFLAGS)
8087
lib/arch/neon64/codec.o: CFLAGS += $(NEON64_CFLAGS)

deps/base64/base64/README.md (+21, -4)
@@ -3,7 +3,7 @@
33
[![Build Status](https://github.com/aklomp/base64/actions/workflows/test.yml/badge.svg)](https://github.com/aklomp/base64/actions/workflows/test.yml)
44

55
This is an implementation of a base64 stream encoding/decoding library in C99
6-
with SIMD (AVX2, NEON, AArch64/NEON, SSSE3, SSE4.1, SSE4.2, AVX) and
6+
with SIMD (AVX2, AVX512, NEON, AArch64/NEON, SSSE3, SSE4.1, SSE4.2, AVX) and
77
[OpenMP](http://www.openmp.org) acceleration. It also contains wrapper functions
88
to encode/decode simple length-delimited strings. This library aims to be:
99

@@ -19,6 +19,10 @@ will pick an optimized codec that lets it encode/decode 12 or 24 bytes at a
1919
time, which gives a speedup of four or more times compared to the "plain"
2020
bytewise codec.
2121

22+
AVX512 support currently covers encoding only, using the AVX512 VL and VBMI
23+
instructions; decoding falls back to the AVX2 implementation. These
24+
instructions are supported on CPUs from Cannon Lake (2018) onwards.
25+
2226
NEON support is hardcoded to on or off at compile time, because portable
2327
runtime feature detection is unavailable on ARM.
2428

@@ -59,6 +63,9 @@ optimizations described by Wojciech Muła in a
5963
[articles](http://0x80.pl/notesen/2016-01-17-sse-base64-decoding.html).
6064
His own code is [here](https://github.com/WojciechMula/toys/tree/master/base64).
6165

66+
The AVX512 encoder is based on code from Wojciech Muła's
67+
[base64simd](https://github.com/WojciechMula/base64simd) library.
68+
6269
The OpenMP implementation was added by Ferry Toth (@htot) from [Exalon Delft](http://www.exalondelft.nl).
6370

6471
## Building
@@ -76,8 +83,8 @@ To compile just the "plain" library without SIMD codecs, type:
7683
make lib/libbase64.o
7784
```
7885

79-
Optional SIMD codecs can be included by specifying the `AVX2_CFLAGS`, `NEON32_CFLAGS`, `NEON64_CFLAGS`,
80-
`SSSE3_CFLAGS`, `SSE41_CFLAGS`, `SSE42_CFLAGS` and/or `AVX_CFLAGS` environment variables.
86+
Optional SIMD codecs can be included by specifying the `AVX2_CFLAGS`, `AVX512_CFLAGS`,
87+
`NEON32_CFLAGS`, `NEON64_CFLAGS`, `SSSE3_CFLAGS`, `SSE41_CFLAGS`, `SSE42_CFLAGS` and/or `AVX_CFLAGS` environment variables.
8188
A typical build invocation on x86 looks like this:
8289

8390
```sh
@@ -93,6 +100,15 @@ Example:
93100
AVX2_CFLAGS=-mavx2 make
94101
```
95102

103+
### AVX512
104+
105+
To build and include the AVX512 codec, set the `AVX512_CFLAGS` environment variable to a value that will turn on AVX512 support in your compiler, typically `-mavx512vl -mavx512vbmi`.
106+
Example:
107+
108+
```sh
109+
AVX512_CFLAGS="-mavx512vl -mavx512vbmi" make
110+
```
111+
96112
The codec will only be used if runtime feature detection shows that the target machine supports AVX2.
97113

98114
### SSSE3
@@ -208,6 +224,7 @@ Mainly there for testing purposes, this is also useful on ARM where the only way
208224
The following constants can be used:
209225

210226
- `BASE64_FORCE_AVX2`
227+
- `BASE64_FORCE_AVX512`
211228
- `BASE64_FORCE_NEON32`
212229
- `BASE64_FORCE_NEON64`
213230
- `BASE64_FORCE_PLAIN`
@@ -434,7 +451,7 @@ x86 processors
434451
| i7-4770 @ 3.4 GHz DDR1600 OPENMP 4 thread | 4884\* | 7099\* | 4917\* | 7057\* | 4799\* | 7143\* | 4902\* | 7219\* |
435452
| i7-4770 @ 3.4 GHz DDR1600 OPENMP 8 thread | 5212\* | 8849\* | 5284\* | 9099\* | 5289\* | 9220\* | 4849\* | 9200\* |
436453
| i7-4870HQ @ 2.5 GHz | 1471\* | 3066\* | 6721\* | 6962\* | 7015\* | 8267\* | 8328\* | 11576\* |
437-
| i5-4590S @ 3.0 GHz | 3356 | 3197 | 4363 | 6104 | 4243 | 6233 | 4160 | 6344 |
454+
| i5-4590S @ 3.0 GHz | 3356 | 3197 | 4363 | 6104 | 4243\* | 6233 | 4160\* | 6344 |
438455
| Xeon X5570 @ 2.93 GHz | 2161 | 1508 | 3160 | 3915 | - | - | - | - |
439456
| Pentium4 @ 3.4 GHz | 896 | 740 | - | - | - | - | - | - |
440457
| Atom N270 | 243 | 266 | 508 | 387 | - | - | - | - |
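The `BASE64_FORCE_*` constants listed above, now including `BASE64_FORCE_AVX512`, bypass runtime codec selection and are mainly meant for testing. A minimal sketch of exercising the new flag through the one-shot `base64_encode()` wrapper from include/libbase64.h; it assumes a build with AVX512 enabled and a CPU with AVX512 VL/VBMI:

```c
#include <stdio.h>
#include <string.h>
#include "libbase64.h"

int main(void)
{
	const char *src = "Hello, world!";
	char out[64];
	size_t outlen = 0;

	// Forcing a codec skips runtime detection, so only do this on hardware
	// (and a build) that actually provides the forced codec.
	base64_encode(src, strlen(src), out, &outlen, BASE64_FORCE_AVX512);
	printf("%.*s\n", (int) outlen, out);
	return 0;
}
```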

deps/base64/base64/bin/base64.c (+434, -85; large diff not rendered)

deps/base64/base64/cmake/Modules/TargetSIMDInstructionSet.cmake (+2)
@@ -21,6 +21,7 @@ macro(define_SIMD_compile_flags)
2121
set(COMPILE_FLAGS_SSE42 "-msse4.2")
2222
set(COMPILE_FLAGS_AVX "-mavx")
2323
set(COMPILE_FLAGS_AVX2 "-mavx2")
24+
set(COMPILE_FLAGS_AVX512 "-mavx512vl -mavx512vbmi")
2425

2526
#arm
2627
set(COMPILE_FLAGS_NEON32 "-mfpu=neon")
@@ -30,5 +31,6 @@ macro(define_SIMD_compile_flags)
3031
set(COMPILE_FLAGS_SSE42 " ")
3132
set(COMPILE_FLAGS_AVX "/arch:AVX")
3233
set(COMPILE_FLAGS_AVX2 "/arch:AVX2")
34+
set(COMPILE_FLAGS_AVX512 "/arch:AVX512")
3335
endif()
3436
endmacro(define_SIMD_compile_flags)

deps/base64/base64/cmake/config.h.in (+3)
@@ -16,6 +16,9 @@
1616
#cmakedefine01 BASE64_WITH_AVX2
1717
#define HAVE_AVX2 BASE64_WITH_AVX2
1818

19+
#cmakedefine01 BASE64_WITH_AVX512
20+
#define HAVE_AVX512 BASE64_WITH_AVX512
21+
1922
#cmakedefine01 BASE64_WITH_NEON32
2023
#define HAVE_NEON32 BASE64_WITH_NEON32
2124

deps/base64/base64/include/libbase64.h (+1)
@@ -53,6 +53,7 @@ extern "C" {
5353
#define BASE64_FORCE_SSE41 (1 << 5)
5454
#define BASE64_FORCE_SSE42 (1 << 6)
5555
#define BASE64_FORCE_AVX (1 << 7)
56+
#define BASE64_FORCE_AVX512 (1 << 8)
5657

5758
struct base64_state {
5859
int eof;

deps/base64/base64/lib/arch/avx/codec.c (+27, -3)
@@ -11,19 +11,43 @@
1111
#if HAVE_AVX
1212
#include <immintrin.h>
1313

14+
// Only enable inline assembly on supported compilers and on 64-bit CPUs.
15+
#ifndef BASE64_AVX_USE_ASM
16+
# if (defined(__GNUC__) || defined(__clang__)) && BASE64_WORDSIZE == 64
17+
# define BASE64_AVX_USE_ASM 1
18+
# else
19+
# define BASE64_AVX_USE_ASM 0
20+
# endif
21+
#endif
22+
1423
#include "../ssse3/dec_reshuffle.c"
1524
#include "../ssse3/dec_loop.c"
16-
#include "../ssse3/enc_translate.c"
17-
#include "../ssse3/enc_reshuffle.c"
18-
#include "../ssse3/enc_loop.c"
25+
26+
#if BASE64_AVX_USE_ASM
27+
# include "enc_loop_asm.c"
28+
#else
29+
# include "../ssse3/enc_translate.c"
30+
# include "../ssse3/enc_reshuffle.c"
31+
# include "../ssse3/enc_loop.c"
32+
#endif
1933

2034
#endif // HAVE_AVX
2135

2236
BASE64_ENC_FUNCTION(avx)
2337
{
2438
#if HAVE_AVX
2539
#include "../generic/enc_head.c"
40+
41+
// For supported compilers, use a hand-optimized inline assembly
42+
// encoder. Otherwise fall back on the SSSE3 encoder, but compiled with
43+
// AVX flags to generate better optimized AVX code.
44+
45+
#if BASE64_AVX_USE_ASM
46+
enc_loop_avx(&s, &slen, &o, &olen);
47+
#else
2648
enc_loop_ssse3(&s, &slen, &o, &olen);
49+
#endif
50+
2751
#include "../generic/enc_tail.c"
2852
#else
2953
BASE64_ENC_STUB

deps/base64/base64/lib/arch/avx/enc_loop_asm.c (new file, +264)
@@ -0,0 +1,264 @@
1+
// Apologies in advance for combining the preprocessor with inline assembly,
2+
// two notoriously gnarly parts of C, but it was necessary to avoid a lot of
3+
// code repetition. The preprocessor is used to template large sections of
4+
// inline assembly that differ only in the registers used. If the code was
5+
// written out by hand, it would become very large and hard to audit.
6+
7+
// Generate a block of inline assembly that loads register R0 from memory. The
8+
// offset at which the register is loaded is set by the given round.
9+
#define LOAD(R0, ROUND) \
10+
"vlddqu ("#ROUND" * 12)(%[src]), %["R0"] \n\t"
11+
12+
// Generate a block of inline assembly that deinterleaves and shuffles register
13+
// R0 using preloaded constants. Outputs in R0 and R1.
14+
#define SHUF(R0, R1, R2) \
15+
"vpshufb %[lut0], %["R0"], %["R1"] \n\t" \
16+
"vpand %["R1"], %[msk0], %["R2"] \n\t" \
17+
"vpand %["R1"], %[msk2], %["R1"] \n\t" \
18+
"vpmulhuw %["R2"], %[msk1], %["R2"] \n\t" \
19+
"vpmullw %["R1"], %[msk3], %["R1"] \n\t" \
20+
"vpor %["R1"], %["R2"], %["R1"] \n\t"
21+
22+
// Generate a block of inline assembly that takes R0 and R1 and translates
23+
// their contents to the base64 alphabet, using preloaded constants.
24+
#define TRAN(R0, R1, R2) \
25+
"vpsubusb %[n51], %["R1"], %["R0"] \n\t" \
26+
"vpcmpgtb %[n25], %["R1"], %["R2"] \n\t" \
27+
"vpsubb %["R2"], %["R0"], %["R0"] \n\t" \
28+
"vpshufb %["R0"], %[lut1], %["R2"] \n\t" \
29+
"vpaddb %["R1"], %["R2"], %["R0"] \n\t"
30+
31+
// Generate a block of inline assembly that stores the given register R0 at an
32+
// offset set by the given round.
33+
#define STOR(R0, ROUND) \
34+
"vmovdqu %["R0"], ("#ROUND" * 16)(%[dst]) \n\t"
35+
36+
// Generate a block of inline assembly that generates a single self-contained
37+
// encoder round: fetch the data, process it, and store the result. Then update
38+
// the source and destination pointers.
39+
#define ROUND() \
40+
LOAD("a", 0) \
41+
SHUF("a", "b", "c") \
42+
TRAN("a", "b", "c") \
43+
STOR("a", 0) \
44+
"add $12, %[src] \n\t" \
45+
"add $16, %[dst] \n\t"
46+
47+
// Define a macro that initiates a three-way interleaved encoding round by
48+
// preloading registers a, b and c from memory.
49+
// The register graph shows which registers are in use during each step, and
50+
// is a visual aid for choosing registers for that step. Symbol index:
51+
//
52+
// + indicates that a register is loaded by that step.
53+
// | indicates that a register is in use and must not be touched.
54+
// - indicates that a register is decommissioned by that step.
55+
// x indicates that a register is used as a temporary by that step.
56+
// V indicates that a register is an input or output to the macro.
57+
//
58+
#define ROUND_3_INIT() /* a b c d e f */ \
59+
LOAD("a", 0) /* + */ \
60+
SHUF("a", "d", "e") /* | + x */ \
61+
LOAD("b", 1) /* | + | */ \
62+
TRAN("a", "d", "e") /* | | - x */ \
63+
LOAD("c", 2) /* V V V */
64+
65+
// Define a macro that translates, shuffles and stores the input registers A, B
66+
// and C, and preloads registers D, E and F for the next round.
67+
// This macro can be arbitrarily daisy-chained by feeding output registers D, E
68+
// and F back into the next round as input registers A, B and C. The macro
69+
// carefully interleaves memory operations with data operations for optimal
70+
// pipelined performance.
71+
72+
#define ROUND_3(ROUND, A,B,C,D,E,F) /* A B C D E F */ \
73+
LOAD(D, (ROUND + 3)) /* V V V + */ \
74+
SHUF(B, E, F) /* | | | | + x */ \
75+
STOR(A, (ROUND + 0)) /* - | | | | */ \
76+
TRAN(B, E, F) /* | | | - x */ \
77+
LOAD(E, (ROUND + 4)) /* | | | + */ \
78+
SHUF(C, A, F) /* + | | | | x */ \
79+
STOR(B, (ROUND + 1)) /* | - | | | */ \
80+
TRAN(C, A, F) /* - | | | x */ \
81+
LOAD(F, (ROUND + 5)) /* | | | + */ \
82+
SHUF(D, A, B) /* + x | | | | */ \
83+
STOR(C, (ROUND + 2)) /* | - | | | */ \
84+
TRAN(D, A, B) /* - x V V V */
85+
86+
// Define a macro that terminates a ROUND_3 macro by taking pre-loaded
87+
// registers D, E and F, and translating, shuffling and storing them.
88+
#define ROUND_3_END(ROUND, A,B,C,D,E,F) /* A B C D E F */ \
89+
SHUF(E, A, B) /* + x V V V */ \
90+
STOR(D, (ROUND + 3)) /* | - | | */ \
91+
TRAN(E, A, B) /* - x | | */ \
92+
SHUF(F, C, D) /* + x | | */ \
93+
STOR(E, (ROUND + 4)) /* | - | */ \
94+
TRAN(F, C, D) /* - x | */ \
95+
STOR(F, (ROUND + 5)) /* - */
96+
97+
// Define a type A round. Inputs are a, b, and c, outputs are d, e, and f.
98+
#define ROUND_3_A(ROUND) \
99+
ROUND_3(ROUND, "a", "b", "c", "d", "e", "f")
100+
101+
// Define a type B round. Inputs and outputs are swapped with regard to type A.
102+
#define ROUND_3_B(ROUND) \
103+
ROUND_3(ROUND, "d", "e", "f", "a", "b", "c")
104+
105+
// Terminating macro for a type A round.
106+
#define ROUND_3_A_LAST(ROUND) \
107+
ROUND_3_A(ROUND) \
108+
ROUND_3_END(ROUND, "a", "b", "c", "d", "e", "f")
109+
110+
// Terminating macro for a type B round.
111+
#define ROUND_3_B_LAST(ROUND) \
112+
ROUND_3_B(ROUND) \
113+
ROUND_3_END(ROUND, "d", "e", "f", "a", "b", "c")
114+
115+
// Suppress clang's warning that the literal string in the asm statement is
116+
// overlong (longer than the ISO-mandated minimum size of 4095 bytes for C99
117+
// compilers). It may be true, but the goal here is not C99 portability.
118+
#pragma GCC diagnostic push
119+
#pragma GCC diagnostic ignored "-Woverlength-strings"
120+
121+
static inline void
122+
enc_loop_avx (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
123+
{
124+
// For a clearer explanation of the algorithm used by this function,
125+
// please refer to the plain (not inline assembly) implementation. This
126+
// function follows the same basic logic.
127+
128+
if (*slen < 16) {
129+
return;
130+
}
131+
132+
// Process blocks of 12 bytes at a time. Input is read in blocks of 16
133+
// bytes, so "reserve" four bytes from the input buffer to ensure that
134+
// we never read beyond the end of the input buffer.
135+
size_t rounds = (*slen - 4) / 12;
136+
137+
*slen -= rounds * 12; // 12 bytes consumed per round
138+
*olen += rounds * 16; // 16 bytes produced per round
139+
140+
// Number of times to go through the 36x loop.
141+
size_t loops = rounds / 36;
142+
143+
// Number of rounds remaining after the 36x loop.
144+
rounds %= 36;
145+
146+
// Lookup tables.
147+
const __m128i lut0 = _mm_set_epi8(
148+
10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1);
149+
150+
const __m128i lut1 = _mm_setr_epi8(
151+
65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0);
152+
153+
// Temporary registers.
154+
__m128i a, b, c, d, e, f;
155+
156+
__asm__ volatile (
157+
158+
// If there are 36 rounds or more, enter a 36x unrolled loop of
159+
// interleaved encoding rounds. The rounds interleave memory
160+
// operations (load/store) with data operations (table lookups,
161+
// etc) to maximize pipeline throughput.
162+
" test %[loops], %[loops] \n\t"
163+
" jz 18f \n\t"
164+
" jmp 36f \n\t"
165+
" \n\t"
166+
".balign 64 \n\t"
167+
"36: " ROUND_3_INIT()
168+
" " ROUND_3_A( 0)
169+
" " ROUND_3_B( 3)
170+
" " ROUND_3_A( 6)
171+
" " ROUND_3_B( 9)
172+
" " ROUND_3_A(12)
173+
" " ROUND_3_B(15)
174+
" " ROUND_3_A(18)
175+
" " ROUND_3_B(21)
176+
" " ROUND_3_A(24)
177+
" " ROUND_3_B(27)
178+
" " ROUND_3_A_LAST(30)
179+
" add $(12 * 36), %[src] \n\t"
180+
" add $(16 * 36), %[dst] \n\t"
181+
" dec %[loops] \n\t"
182+
" jnz 36b \n\t"
183+
184+
// Enter an 18x unrolled loop for rounds of 18 or more.
185+
"18: cmp $18, %[rounds] \n\t"
186+
" jl 9f \n\t"
187+
" " ROUND_3_INIT()
188+
" " ROUND_3_A(0)
189+
" " ROUND_3_B(3)
190+
" " ROUND_3_A(6)
191+
" " ROUND_3_B(9)
192+
" " ROUND_3_A_LAST(12)
193+
" sub $18, %[rounds] \n\t"
194+
" add $(12 * 18), %[src] \n\t"
195+
" add $(16 * 18), %[dst] \n\t"
196+
197+
// Enter a 9x unrolled loop for rounds of 9 or more.
198+
"9: cmp $9, %[rounds] \n\t"
199+
" jl 6f \n\t"
200+
" " ROUND_3_INIT()
201+
" " ROUND_3_A(0)
202+
" " ROUND_3_B_LAST(3)
203+
" sub $9, %[rounds] \n\t"
204+
" add $(12 * 9), %[src] \n\t"
205+
" add $(16 * 9), %[dst] \n\t"
206+
207+
// Enter a 6x unrolled loop for rounds of 6 or more.
208+
"6: cmp $6, %[rounds] \n\t"
209+
" jl 55f \n\t"
210+
" " ROUND_3_INIT()
211+
" " ROUND_3_A_LAST(0)
212+
" sub $6, %[rounds] \n\t"
213+
" add $(12 * 6), %[src] \n\t"
214+
" add $(16 * 6), %[dst] \n\t"
215+
216+
// Dispatch the remaining rounds 0..5.
217+
"55: cmp $3, %[rounds] \n\t"
218+
" jg 45f \n\t"
219+
" je 3f \n\t"
220+
" cmp $1, %[rounds] \n\t"
221+
" jg 2f \n\t"
222+
" je 1f \n\t"
223+
" jmp 0f \n\t"
224+
225+
"45: cmp $4, %[rounds] \n\t"
226+
" je 4f \n\t"
227+
228+
// Block of non-interlaced encoding rounds, which can each
229+
// individually be jumped to. Rounds fall through to the next.
230+
"5: " ROUND()
231+
"4: " ROUND()
232+
"3: " ROUND()
233+
"2: " ROUND()
234+
"1: " ROUND()
235+
"0: \n\t"
236+
237+
// Outputs (modified).
238+
: [rounds] "+r" (rounds),
239+
[loops] "+r" (loops),
240+
[src] "+r" (*s),
241+
[dst] "+r" (*o),
242+
[a] "=&x" (a),
243+
[b] "=&x" (b),
244+
[c] "=&x" (c),
245+
[d] "=&x" (d),
246+
[e] "=&x" (e),
247+
[f] "=&x" (f)
248+
249+
// Inputs (not modified).
250+
: [lut0] "x" (lut0),
251+
[lut1] "x" (lut1),
252+
[msk0] "x" (_mm_set1_epi32(0x0FC0FC00)),
253+
[msk1] "x" (_mm_set1_epi32(0x04000040)),
254+
[msk2] "x" (_mm_set1_epi32(0x003F03F0)),
255+
[msk3] "x" (_mm_set1_epi32(0x01000010)),
256+
[n51] "x" (_mm_set1_epi8(51)),
257+
[n25] "x" (_mm_set1_epi8(25))
258+
259+
// Clobbers.
260+
: "cc", "memory"
261+
);
262+
}
263+
264+
#pragma GCC diagnostic pop

deps/base64/base64/lib/arch/avx2/codec.c (+17, -3)
@@ -11,11 +11,25 @@
1111
#if HAVE_AVX2
1212
#include <immintrin.h>
1313

14+
// Only enable inline assembly on supported compilers and on 64-bit CPUs.
15+
#ifndef BASE64_AVX2_USE_ASM
16+
# if (defined(__GNUC__) || defined(__clang__)) && BASE64_WORDSIZE == 64
17+
# define BASE64_AVX2_USE_ASM 1
18+
# else
19+
# define BASE64_AVX2_USE_ASM 0
20+
# endif
21+
#endif
22+
1423
#include "dec_reshuffle.c"
1524
#include "dec_loop.c"
16-
#include "enc_translate.c"
17-
#include "enc_reshuffle.c"
18-
#include "enc_loop.c"
25+
26+
#if BASE64_AVX2_USE_ASM
27+
# include "enc_loop_asm.c"
28+
#else
29+
# include "enc_translate.c"
30+
# include "enc_reshuffle.c"
31+
# include "enc_loop.c"
32+
#endif
1933

2034
#endif // HAVE_AVX2
2135

deps/base64/base64/lib/arch/avx2/enc_loop_asm.c (new file, +291)
@@ -0,0 +1,291 @@
1+
// Apologies in advance for combining the preprocessor with inline assembly,
2+
// two notoriously gnarly parts of C, but it was necessary to avoid a lot of
3+
// code repetition. The preprocessor is used to template large sections of
4+
// inline assembly that differ only in the registers used. If the code was
5+
// written out by hand, it would become very large and hard to audit.
6+
7+
// Generate a block of inline assembly that loads register R0 from memory. The
8+
// offset at which the register is loaded is set by the given round and a
9+
// constant offset.
10+
#define LOAD(R0, ROUND, OFFSET) \
11+
"vlddqu ("#ROUND" * 24 + "#OFFSET")(%[src]), %["R0"] \n\t"
12+
13+
// Generate a block of inline assembly that deinterleaves and shuffles register
14+
// R0 using preloaded constants. Outputs in R0 and R1.
15+
#define SHUF(R0, R1, R2) \
16+
"vpshufb %[lut0], %["R0"], %["R1"] \n\t" \
17+
"vpand %["R1"], %[msk0], %["R2"] \n\t" \
18+
"vpand %["R1"], %[msk2], %["R1"] \n\t" \
19+
"vpmulhuw %["R2"], %[msk1], %["R2"] \n\t" \
20+
"vpmullw %["R1"], %[msk3], %["R1"] \n\t" \
21+
"vpor %["R1"], %["R2"], %["R1"] \n\t"
22+
23+
// Generate a block of inline assembly that takes R0 and R1 and translates
24+
// their contents to the base64 alphabet, using preloaded constants.
25+
#define TRAN(R0, R1, R2) \
26+
"vpsubusb %[n51], %["R1"], %["R0"] \n\t" \
27+
"vpcmpgtb %[n25], %["R1"], %["R2"] \n\t" \
28+
"vpsubb %["R2"], %["R0"], %["R0"] \n\t" \
29+
"vpshufb %["R0"], %[lut1], %["R2"] \n\t" \
30+
"vpaddb %["R1"], %["R2"], %["R0"] \n\t"
31+
32+
// Generate a block of inline assembly that stores the given register R0 at an
33+
// offset set by the given round.
34+
#define STOR(R0, ROUND) \
35+
"vmovdqu %["R0"], ("#ROUND" * 32)(%[dst]) \n\t"
36+
37+
// Generate a block of inline assembly that generates a single self-contained
38+
// encoder round: fetch the data, process it, and store the result. Then update
39+
// the source and destination pointers.
40+
#define ROUND() \
41+
LOAD("a", 0, -4) \
42+
SHUF("a", "b", "c") \
43+
TRAN("a", "b", "c") \
44+
STOR("a", 0) \
45+
"add $24, %[src] \n\t" \
46+
"add $32, %[dst] \n\t"
47+
48+
// Define a macro that initiates a three-way interleaved encoding round by
49+
// preloading registers a, b and c from memory.
50+
// The register graph shows which registers are in use during each step, and
51+
// is a visual aid for choosing registers for that step. Symbol index:
52+
//
53+
// + indicates that a register is loaded by that step.
54+
// | indicates that a register is in use and must not be touched.
55+
// - indicates that a register is decommissioned by that step.
56+
// x indicates that a register is used as a temporary by that step.
57+
// V indicates that a register is an input or output to the macro.
58+
//
59+
#define ROUND_3_INIT() /* a b c d e f */ \
60+
LOAD("a", 0, -4) /* + */ \
61+
SHUF("a", "d", "e") /* | + x */ \
62+
LOAD("b", 1, -4) /* | + | */ \
63+
TRAN("a", "d", "e") /* | | - x */ \
64+
LOAD("c", 2, -4) /* V V V */
65+
66+
// Define a macro that translates, shuffles and stores the input registers A, B
67+
// and C, and preloads registers D, E and F for the next round.
68+
// This macro can be arbitrarily daisy-chained by feeding output registers D, E
69+
// and F back into the next round as input registers A, B and C. The macro
70+
// carefully interleaves memory operations with data operations for optimal
71+
// pipelined performance.
72+
73+
#define ROUND_3(ROUND, A,B,C,D,E,F) /* A B C D E F */ \
74+
LOAD(D, (ROUND + 3), -4) /* V V V + */ \
75+
SHUF(B, E, F) /* | | | | + x */ \
76+
STOR(A, (ROUND + 0)) /* - | | | | */ \
77+
TRAN(B, E, F) /* | | | - x */ \
78+
LOAD(E, (ROUND + 4), -4) /* | | | + */ \
79+
SHUF(C, A, F) /* + | | | | x */ \
80+
STOR(B, (ROUND + 1)) /* | - | | | */ \
81+
TRAN(C, A, F) /* - | | | x */ \
82+
LOAD(F, (ROUND + 5), -4) /* | | | + */ \
83+
SHUF(D, A, B) /* + x | | | | */ \
84+
STOR(C, (ROUND + 2)) /* | - | | | */ \
85+
TRAN(D, A, B) /* - x V V V */
86+
87+
// Define a macro that terminates a ROUND_3 macro by taking pre-loaded
88+
// registers D, E and F, and translating, shuffling and storing them.
89+
#define ROUND_3_END(ROUND, A,B,C,D,E,F) /* A B C D E F */ \
90+
SHUF(E, A, B) /* + x V V V */ \
91+
STOR(D, (ROUND + 3)) /* | - | | */ \
92+
TRAN(E, A, B) /* - x | | */ \
93+
SHUF(F, C, D) /* + x | | */ \
94+
STOR(E, (ROUND + 4)) /* | - | */ \
95+
TRAN(F, C, D) /* - x | */ \
96+
STOR(F, (ROUND + 5)) /* - */
97+
98+
// Define a type A round. Inputs are a, b, and c, outputs are d, e, and f.
99+
#define ROUND_3_A(ROUND) \
100+
ROUND_3(ROUND, "a", "b", "c", "d", "e", "f")
101+
102+
// Define a type B round. Inputs and outputs are swapped with regard to type A.
103+
#define ROUND_3_B(ROUND) \
104+
ROUND_3(ROUND, "d", "e", "f", "a", "b", "c")
105+
106+
// Terminating macro for a type A round.
107+
#define ROUND_3_A_LAST(ROUND) \
108+
ROUND_3_A(ROUND) \
109+
ROUND_3_END(ROUND, "a", "b", "c", "d", "e", "f")
110+
111+
// Terminating macro for a type B round.
112+
#define ROUND_3_B_LAST(ROUND) \
113+
ROUND_3_B(ROUND) \
114+
ROUND_3_END(ROUND, "d", "e", "f", "a", "b", "c")
115+
116+
// Suppress clang's warning that the literal string in the asm statement is
117+
// overlong (longer than the ISO-mandated minimum size of 4095 bytes for C99
118+
// compilers). It may be true, but the goal here is not C99 portability.
119+
#pragma GCC diagnostic push
120+
#pragma GCC diagnostic ignored "-Woverlength-strings"
121+
122+
static inline void
123+
enc_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
124+
{
125+
// For a clearer explanation of the algorithm used by this function,
126+
// please refer to the plain (not inline assembly) implementation. This
127+
// function follows the same basic logic.
128+
129+
if (*slen < 32) {
130+
return;
131+
}
132+
133+
// Process blocks of 24 bytes at a time. Because blocks are loaded 32
134+
// bytes at a time at an offset of -4, ensure that there will be at least
135+
// 4 remaining bytes after the last round, so that the final read will
136+
// not pass beyond the bounds of the input buffer.
137+
size_t rounds = (*slen - 4) / 24;
138+
139+
*slen -= rounds * 24; // 24 bytes consumed per round
140+
*olen += rounds * 32; // 32 bytes produced per round
141+
142+
// Pre-decrement the number of rounds to get the number of rounds
143+
// *after* the first round, which is handled as a special case.
144+
rounds--;
145+
146+
// Number of times to go through the 36x loop.
147+
size_t loops = rounds / 36;
148+
149+
// Number of rounds remaining after the 36x loop.
150+
rounds %= 36;
151+
152+
// Lookup tables.
153+
const __m256i lut0 = _mm256_set_epi8(
154+
10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1,
155+
14, 15, 13, 14, 11, 12, 10, 11, 8, 9, 7, 8, 5, 6, 4, 5);
156+
157+
const __m256i lut1 = _mm256_setr_epi8(
158+
65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0,
159+
65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0);
160+
161+
// Temporary registers.
162+
__m256i a, b, c, d, e;
163+
164+
// Temporary register f doubles as the shift mask for the first round.
165+
__m256i f = _mm256_setr_epi32(0, 0, 1, 2, 3, 4, 5, 6);
166+
167+
__asm__ volatile (
168+
169+
// The first loop iteration requires special handling to ensure
170+
// that the read, which is normally done at an offset of -4,
171+
// does not underflow the buffer. Load the buffer at an offset
172+
// of 0 and permute the input to achieve the same effect.
173+
LOAD("a", 0, 0)
174+
"vpermd %[a], %[f], %[a] \n\t"
175+
176+
// Perform the standard shuffling and translation steps.
177+
SHUF("a", "b", "c")
178+
TRAN("a", "b", "c")
179+
180+
// Store the result and increment the source and dest pointers.
181+
"vmovdqu %[a], (%[dst]) \n\t"
182+
"add $24, %[src] \n\t"
183+
"add $32, %[dst] \n\t"
184+
185+
// If there are 36 rounds or more, enter a 36x unrolled loop of
186+
// interleaved encoding rounds. The rounds interleave memory
187+
// operations (load/store) with data operations (table lookups,
188+
// etc) to maximize pipeline throughput.
189+
" test %[loops], %[loops] \n\t"
190+
" jz 18f \n\t"
191+
" jmp 36f \n\t"
192+
" \n\t"
193+
".balign 64 \n\t"
194+
"36: " ROUND_3_INIT()
195+
" " ROUND_3_A( 0)
196+
" " ROUND_3_B( 3)
197+
" " ROUND_3_A( 6)
198+
" " ROUND_3_B( 9)
199+
" " ROUND_3_A(12)
200+
" " ROUND_3_B(15)
201+
" " ROUND_3_A(18)
202+
" " ROUND_3_B(21)
203+
" " ROUND_3_A(24)
204+
" " ROUND_3_B(27)
205+
" " ROUND_3_A_LAST(30)
206+
" add $(24 * 36), %[src] \n\t"
207+
" add $(32 * 36), %[dst] \n\t"
208+
" dec %[loops] \n\t"
209+
" jnz 36b \n\t"
210+
211+
// Enter an 18x unrolled loop for rounds of 18 or more.
212+
"18: cmp $18, %[rounds] \n\t"
213+
" jl 9f \n\t"
214+
" " ROUND_3_INIT()
215+
" " ROUND_3_A(0)
216+
" " ROUND_3_B(3)
217+
" " ROUND_3_A(6)
218+
" " ROUND_3_B(9)
219+
" " ROUND_3_A_LAST(12)
220+
" sub $18, %[rounds] \n\t"
221+
" add $(24 * 18), %[src] \n\t"
222+
" add $(32 * 18), %[dst] \n\t"
223+
224+
// Enter a 9x unrolled loop for rounds of 9 or more.
225+
"9: cmp $9, %[rounds] \n\t"
226+
" jl 6f \n\t"
227+
" " ROUND_3_INIT()
228+
" " ROUND_3_A(0)
229+
" " ROUND_3_B_LAST(3)
230+
" sub $9, %[rounds] \n\t"
231+
" add $(24 * 9), %[src] \n\t"
232+
" add $(32 * 9), %[dst] \n\t"
233+
234+
// Enter a 6x unrolled loop for rounds of 6 or more.
235+
"6: cmp $6, %[rounds] \n\t"
236+
" jl 55f \n\t"
237+
" " ROUND_3_INIT()
238+
" " ROUND_3_A_LAST(0)
239+
" sub $6, %[rounds] \n\t"
240+
" add $(24 * 6), %[src] \n\t"
241+
" add $(32 * 6), %[dst] \n\t"
242+
243+
// Dispatch the remaining rounds 0..5.
244+
"55: cmp $3, %[rounds] \n\t"
245+
" jg 45f \n\t"
246+
" je 3f \n\t"
247+
" cmp $1, %[rounds] \n\t"
248+
" jg 2f \n\t"
249+
" je 1f \n\t"
250+
" jmp 0f \n\t"
251+
252+
"45: cmp $4, %[rounds] \n\t"
253+
" je 4f \n\t"
254+
255+
// Block of non-interlaced encoding rounds, which can each
256+
// individually be jumped to. Rounds fall through to the next.
257+
"5: " ROUND()
258+
"4: " ROUND()
259+
"3: " ROUND()
260+
"2: " ROUND()
261+
"1: " ROUND()
262+
"0: \n\t"
263+
264+
// Outputs (modified).
265+
: [rounds] "+r" (rounds),
266+
[loops] "+r" (loops),
267+
[src] "+r" (*s),
268+
[dst] "+r" (*o),
269+
[a] "=&x" (a),
270+
[b] "=&x" (b),
271+
[c] "=&x" (c),
272+
[d] "=&x" (d),
273+
[e] "=&x" (e),
274+
[f] "+x" (f)
275+
276+
// Inputs (not modified).
277+
: [lut0] "x" (lut0),
278+
[lut1] "x" (lut1),
279+
[msk0] "x" (_mm256_set1_epi32(0x0FC0FC00)),
280+
[msk1] "x" (_mm256_set1_epi32(0x04000040)),
281+
[msk2] "x" (_mm256_set1_epi32(0x003F03F0)),
282+
[msk3] "x" (_mm256_set1_epi32(0x01000010)),
283+
[n51] "x" (_mm256_set1_epi8(51)),
284+
[n25] "x" (_mm256_set1_epi8(25))
285+
286+
// Clobbers.
287+
: "cc", "memory"
288+
);
289+
}
290+
291+
#pragma GCC diagnostic pop

deps/base64/base64/lib/arch/avx512/codec.c (new file, +42)
@@ -0,0 +1,42 @@
1+
#include <stdint.h>
2+
#include <stddef.h>
3+
#include <stdlib.h>
4+
5+
#include "../../../include/libbase64.h"
6+
#include "../../tables/tables.h"
7+
#include "../../codecs.h"
8+
#include "config.h"
9+
#include "../../env.h"
10+
11+
#if HAVE_AVX512
12+
#include <immintrin.h>
13+
14+
#include "../avx2/dec_reshuffle.c"
15+
#include "../avx2/dec_loop.c"
16+
#include "enc_reshuffle_translate.c"
17+
#include "enc_loop.c"
18+
19+
#endif // HAVE_AVX512
20+
21+
BASE64_ENC_FUNCTION(avx512)
22+
{
23+
#if HAVE_AVX512
24+
#include "../generic/enc_head.c"
25+
enc_loop_avx512(&s, &slen, &o, &olen);
26+
#include "../generic/enc_tail.c"
27+
#else
28+
BASE64_ENC_STUB
29+
#endif
30+
}
31+
32+
// Reuse the AVX2 decoder; a dedicated AVX512 decoder is not provided at present.
33+
BASE64_DEC_FUNCTION(avx512)
34+
{
35+
#if HAVE_AVX512
36+
#include "../generic/dec_head.c"
37+
dec_loop_avx2(&s, &slen, &o, &olen);
38+
#include "../generic/dec_tail.c"
39+
#else
40+
BASE64_DEC_STUB
41+
#endif
42+
}

deps/base64/base64/lib/arch/avx512/enc_loop.c (new file, +61)
@@ -0,0 +1,61 @@
1+
static inline void
2+
enc_loop_avx512_inner (const uint8_t **s, uint8_t **o)
3+
{
4+
// Load input.
5+
__m512i src = _mm512_loadu_si512((__m512i *) *s);
6+
7+
// Reshuffle, translate, store.
8+
src = enc_reshuffle_translate(src);
9+
_mm512_storeu_si512((__m512i *) *o, src);
10+
11+
*s += 48;
12+
*o += 64;
13+
}
14+
15+
static inline void
16+
enc_loop_avx512 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
17+
{
18+
if (*slen < 64) {
19+
return;
20+
}
21+
22+
// Process blocks of 48 bytes at a time. Because blocks are loaded 64
23+
// bytes at a time, ensure that there will be at least 24 remaining
24+
// bytes after the last round, so that the final read will not pass
25+
// beyond the bounds of the input buffer.
26+
size_t rounds = (*slen - 24) / 48;
27+
28+
*slen -= rounds * 48; // 48 bytes consumed per round
29+
*olen += rounds * 64; // 64 bytes produced per round
30+
31+
while (rounds > 0) {
32+
if (rounds >= 8) {
33+
enc_loop_avx512_inner(s, o);
34+
enc_loop_avx512_inner(s, o);
35+
enc_loop_avx512_inner(s, o);
36+
enc_loop_avx512_inner(s, o);
37+
enc_loop_avx512_inner(s, o);
38+
enc_loop_avx512_inner(s, o);
39+
enc_loop_avx512_inner(s, o);
40+
enc_loop_avx512_inner(s, o);
41+
rounds -= 8;
42+
continue;
43+
}
44+
if (rounds >= 4) {
45+
enc_loop_avx512_inner(s, o);
46+
enc_loop_avx512_inner(s, o);
47+
enc_loop_avx512_inner(s, o);
48+
enc_loop_avx512_inner(s, o);
49+
rounds -= 4;
50+
continue;
51+
}
52+
if (rounds >= 2) {
53+
enc_loop_avx512_inner(s, o);
54+
enc_loop_avx512_inner(s, o);
55+
rounds -= 2;
56+
continue;
57+
}
58+
enc_loop_avx512_inner(s, o);
59+
break;
60+
}
61+
}
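
A standalone illustration of the round arithmetic in `enc_loop_avx512()` above, with an assumed input length: each round consumes 48 input bytes and produces 64 output bytes (the usual 3-to-4 base64 expansion, 16 groups per round), and 24 bytes are held back so the final 64-byte load cannot run past the end of the input:

```c
#include <stdio.h>

int main(void)
{
	size_t slen = 1000;                // hypothetical input length
	size_t rounds = (slen - 24) / 48;  // same reservation as enc_loop_avx512

	printf("rounds=%zu consumed=%zu produced=%zu left=%zu\n",
	       rounds, rounds * 48, rounds * 64, slen - rounds * 48);
	// rounds=20 consumed=960 produced=1280 left=40
	return 0;
}
```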
deps/base64/base64/lib/arch/avx512/enc_reshuffle_translate.c (new file, +50)
@@ -0,0 +1,50 @@
1+
// AVX512 algorithm is based on permutevar and multishift. The code is based on
2+
// https://github.com/WojciechMula/base64simd which is under BSD-2 license.
3+
4+
static inline __m512i
5+
enc_reshuffle_translate (const __m512i input)
6+
{
7+
// 32-bit input
8+
// [ 0 0 0 0 0 0 0 0|c1 c0 d5 d4 d3 d2 d1 d0|
9+
// b3 b2 b1 b0 c5 c4 c3 c2|a5 a4 a3 a2 a1 a0 b5 b4]
10+
// output order [1, 2, 0, 1]
11+
// [b3 b2 b1 b0 c5 c4 c3 c2|c1 c0 d5 d4 d3 d2 d1 d0|
12+
// a5 a4 a3 a2 a1 a0 b5 b4|b3 b2 b1 b0 c3 c2 c1 c0]
13+
14+
const __m512i shuffle_input = _mm512_setr_epi32(0x01020001,
15+
0x04050304,
16+
0x07080607,
17+
0x0a0b090a,
18+
0x0d0e0c0d,
19+
0x10110f10,
20+
0x13141213,
21+
0x16171516,
22+
0x191a1819,
23+
0x1c1d1b1c,
24+
0x1f201e1f,
25+
0x22232122,
26+
0x25262425,
27+
0x28292728,
28+
0x2b2c2a2b,
29+
0x2e2f2d2e);
30+
31+
// Reorder bytes
32+
// [b3 b2 b1 b0 c5 c4 c3 c2|c1 c0 d5 d4 d3 d2 d1 d0|
33+
// a5 a4 a3 a2 a1 a0 b5 b4|b3 b2 b1 b0 c3 c2 c1 c0]
34+
const __m512i in = _mm512_permutexvar_epi8(shuffle_input, input);
35+
36+
// After multishift a single 32-bit lane has following layout
37+
// [c1 c0 d5 d4 d3 d2 d1 d0|b1 b0 c5 c4 c3 c2 c1 c0|
38+
// a1 a0 b5 b4 b3 b2 b1 b0|d1 d0 a5 a4 a3 a2 a1 a0]
39+
// (a = [10:17], b = [4:11], c = [22:27], d = [16:21])
40+
41+
// 48, 54, 36, 42, 16, 22, 4, 10
42+
const __m512i shifts = _mm512_set1_epi64(0x3036242a1016040alu);
43+
__m512i shuffled_in = _mm512_multishift_epi64_epi8(shifts, in);
44+
45+
// Translate immediately after reshuffling.
46+
const __m512i lookup = _mm512_loadu_si512(base64_table_enc_6bit);
47+
48+
// Translation 6-bit values to ASCII.
49+
return _mm512_permutexvar_epi8(shuffled_in, lookup);
50+
}
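
For orientation, a plain-C sketch of what the reshuffle-and-translate step computes per 3-byte group: the 24 input bits are split into four 6-bit indices and mapped through the encoding alphabet. The kernel above does this for 48 bytes at once with `_mm512_permutexvar_epi8` and `_mm512_multishift_epi64_epi8`; the scalar version below is an illustration only, not code from the library:

```c
#include <stdint.h>
#include <stdio.h>

static const char tbl[65] =
	"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

// Encode one 3-byte group into four base64 characters.
static void encode3(const uint8_t in[3], char out[4])
{
	uint32_t v = (uint32_t) in[0] << 16 | (uint32_t) in[1] << 8 | in[2];

	out[0] = tbl[(v >> 18) & 0x3F];
	out[1] = tbl[(v >> 12) & 0x3F];
	out[2] = tbl[(v >>  6) & 0x3F];
	out[3] = tbl[ v        & 0x3F];
}

int main(void)
{
	const uint8_t in[3] = { 'M', 'a', 'n' };
	char out[5] = { 0 };

	encode3(in, out);
	printf("%s\n", out);  // prints "TWFu"
	return 0;
}
```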

deps/base64/base64/lib/arch/neon32/enc_loop.c (+2, -1)
@@ -100,7 +100,8 @@ enc_loop_neon32_inner_asm (const uint8_t **s, uint8_t **o)
100100
[n63] "w" (n63)
101101

102102
// Clobbers.
103-
: "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31"
103+
: "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31",
104+
"cc", "memory"
104105
);
105106
}
106107
#endif

deps/base64/base64/lib/arch/neon64/enc_loop_asm.c (+2, -1)
@@ -160,7 +160,8 @@ enc_loop_neon64 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
160160
// Clobbers.
161161
: "v2", "v3", "v4", "v5",
162162
"v8", "v9", "v10", "v11",
163-
"v12", "v13", "v14", "v15"
163+
"v12", "v13", "v14", "v15",
164+
"cc", "memory"
164165
);
165166
}
166167
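
The "cc", "memory" clobbers added to the NEON32 and NEON64 encoders above declare that the inline assembly may alter the condition flags and memory. A minimal sketch of what a "memory" clobber means in general GCC/Clang inline assembly (not code from this library):

```c
int sketch(int *p)
{
	*p = 1;
	// With the "memory" clobber the compiler must assume the asm statement
	// may read or write memory, so the store above cannot be moved past it
	// and *p must be reloaded below rather than kept in a register.
	__asm__ volatile ("" ::: "memory");
	return *p;
}
```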

deps/base64/base64/lib/arch/sse41/codec.c (+17, -3)
@@ -11,11 +11,25 @@
1111
#if HAVE_SSE41
1212
#include <smmintrin.h>
1313

14+
// Only enable inline assembly on supported compilers and on 64-bit CPUs.
15+
#ifndef BASE64_SSE41_USE_ASM
16+
# if (defined(__GNUC__) || defined(__clang__)) && BASE64_WORDSIZE == 64
17+
# define BASE64_SSE41_USE_ASM 1
18+
# else
19+
# define BASE64_SSE41_USE_ASM 0
20+
# endif
21+
#endif
22+
1423
#include "../ssse3/dec_reshuffle.c"
1524
#include "../ssse3/dec_loop.c"
16-
#include "../ssse3/enc_translate.c"
17-
#include "../ssse3/enc_reshuffle.c"
18-
#include "../ssse3/enc_loop.c"
25+
26+
#if BASE64_SSE41_USE_ASM
27+
# include "../ssse3/enc_loop_asm.c"
28+
#else
29+
# include "../ssse3/enc_translate.c"
30+
# include "../ssse3/enc_reshuffle.c"
31+
# include "../ssse3/enc_loop.c"
32+
#endif
1933

2034
#endif // HAVE_SSE41
2135

deps/base64/base64/lib/arch/sse42/codec.c (+17, -3)
@@ -11,11 +11,25 @@
1111
#if HAVE_SSE42
1212
#include <nmmintrin.h>
1313

14+
// Only enable inline assembly on supported compilers and on 64-bit CPUs.
15+
#ifndef BASE64_SSE42_USE_ASM
16+
# if (defined(__GNUC__) || defined(__clang__)) && BASE64_WORDSIZE == 64
17+
# define BASE64_SSE42_USE_ASM 1
18+
# else
19+
# define BASE64_SSE42_USE_ASM 0
20+
# endif
21+
#endif
22+
1423
#include "../ssse3/dec_reshuffle.c"
1524
#include "../ssse3/dec_loop.c"
16-
#include "../ssse3/enc_translate.c"
17-
#include "../ssse3/enc_reshuffle.c"
18-
#include "../ssse3/enc_loop.c"
25+
26+
#if BASE64_SSE42_USE_ASM
27+
# include "../ssse3/enc_loop_asm.c"
28+
#else
29+
# include "../ssse3/enc_translate.c"
30+
# include "../ssse3/enc_reshuffle.c"
31+
# include "../ssse3/enc_loop.c"
32+
#endif
1933

2034
#endif // HAVE_SSE42
2135

deps/base64/base64/lib/arch/ssse3/codec.c (+19, -3)
@@ -11,11 +11,27 @@
1111
#if HAVE_SSSE3
1212
#include <tmmintrin.h>
1313

14+
// Only enable inline assembly on supported compilers and on 64-bit CPUs.
15+
// 32-bit CPUs with SSSE3 support, such as low-end Atoms, only have eight XMM
16+
// registers, which is not enough to run the inline assembly.
17+
#ifndef BASE64_SSSE3_USE_ASM
18+
# if (defined(__GNUC__) || defined(__clang__)) && BASE64_WORDSIZE == 64
19+
# define BASE64_SSSE3_USE_ASM 1
20+
# else
21+
# define BASE64_SSSE3_USE_ASM 0
22+
# endif
23+
#endif
24+
1425
#include "dec_reshuffle.c"
1526
#include "dec_loop.c"
16-
#include "enc_reshuffle.c"
17-
#include "enc_translate.c"
18-
#include "enc_loop.c"
27+
28+
#if BASE64_SSSE3_USE_ASM
29+
# include "enc_loop_asm.c"
30+
#else
31+
# include "enc_reshuffle.c"
32+
# include "enc_translate.c"
33+
# include "enc_loop.c"
34+
#endif
1935

2036
#endif // HAVE_SSSE3
2137

deps/base64/base64/lib/arch/ssse3/enc_loop_asm.c (new file, +268)
@@ -0,0 +1,268 @@
1+
// Apologies in advance for combining the preprocessor with inline assembly,
2+
// two notoriously gnarly parts of C, but it was necessary to avoid a lot of
3+
// code repetition. The preprocessor is used to template large sections of
4+
// inline assembly that differ only in the registers used. If the code was
5+
// written out by hand, it would become very large and hard to audit.
6+
7+
// Generate a block of inline assembly that loads register R0 from memory. The
8+
// offset at which the register is loaded is set by the given round.
9+
#define LOAD(R0, ROUND) \
10+
"lddqu ("#ROUND" * 12)(%[src]), %["R0"] \n\t"
11+
12+
// Generate a block of inline assembly that deinterleaves and shuffles register
13+
// R0 using preloaded constants. Outputs in R0 and R1.
14+
#define SHUF(R0, R1) \
15+
"pshufb %[lut0], %["R0"] \n\t" \
16+
"movdqa %["R0"], %["R1"] \n\t" \
17+
"pand %[msk0], %["R0"] \n\t" \
18+
"pand %[msk2], %["R1"] \n\t" \
19+
"pmulhuw %[msk1], %["R0"] \n\t" \
20+
"pmullw %[msk3], %["R1"] \n\t" \
21+
"por %["R1"], %["R0"] \n\t"
22+
23+
// Generate a block of inline assembly that takes R0 and R1 and translates
24+
// their contents to the base64 alphabet, using preloaded constants.
25+
#define TRAN(R0, R1, R2) \
26+
"movdqa %["R0"], %["R1"] \n\t" \
27+
"movdqa %["R0"], %["R2"] \n\t" \
28+
"psubusb %[n51], %["R1"] \n\t" \
29+
"pcmpgtb %[n25], %["R2"] \n\t" \
30+
"psubb %["R2"], %["R1"] \n\t" \
31+
"movdqa %[lut1], %["R2"] \n\t" \
32+
"pshufb %["R1"], %["R2"] \n\t" \
33+
"paddb %["R2"], %["R0"] \n\t"
34+
35+
// Generate a block of inline assembly that stores the given register R0 at an
36+
// offset set by the given round.
37+
#define STOR(R0, ROUND) \
38+
"movdqu %["R0"], ("#ROUND" * 16)(%[dst]) \n\t"
39+
40+
// Generate a block of inline assembly that generates a single self-contained
41+
// encoder round: fetch the data, process it, and store the result. Then update
42+
// the source and destination pointers.
43+
#define ROUND() \
44+
LOAD("a", 0) \
45+
SHUF("a", "b") \
46+
TRAN("a", "b", "c") \
47+
STOR("a", 0) \
48+
"add $12, %[src] \n\t" \
49+
"add $16, %[dst] \n\t"
50+
51+
// Define a macro that initiates a three-way interleaved encoding round by
52+
// preloading registers a, b and c from memory.
53+
// The register graph shows which registers are in use during each step, and
54+
// is a visual aid for choosing registers for that step. Symbol index:
55+
//
56+
// + indicates that a register is loaded by that step.
57+
// | indicates that a register is in use and must not be touched.
58+
// - indicates that a register is decommissioned by that step.
59+
// x indicates that a register is used as a temporary by that step.
60+
// V indicates that a register is an input or output to the macro.
61+
//
62+
#define ROUND_3_INIT() /* a b c d e f */ \
63+
LOAD("a", 0) /* + */ \
64+
SHUF("a", "d") /* | + */ \
65+
LOAD("b", 1) /* | + | */ \
66+
TRAN("a", "d", "e") /* | | - x */ \
67+
LOAD("c", 2) /* V V V */
68+
69+
// Define a macro that translates, shuffles and stores the input registers A, B
70+
// and C, and preloads registers D, E and F for the next round.
71+
// This macro can be arbitrarily daisy-chained by feeding output registers D, E
72+
// and F back into the next round as input registers A, B and C. The macro
73+
// carefully interleaves memory operations with data operations for optimal
74+
// pipelined performance.
75+
76+
#define ROUND_3(ROUND, A,B,C,D,E,F) /* A B C D E F */ \
77+
LOAD(D, (ROUND + 3)) /* V V V + */ \
78+
SHUF(B, E) /* | | | | + */ \
79+
STOR(A, (ROUND + 0)) /* - | | | | */ \
80+
TRAN(B, E, F) /* | | | - x */ \
81+
LOAD(E, (ROUND + 4)) /* | | | + */ \
82+
SHUF(C, A) /* + | | | | */ \
83+
STOR(B, (ROUND + 1)) /* | - | | | */ \
84+
TRAN(C, A, F) /* - | | | x */ \
85+
LOAD(F, (ROUND + 5)) /* | | | + */ \
86+
SHUF(D, A) /* + | | | | */ \
87+
STOR(C, (ROUND + 2)) /* | - | | | */ \
88+
TRAN(D, A, B) /* - x V V V */
89+
90+
// Define a macro that terminates a ROUND_3 macro by taking pre-loaded
91+
// registers D, E and F, and translating, shuffling and storing them.
92+
#define ROUND_3_END(ROUND, A,B,C,D,E,F) /* A B C D E F */ \
93+
SHUF(E, A) /* + V V V */ \
94+
STOR(D, (ROUND + 3)) /* | - | | */ \
95+
TRAN(E, A, B) /* - x | | */ \
96+
SHUF(F, C) /* + | | */ \
97+
STOR(E, (ROUND + 4)) /* | - | */ \
98+
TRAN(F, C, D) /* - x | */ \
99+
STOR(F, (ROUND + 5)) /* - */
100+
101+
// Define a type A round. Inputs are a, b, and c, outputs are d, e, and f.
102+
#define ROUND_3_A(ROUND) \
103+
ROUND_3(ROUND, "a", "b", "c", "d", "e", "f")
104+
105+
// Define a type B round. Inputs and outputs are swapped with regard to type A.
106+
#define ROUND_3_B(ROUND) \
107+
ROUND_3(ROUND, "d", "e", "f", "a", "b", "c")
108+
109+
// Terminating macro for a type A round.
110+
#define ROUND_3_A_LAST(ROUND) \
111+
ROUND_3_A(ROUND) \
112+
ROUND_3_END(ROUND, "a", "b", "c", "d", "e", "f")
113+
114+
// Terminating macro for a type B round.
115+
#define ROUND_3_B_LAST(ROUND) \
116+
ROUND_3_B(ROUND) \
117+
ROUND_3_END(ROUND, "d", "e", "f", "a", "b", "c")
118+
119+
// Suppress clang's warning that the literal string in the asm statement is
120+
// overlong (longer than the ISO-mandated minimum size of 4095 bytes for C99
121+
// compilers). It may be true, but the goal here is not C99 portability.
122+
#pragma GCC diagnostic push
123+
#pragma GCC diagnostic ignored "-Woverlength-strings"
124+
125+
static inline void
126+
enc_loop_ssse3 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
127+
{
128+
// For a clearer explanation of the algorithm used by this function,
129+
// please refer to the plain (not inline assembly) implementation. This
130+
// function follows the same basic logic.
131+
132+
if (*slen < 16) {
133+
return;
134+
}
135+
136+
// Process blocks of 12 bytes at a time. Input is read in blocks of 16
137+
// bytes, so "reserve" four bytes from the input buffer to ensure that
138+
// we never read beyond the end of the input buffer.
139+
size_t rounds = (*slen - 4) / 12;
140+
141+
*slen -= rounds * 12; // 12 bytes consumed per round
142+
*olen += rounds * 16; // 16 bytes produced per round
143+
144+
// Number of times to go through the 36x loop.
145+
size_t loops = rounds / 36;
146+
147+
// Number of rounds remaining after the 36x loop.
148+
rounds %= 36;
149+
150+
// Lookup tables.
151+
const __m128i lut0 = _mm_set_epi8(
152+
10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1);
153+
154+
const __m128i lut1 = _mm_setr_epi8(
155+
65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0);
156+
157+
// Temporary registers.
158+
__m128i a, b, c, d, e, f;
159+
160+
__asm__ volatile (
161+
162+
// If there are 36 rounds or more, enter a 36x unrolled loop of
163+
// interleaved encoding rounds. The rounds interleave memory
164+
// operations (load/store) with data operations (table lookups,
165+
// etc) to maximize pipeline throughput.
166+
" test %[loops], %[loops] \n\t"
167+
" jz 18f \n\t"
168+
" jmp 36f \n\t"
169+
" \n\t"
170+
".balign 64 \n\t"
171+
"36: " ROUND_3_INIT()
172+
" " ROUND_3_A( 0)
173+
" " ROUND_3_B( 3)
174+
" " ROUND_3_A( 6)
175+
" " ROUND_3_B( 9)
176+
" " ROUND_3_A(12)
177+
" " ROUND_3_B(15)
178+
" " ROUND_3_A(18)
179+
" " ROUND_3_B(21)
180+
" " ROUND_3_A(24)
181+
" " ROUND_3_B(27)
182+
" " ROUND_3_A_LAST(30)
183+
" add $(12 * 36), %[src] \n\t"
184+
" add $(16 * 36), %[dst] \n\t"
185+
" dec %[loops] \n\t"
186+
" jnz 36b \n\t"
187+
188+
// Enter an 18x unrolled loop for rounds of 18 or more.
189+
"18: cmp $18, %[rounds] \n\t"
190+
" jl 9f \n\t"
191+
" " ROUND_3_INIT()
192+
" " ROUND_3_A(0)
193+
" " ROUND_3_B(3)
194+
" " ROUND_3_A(6)
195+
" " ROUND_3_B(9)
196+
" " ROUND_3_A_LAST(12)
197+
" sub $18, %[rounds] \n\t"
198+
" add $(12 * 18), %[src] \n\t"
199+
" add $(16 * 18), %[dst] \n\t"
200+
201+
// Enter a 9x unrolled loop for rounds of 9 or more.
202+
"9: cmp $9, %[rounds] \n\t"
203+
" jl 6f \n\t"
204+
" " ROUND_3_INIT()
205+
" " ROUND_3_A(0)
206+
" " ROUND_3_B_LAST(3)
207+
" sub $9, %[rounds] \n\t"
208+
" add $(12 * 9), %[src] \n\t"
209+
" add $(16 * 9), %[dst] \n\t"
210+
211+
// Enter a 6x unrolled loop for rounds of 6 or more.
212+
"6: cmp $6, %[rounds] \n\t"
213+
" jl 55f \n\t"
214+
" " ROUND_3_INIT()
215+
" " ROUND_3_A_LAST(0)
216+
" sub $6, %[rounds] \n\t"
217+
" add $(12 * 6), %[src] \n\t"
218+
" add $(16 * 6), %[dst] \n\t"
219+
220+
// Dispatch the remaining rounds 0..5.
221+
"55: cmp $3, %[rounds] \n\t"
222+
" jg 45f \n\t"
223+
" je 3f \n\t"
224+
" cmp $1, %[rounds] \n\t"
225+
" jg 2f \n\t"
226+
" je 1f \n\t"
227+
" jmp 0f \n\t"
228+
229+
"45: cmp $4, %[rounds] \n\t"
230+
" je 4f \n\t"
231+
232+
// Block of non-interlaced encoding rounds, which can each
233+
// individually be jumped to. Rounds fall through to the next.
234+
"5: " ROUND()
235+
"4: " ROUND()
236+
"3: " ROUND()
237+
"2: " ROUND()
238+
"1: " ROUND()
239+
"0: \n\t"
240+
241+
// Outputs (modified).
242+
: [rounds] "+r" (rounds),
243+
[loops] "+r" (loops),
244+
[src] "+r" (*s),
245+
[dst] "+r" (*o),
246+
[a] "=&x" (a),
247+
[b] "=&x" (b),
248+
[c] "=&x" (c),
249+
[d] "=&x" (d),
250+
[e] "=&x" (e),
251+
[f] "=&x" (f)
252+
253+
// Inputs (not modified).
254+
: [lut0] "x" (lut0),
255+
[lut1] "x" (lut1),
256+
[msk0] "x" (_mm_set1_epi32(0x0FC0FC00)),
257+
[msk1] "x" (_mm_set1_epi32(0x04000040)),
258+
[msk2] "x" (_mm_set1_epi32(0x003F03F0)),
259+
[msk3] "x" (_mm_set1_epi32(0x01000010)),
260+
[n51] "x" (_mm_set1_epi8(51)),
261+
[n25] "x" (_mm_set1_epi8(25))
262+
263+
// Clobbers.
264+
: "cc", "memory"
265+
);
266+
}
267+
268+
#pragma GCC diagnostic pop

deps/base64/base64/lib/codec_choose.c (+30, -6)
@@ -2,6 +2,7 @@
22
#include <stdint.h>
33
#include <stddef.h>
44
#include <stdint.h>
5+
#include <stdio.h>
56

67
#include "../include/libbase64.h"
78
#include "codecs.h"
@@ -10,7 +11,7 @@
1011

1112
#if (__x86_64__ || __i386__ || _M_X86 || _M_X64)
1213
#define BASE64_X86
13-
#if (HAVE_SSSE3 || HAVE_SSE41 || HAVE_SSE42 || HAVE_AVX || HAVE_AVX2)
14+
#if (HAVE_SSSE3 || HAVE_SSE41 || HAVE_SSE42 || HAVE_AVX || HAVE_AVX2 || HAVE_AVX512)
1415
#define BASE64_X86_SIMD
1516
#endif
1617
#endif
@@ -31,7 +32,7 @@
3132
__cpuid_count(__level, 0, __eax, __ebx, __ecx, __edx)
3233
#else
3334
#include <cpuid.h>
34-
#if HAVE_AVX2 || HAVE_AVX
35+
#if HAVE_AVX512 || HAVE_AVX2 || HAVE_AVX
3536
#if ((__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 2) || (__clang_major__ >= 3))
3637
static inline uint64_t _xgetbv (uint32_t index)
3738
{
@@ -45,6 +46,12 @@
4546
#endif
4647
#endif
4748

49+
#ifndef bit_AVX512vl
50+
#define bit_AVX512vl (1 << 31)
51+
#endif
52+
#ifndef bit_AVX512vbmi
53+
#define bit_AVX512vbmi (1 << 1)
54+
#endif
4855
#ifndef bit_AVX2
4956
#define bit_AVX2 (1 << 5)
5057
#endif
@@ -75,6 +82,7 @@
7582
BASE64_ENC_FUNCTION(arch); \
7683
BASE64_DEC_FUNCTION(arch); \
7784

85+
BASE64_CODEC_FUNCS(avx512)
7886
BASE64_CODEC_FUNCS(avx2)
7987
BASE64_CODEC_FUNCS(neon32)
8088
BASE64_CODEC_FUNCS(neon64)
@@ -91,9 +99,10 @@ codec_choose_forced (struct codec *codec, int flags)
9199
// always allow it, even if the codec is a no-op.
92100
// For testing purposes.
93101

94-
if (!(flags & 0xFF)) {
102+
if (!(flags & 0xFFFF)) {
95103
return false;
96104
}
105+
97106
if (flags & BASE64_FORCE_AVX2) {
98107
codec->enc = base64_stream_encode_avx2;
99108
codec->dec = base64_stream_decode_avx2;
@@ -134,6 +143,11 @@ codec_choose_forced (struct codec *codec, int flags)
134143
codec->dec = base64_stream_decode_avx;
135144
return true;
136145
}
146+
if (flags & BASE64_FORCE_AVX512) {
147+
codec->enc = base64_stream_encode_avx512;
148+
codec->dec = base64_stream_decode_avx512;
149+
return true;
150+
}
137151
return false;
138152
}
139153

@@ -178,8 +192,8 @@ codec_choose_x86 (struct codec *codec)
178192
max_level = __get_cpuid_max(0, NULL);
179193
#endif
180194

181-
#if HAVE_AVX2 || HAVE_AVX
182-
// Check for AVX/AVX2 support:
195+
#if HAVE_AVX512 || HAVE_AVX2 || HAVE_AVX
196+
// Check for AVX/AVX2/AVX512 support:
183197
// Checking for AVX requires 3 things:
184198
// 1) CPUID indicates that the OS uses XSAVE and XRSTORE instructions
185199
// (allowing saving YMM registers on context switch)
@@ -194,7 +208,17 @@ codec_choose_x86 (struct codec *codec)
194208
if (ecx & bit_XSAVE_XRSTORE) {
195209
uint64_t xcr_mask;
196210
xcr_mask = _xgetbv(_XCR_XFEATURE_ENABLED_MASK);
197-
if (xcr_mask & _XCR_XMM_AND_YMM_STATE_ENABLED_BY_OS) {
211+
if ((xcr_mask & _XCR_XMM_AND_YMM_STATE_ENABLED_BY_OS) == _XCR_XMM_AND_YMM_STATE_ENABLED_BY_OS) { // check multiple bits at once
212+
#if HAVE_AVX512
213+
if (max_level >= 7) {
214+
__cpuid_count(7, 0, eax, ebx, ecx, edx);
215+
if ((ebx & bit_AVX512vl) && (ecx & bit_AVX512vbmi)) {
216+
codec->enc = base64_stream_encode_avx512;
217+
codec->dec = base64_stream_decode_avx512;
218+
return true;
219+
}
220+
}
221+
#endif
198222
#if HAVE_AVX2
199223
if (max_level >= 7) {
200224
__cpuid_count(7, 0, eax, ebx, ecx, edx);
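
The AVX512 branch added above queries CPUID leaf 7, subleaf 0: AVX512VL is bit 31 of EBX and AVX512VBMI is bit 1 of ECX. A minimal GCC/Clang-only sketch of that check in isolation; unlike `codec_choose_x86()` it skips the XSAVE/XGETBV state checks, so it is not a complete "safe to run AVX512" test:

```c
#include <stdbool.h>
#include <stdio.h>
#include <cpuid.h>

static bool has_avx512_vl_vbmi(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (__get_cpuid_max(0, NULL) < 7) {
		return false;
	}
	__cpuid_count(7, 0, eax, ebx, ecx, edx);
	// Same bits as codec_choose.c: EBX bit 31 (VL) and ECX bit 1 (VBMI).
	return (ebx & (1u << 31)) && (ecx & (1u << 1));
}

int main(void)
{
	printf("AVX512VL+VBMI: %s\n", has_avx512_vl_vbmi() ? "yes" : "no");
	return 0;
}
```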

deps/base64/base64/lib/lib.c (+1, -1)
@@ -68,7 +68,7 @@ void
6868
base64_stream_decode_init (struct base64_state *state, int flags)
6969
{
7070
// If any of the codec flags are set, redo choice:
71-
if (codec.dec == NULL || flags & 0xFF) {
71+
if (codec.dec == NULL || flags & 0xFFFF) {
7272
codec_choose(&codec, flags);
7373
}
7474
state->eof = 0;
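
The mask widening from `0xFF` to `0xFFFF` here, and in `codec_choose_forced()`, is needed because the new `BASE64_FORCE_AVX512` flag is bit 8 and an 8-bit mask would drop it. A small illustration, with the flag value taken from include/libbase64.h in this diff:

```c
#include <assert.h>

#define BASE64_FORCE_AVX512 (1 << 8)   /* value from include/libbase64.h */

int main(void)
{
	assert((BASE64_FORCE_AVX512 & 0xFF)   == 0);      // old mask lost the flag
	assert((BASE64_FORCE_AVX512 & 0xFFFF) == 0x100);  // widened mask keeps it
	return 0;
}
```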

deps/base64/base64/test/Makefile (+4, -1)
@@ -11,12 +11,15 @@ else
1111
BENCH_LDFLAGS=-lrt
1212
endif
1313

14-
.PHONY: clean test
14+
.PHONY: clean test valgrind
1515

1616
test: clean test_base64 benchmark
1717
./test_base64
1818
./benchmark
1919

20+
valgrind: clean test_base64
21+
valgrind --error-exitcode=2 ./test_base64
22+
2023
test_base64: test_base64.c codec_supported.o ../lib/libbase64.o
2124
$(CC) $(CFLAGS) -o $@ $^
2225

New CI shell script (+37; path not shown in this rendering)
@@ -0,0 +1,37 @@
1+
#!/bin/bash
2+
set -ve
3+
4+
MACHINE=$(uname -m)
5+
export CC=gcc
6+
7+
uname -a
8+
clang --version # make analyse
9+
${CC} --version # make -C test valgrind
10+
11+
for USE_ASSEMBLY in 0 1; do
12+
if [ "${MACHINE}" == "x86_64" ]; then
13+
export SSSE3_CFLAGS="-mssse3 -DBASE64_SSSE3_USE_ASM=${USE_ASSEMBLY}"
14+
export SSE41_CFLAGS="-msse4.1 -DBASE64_SSE41_USE_ASM=${USE_ASSEMBLY}"
15+
export SSE42_CFLAGS="-msse4.2 -DBASE64_SSE42_USE_ASM=${USE_ASSEMBLY}"
16+
export AVX_CFLAGS="-mavx -DBASE64_AVX_USE_ASM=${USE_ASSEMBLY}"
17+
export AVX2_CFLAGS="-mavx2 -DBASE64_AVX2_USE_ASM=${USE_ASSEMBLY}"
18+
# Temporarily disable AVX512; it is not available in CI yet.
19+
# export AVX512_CFLAGS="-mavx512vl -mavx512vbmi"
20+
elif [ "${MACHINE}" == "aarch64" ]; then
21+
export NEON64_CFLAGS="-march=armv8-a"
22+
elif [ "${MACHINE}" == "armv7l" ]; then
23+
export NEON32_CFLAGS="-march=armv7-a -mfloat-abi=hard -mfpu=neon"
24+
fi
25+
26+
if [ ${USE_ASSEMBLY} -eq 0 ]; then
27+
echo "::group::analyze"
28+
make analyze
29+
echo "::endgroup::"
30+
fi
31+
32+
echo "::group::valgrind (USE_ASSEMBLY=${USE_ASSEMBLY})"
33+
make clean
34+
make
35+
make -C test valgrind
36+
echo "::endgroup::"
37+
done

deps/base64/base64/test/ci/test.sh (+3, -1)
@@ -7,9 +7,11 @@ if [ "${MACHINE}" == "x86_64" ]; then
77
export SSE41_CFLAGS=-msse4.1
88
export SSE42_CFLAGS=-msse4.2
99
export AVX_CFLAGS=-mavx
10-
# no AVX2 on GHA macOS
10+
# no AVX2 or AVX512 on GHA macOS
1111
if [ "$(uname -s)" != "Darwin" ]; then
1212
export AVX2_CFLAGS=-mavx2
13+
# Temporarily disable AVX512; it is not available in CI yet.
14+
# export AVX512_CFLAGS="-mavx512vl -mavx512vbmi"
1315
fi
1416
elif [ "${MACHINE}" == "aarch64" ]; then
1517
export NEON64_CFLAGS="-march=armv8-a"

deps/base64/base64/test/codec_supported.c (+1)
@@ -11,6 +11,7 @@ static char *_codecs[] =
1111
, "SSE41"
1212
, "SSE42"
1313
, "AVX"
14+
, "AVX512"
1415
, NULL
1516
} ;
1617

deps/base64/base64/test/test_base64.c (+26, -3)
@@ -1,6 +1,7 @@
11
#include <stdbool.h>
22
#include <string.h>
33
#include <stdio.h>
4+
#include <stdlib.h>
45
#include "../include/libbase64.h"
56
#include "codec_supported.h"
67
#include "moby_dick.h"
@@ -92,7 +93,7 @@ assert_roundtrip (int flags, const char *src)
9293
}
9394

9495
static int
95-
test_char_table (int flags)
96+
test_char_table (int flags, bool use_malloc)
9697
{
9798
bool fail = false;
9899
char chr[256];
@@ -107,8 +108,24 @@ test_char_table (int flags)
107108
for (int i = 0; i < 256; i++) {
108109

109110
size_t chrlen = 256 - i;
111+
char* src = &chr[i];
112+
if (use_malloc) {
113+
src = malloc(chrlen); /* malloc/copy this so valgrind can find out-of-bound access */
114+
if (src == NULL) {
115+
printf(
116+
"FAIL: encoding @ %d: allocation of %lu bytes failed\n",
117+
i, (unsigned long)chrlen
118+
);
119+
fail = true;
120+
continue;
121+
}
122+
memcpy(src, &chr[i], chrlen);
123+
}
110124

111-
base64_encode(&chr[i], chrlen, enc, &enclen, BASE64_FORCE_PLAIN);
125+
base64_encode(src, chrlen, enc, &enclen, flags);
126+
if (use_malloc) {
127+
free(src);
128+
}
112129

113130
if (!base64_decode(enc, enclen, dec, &declen, flags)) {
114131
printf("FAIL: decoding @ %d: decoding error\n", i);
@@ -198,6 +215,11 @@ test_streaming (int flags)
198215
while (base64_stream_decode(&state, &ref[inpos], (inpos + bs > reflen) ? reflen - inpos : bs, &enc[enclen], &partlen)) {
199216
enclen += partlen;
200217
inpos += bs;
218+
219+
// Has the entire buffer been consumed?
220+
if (inpos >= 400) {
221+
break;
222+
}
201223
}
202224
if (enclen != 256) {
203225
printf("FAIL: stream decoding gave incorrect size: "
@@ -336,7 +358,8 @@ test_one_codec (const char *codec, int flags)
336358
fail |= assert_roundtrip(flags, vec[i].out);
337359
}
338360

339-
fail |= test_char_table(flags);
361+
fail |= test_char_table(flags, false); /* test with unaligned input buffer */
362+
fail |= test_char_table(flags, true); /* test for out-of-bound input read */
340363
fail |= test_streaming(flags);
341364
fail |= test_invalid_dec_input(flags);
342365
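
The new `use_malloc` mode of `test_char_table()` copies each input into a heap buffer of exactly `chrlen` bytes, so a SIMD codec that reads even one byte past the end shows up as an invalid read under valgrind; a read past a slice of the 256-byte stack array `chr` would stay inside valid memory and go unnoticed. A minimal sketch of the same idea outside the test suite (helper name assumed):

```c
#include <stdlib.h>
#include <string.h>

// Return a heap copy with no slack: byte `len` is already out of bounds,
// so tools like valgrind flag any over-read by the code under test.
static char *exact_copy(const char *src, size_t len)
{
	char *buf = malloc(len);

	if (buf != NULL) {
		memcpy(buf, src, len);
	}
	return buf;   // caller frees after use
}
```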

src/base64_version.h (+1, -1)
@@ -2,5 +2,5 @@
22
// Refer to tools/dep_updaters/update-base64.sh
33
#ifndef SRC_BASE64_VERSION_H_
44
#define SRC_BASE64_VERSION_H_
5-
#define BASE64_VERSION "0.5.0"
5+
#define BASE64_VERSION "0.5.1"
66
#endif // SRC_BASE64_VERSION_H_
