// Apologies in advance for combining the preprocessor with inline assembly,
// two notoriously gnarly parts of C, but it was necessary to avoid a lot of
// code repetition. The preprocessor is used to template large sections of
// inline assembly that differ only in the registers used. If the code was
// written out by hand, it would become very large and hard to audit.

// Generate a block of inline assembly that loads register R0 from memory. The
// offset at which the register is loaded is set by the given round and a
// constant offset.
#define LOAD(R0, ROUND, OFFSET) \
        "vlddqu ("#ROUND" * 24 + "#OFFSET")(%[src]), %["R0"] \n\t"
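
// As a concrete illustration of the templating, LOAD("a", 2, -4) expands,
// after preprocessor string pasting, to the single instruction
//
//      vlddqu (2 * 24 + -4)(%[src]), %[a]
//
// i.e. an unaligned 32-byte load from 44 bytes past the current source
// pointer.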

// Generate a block of inline assembly that deinterleaves and shuffles register
// R0 using preloaded constants. The deinterleaved result is left in R1; R0 is
// not modified. R2 is used as a scratch register.
#define SHUF(R0, R1, R2) \
        "vpshufb  %[lut0], %["R0"], %["R1"] \n\t" \
        "vpand    %["R1"], %[msk0], %["R2"] \n\t" \
        "vpand    %["R1"], %[msk2], %["R1"] \n\t" \
        "vpmulhuw %["R2"], %[msk1], %["R2"] \n\t" \
        "vpmullw  %["R1"], %[msk3], %["R1"] \n\t" \
        "vpor     %["R1"], %["R2"], %["R1"] \n\t"
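
// In outline, and following the plain (non-assembly) AVX2 implementation that
// the comment in enc_loop_avx2 below refers to: lut0 rearranges each 3-byte
// input group into a 32-bit word laid out as [b1, b0, b2, b1] (low byte
// first; the group sits 4 bytes into the register because of the -4 load
// offset). The 0x0FC0FC00 mask and the vpmulhuw by 0x04000040 isolate two of
// the four 6-bit fields and shift them right (by 10 and by 6); the 0x003F03F0
// mask and the vpmullw by 0x01000010 isolate the other two and shift them
// left (by 8 and by 4); the final vpor merges the results so that each output
// byte holds one 6-bit index in the range 0..63.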

// Generate a block of inline assembly that takes the indices in R1, translates
// them to the base64 alphabet, and leaves the result in R0, using preloaded
// constants. R2 is used as a scratch register.
#define TRAN(R0, R1, R2) \
        "vpsubusb %[n51], %["R1"], %["R0"] \n\t" \
        "vpcmpgtb %[n25], %["R1"], %["R2"] \n\t" \
        "vpsubb   %["R2"], %["R0"], %["R0"] \n\t" \
        "vpshufb  %["R0"], %[lut1], %["R2"] \n\t" \
        "vpaddb   %["R1"], %["R2"], %["R0"] \n\t"
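
// Worked through for one 6-bit index i in R1: the saturating subtract yields
// max(i - 51, 0), the signed compare yields -1 where i > 25, and the vpsubb
// combines the two into a class number: 0 for 0..25, 1 for 26..51, 2..11 for
// 52..61, 12 for 62 and 13 for 63. That class selects a signed offset from
// lut1 (+65, +71, ten entries of -4, -19, -16), which vpaddb adds to i:
// 0 -> 'A', 26 -> 'a', 52 -> '0', 62 -> '+', 63 -> '/'.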

// Generate a block of inline assembly that stores the given register R0 at an
// offset set by the given round.
#define STOR(R0, ROUND) \
        "vmovdqu %["R0"], ("#ROUND" * 32)(%[dst]) \n\t"

// Generate a block of inline assembly that generates a single self-contained
// encoder round: fetch the data, process it, and store the result. Then update
// the source and destination pointers.
#define ROUND() \
        LOAD("a", 0, -4) \
        SHUF("a", "b", "c") \
        TRAN("a", "b", "c") \
        STOR("a", 0) \
        "add $24, %[src] \n\t" \
        "add $32, %[dst] \n\t"

// Define a macro that initiates a three-way interleaved encoding round by
// preloading registers a, b and c from memory.
// The register graph shows which registers are in use during each step, and
// is a visual aid for choosing registers for that step. Symbol index:
//
//  +  indicates that a register is loaded by that step.
//  |  indicates that a register is in use and must not be touched.
//  -  indicates that a register is decommissioned by that step.
//  x  indicates that a register is used as a temporary by that step.
//  V  indicates that a register is an input or output to the macro.
//
#define ROUND_3_INIT()                  /*  a  b  c  d  e  f  */ \
        LOAD("a", 0, -4)                /*  +                 */ \
        SHUF("a", "d", "e")             /*  |        +  x     */ \
        LOAD("b", 1, -4)                /*  |  +     |        */ \
        TRAN("a", "d", "e")             /*  |  |     -  x     */ \
        LOAD("c", 2, -4)                /*  V  V  V           */
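
// At the end of ROUND_3_INIT, register a holds the encoded output of the
// first block (ready to be stored), while b and c still hold the raw input
// of the next two blocks; these are exactly the inputs A, B and C that the
// ROUND_3 macro below expects.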

// Define a macro that translates, shuffles and stores the input registers A, B
// and C, and preloads registers D, E and F for the next round.
// This macro can be arbitrarily daisy-chained by feeding output registers D, E
// and F back into the next round as input registers A, B and C. The macro
// carefully interleaves memory operations with data operations for optimal
// pipelined performance.

#define ROUND_3(ROUND, A,B,C,D,E,F)     /*  A  B  C  D  E  F  */ \
        LOAD(D, (ROUND + 3), -4)        /*  V  V  V  +        */ \
        SHUF(B, E, F)                   /*  |  |  |  |  +  x  */ \
        STOR(A, (ROUND + 0))            /*  -  |  |  |  |     */ \
        TRAN(B, E, F)                   /*     |  |  |  -  x  */ \
        LOAD(E, (ROUND + 4), -4)        /*     |  |  |  +     */ \
        SHUF(C, A, F)                   /*  +  |  |  |  |  x  */ \
        STOR(B, (ROUND + 1))            /*  |  -  |  |  |     */ \
        TRAN(C, A, F)                   /*  -     |  |  |  x  */ \
        LOAD(F, (ROUND + 5), -4)        /*        |  |  |  +  */ \
        SHUF(D, A, B)                   /*  +  x  |  |  |  |  */ \
        STOR(C, (ROUND + 2))            /*  |     -  |  |  |  */ \
        TRAN(D, A, B)                   /*  -  x     V  V  V  */

// Define a macro that terminates a ROUND_3 macro by taking pre-loaded
// registers D, E and F, and translating, shuffling and storing them.
#define ROUND_3_END(ROUND, A,B,C,D,E,F) /*  A  B  C  D  E  F  */ \
        SHUF(E, A, B)                   /*  +  x     V  V  V  */ \
        STOR(D, (ROUND + 3))            /*  |        -  |  |  */ \
        TRAN(E, A, B)                   /*  -  x        |  |  */ \
        SHUF(F, C, D)                   /*        +  x  |  |  */ \
        STOR(E, (ROUND + 4))            /*        |     -  |  */ \
        TRAN(F, C, D)                   /*        -  x     |  */ \
        STOR(F, (ROUND + 5))            /*                 -  */

// Define a type A round. Inputs are a, b, and c, outputs are d, e, and f.
#define ROUND_3_A(ROUND) \
        ROUND_3(ROUND, "a", "b", "c", "d", "e", "f")

// Define a type B round. Inputs and outputs are swapped with regard to type A.
#define ROUND_3_B(ROUND) \
        ROUND_3(ROUND, "d", "e", "f", "a", "b", "c")

// Terminating macro for a type A round.
#define ROUND_3_A_LAST(ROUND) \
        ROUND_3_A(ROUND) \
        ROUND_3_END(ROUND, "a", "b", "c", "d", "e", "f")

// Terminating macro for a type B round.
#define ROUND_3_B_LAST(ROUND) \
        ROUND_3_B(ROUND) \
        ROUND_3_END(ROUND, "d", "e", "f", "a", "b", "c")

// Suppress clang's warning that the literal string in the asm statement is
// overlong (longer than the ISO-mandated minimum size of 4095 bytes for C99
// compilers). It may be true, but the goal here is not C99 portability.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Woverlength-strings"

static inline void
enc_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
        // For a clearer explanation of the algorithm used by this function,
        // please refer to the plain (not inline assembly) implementation. This
        // function follows the same basic logic.

        if (*slen < 32) {
                return;
        }

        // Process blocks of 24 bytes at a time. Because blocks are loaded 32
        // bytes at a time at an offset of -4, ensure that there will be at
        // least 4 remaining bytes after the last round, so that the final
        // read will not pass beyond the bounds of the input buffer.
        size_t rounds = (*slen - 4) / 24;

        *slen -= rounds * 24;   // 24 bytes consumed per round
        *olen += rounds * 32;   // 32 bytes produced per round

        // Pre-decrement the number of rounds to get the number of rounds
        // *after* the first round, which is handled as a special case.
        rounds--;

        // Number of times to go through the 36x loop.
        size_t loops = rounds / 36;

        // Number of rounds remaining after the 36x loop.
        rounds %= 36;
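
        // As a worked example: with *slen == 1000, rounds is (1000 - 4) / 24
        // = 41, so 984 input bytes are consumed and 1312 output bytes are
        // produced, leaving 16 input bytes for the caller to handle. After
        // the pre-decrement, the remaining 40 rounds split into one pass
        // through the 36x loop plus 4 leftover rounds.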

        // Lookup tables.
        const __m256i lut0 = _mm256_set_epi8(
                10, 11,  9, 10,  7,  8,  6,  7,  4,  5,  3,  4,  1,  2,  0,  1,
                14, 15, 13, 14, 11, 12, 10, 11,  8,  9,  7,  8,  5,  6,  4,  5);

        const __m256i lut1 = _mm256_setr_epi8(
                65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0,
                65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0);

        // Temporary registers.
        __m256i a, b, c, d, e;

        // Temporary register f doubles as the shift mask for the first round.
        __m256i f = _mm256_setr_epi32(0, 0, 1, 2, 3, 4, 5, 6);
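
        // The permutation (0, 0, 1, 2, 3, 4, 5, 6) duplicates dword 0 and
        // moves dwords 0..6 into positions 1..7, which has the same effect as
        // loading 4 bytes earlier. The duplicated low dword is harmless,
        // because the lut0 shuffle never selects bytes 0..3 of the low
        // 128-bit lane.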

        __asm__ volatile (

                // The first loop iteration requires special handling to ensure
                // that the read, which is normally done at an offset of -4,
                // does not underflow the buffer. Load the buffer at an offset
                // of 0 and permute the input to achieve the same effect.
                LOAD("a", 0, 0)
                "vpermd %[a], %[f], %[a] \n\t"

                // Perform the standard shuffling and translation steps.
                SHUF("a", "b", "c")
                TRAN("a", "b", "c")

                // Store the result and increment the source and dest pointers.
                "vmovdqu %[a], (%[dst]) \n\t"
                "add $24, %[src] \n\t"
                "add $32, %[dst] \n\t"

                // If there are 36 rounds or more, enter a 36x unrolled loop of
                // interleaved encoding rounds. The rounds interleave memory
                // operations (load/store) with data operations (table lookups,
                // etc) to maximize pipeline throughput.
                "    test %[loops], %[loops] \n\t"
                "    jz   18f \n\t"
                "    jmp  36f \n\t"
                "    \n\t"
                ".balign 64 \n\t"
                "36: " ROUND_3_INIT()
                "    " ROUND_3_A( 0)
                "    " ROUND_3_B( 3)
                "    " ROUND_3_A( 6)
                "    " ROUND_3_B( 9)
                "    " ROUND_3_A(12)
                "    " ROUND_3_B(15)
                "    " ROUND_3_A(18)
                "    " ROUND_3_B(21)
                "    " ROUND_3_A(24)
                "    " ROUND_3_B(27)
                "    " ROUND_3_A_LAST(30)
                "    add $(24 * 36), %[src] \n\t"
                "    add $(32 * 36), %[dst] \n\t"
                "    dec %[loops] \n\t"
                "    jnz 36b \n\t"

                // Enter an 18x unrolled loop for rounds of 18 or more.
                "18: cmp $18, %[rounds] \n\t"
                "    jl  9f \n\t"
                "    " ROUND_3_INIT()
                "    " ROUND_3_A(0)
                "    " ROUND_3_B(3)
                "    " ROUND_3_A(6)
                "    " ROUND_3_B(9)
                "    " ROUND_3_A_LAST(12)
                "    sub $18, %[rounds] \n\t"
                "    add $(24 * 18), %[src] \n\t"
                "    add $(32 * 18), %[dst] \n\t"

                // Enter a 9x unrolled loop for rounds of 9 or more.
                "9:  cmp $9, %[rounds] \n\t"
                "    jl  6f \n\t"
                "    " ROUND_3_INIT()
                "    " ROUND_3_A(0)
                "    " ROUND_3_B_LAST(3)
                "    sub $9, %[rounds] \n\t"
                "    add $(24 * 9), %[src] \n\t"
                "    add $(32 * 9), %[dst] \n\t"

                // Enter a 6x unrolled loop for rounds of 6 or more.
                "6:  cmp $6, %[rounds] \n\t"
                "    jl  55f \n\t"
                "    " ROUND_3_INIT()
                "    " ROUND_3_A_LAST(0)
                "    sub $6, %[rounds] \n\t"
                "    add $(24 * 6), %[src] \n\t"
                "    add $(32 * 6), %[dst] \n\t"

                // Dispatch the remaining rounds 0..5.
                "55: cmp $3, %[rounds] \n\t"
                "    jg  45f \n\t"
                "    je  3f \n\t"
                "    cmp $1, %[rounds] \n\t"
                "    jg  2f \n\t"
                "    je  1f \n\t"
                "    jmp 0f \n\t"

                "45: cmp $4, %[rounds] \n\t"
                "    je  4f \n\t"

                // Block of non-interleaved encoding rounds, which can each
                // individually be jumped to. Rounds fall through to the next.
                "5: " ROUND()
                "4: " ROUND()
                "3: " ROUND()
                "2: " ROUND()
                "1: " ROUND()
                "0: \n\t"

                // Outputs (modified).
                : [rounds] "+r"  (rounds),
                  [loops]  "+r"  (loops),
                  [src]    "+r"  (*s),
                  [dst]    "+r"  (*o),
                  [a]      "=&x" (a),
                  [b]      "=&x" (b),
                  [c]      "=&x" (c),
                  [d]      "=&x" (d),
                  [e]      "=&x" (e),
                  [f]      "+x"  (f)

                // Inputs (not modified).
                : [lut0] "x" (lut0),
                  [lut1] "x" (lut1),
                  [msk0] "x" (_mm256_set1_epi32(0x0FC0FC00)),
                  [msk1] "x" (_mm256_set1_epi32(0x04000040)),
                  [msk2] "x" (_mm256_set1_epi32(0x003F03F0)),
                  [msk3] "x" (_mm256_set1_epi32(0x01000010)),
                  [n51]  "x" (_mm256_set1_epi8(51)),
                  [n25]  "x" (_mm256_set1_epi8(25))

                // Clobbers.
                : "cc", "memory"
        );
}

#pragma GCC diagnostic pop
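
// A minimal usage sketch (hypothetical caller, not shown in this file): the
// function consumes whole 24-byte blocks from *s, advances the source and
// destination pointers, decrements *slen and increments *olen accordingly,
// and leaves any remaining tail bytes for the caller to encode by other
// means:
//
//      const uint8_t *src    = input;           // hypothetical buffers
//      uint8_t       *dst    = output;
//      size_t         srclen = input_length;
//      size_t         dstlen = 0;
//
//      enc_loop_avx2(&src, &srclen, &dst, &dstlen);
//      // dstlen bytes of base64 were written to `output`; src and srclen
//      // now describe the unencoded remainder.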