  * [2] zlib adler32_z() uses this fact to implement NMAX-block-based updates
  * of the adler s1 s2 of uint32_t type (see adler32.c).
  */
-/* Copyright (C) 2023 SiFive, Inc. All rights reserved.
- * For conditions of distribution and use, see copyright notice in zlib.h
- */
 
 #include "adler32_simd.h"
 
@@ -368,103 +365,92 @@ uint32_t ZLIB_INTERNAL adler32_simd_( /* NEON */
 
 #elif defined(ADLER32_SIMD_RVV)
 #include <riscv_vector.h>
-/* adler32_rvv.c - RVV version of Adler-32
- * RVV 1.0 code contributed by Alex Chiang <alex.chiang@sifive.com>
- * on https://github.com/zlib-ng/zlib-ng/pull/1532
- * Port from Simon Hosie's fork:
- * https://github.com/cloudflare/zlib/commit/40688b53c61cb9bfc36471acd2dc0800b7ebcab1
+
+/*
+ * Patch by Simon Hosie, from:
+ * https://github.com/cloudflare/zlib/pull/55
  */
 
 uint32_t ZLIB_INTERNAL adler32_simd_( /* RVV */
     uint32_t adler,
     const unsigned char *buf,
     unsigned long len)
 {
-    /* split Adler-32 into component sums */
-    uint32_t sum2 = (adler >> 16) & 0xffff;
-    adler &= 0xffff;
-
-    size_t left = len;
-    size_t vl = __riscv_vsetvlmax_e8m1();
-    vl = vl > 256 ? 256 : vl;
-    vuint32m4_t v_buf32_accu = __riscv_vmv_v_x_u32m4(0, vl);
-    vuint32m4_t v_adler32_prev_accu = __riscv_vmv_v_x_u32m4(0, vl);
-    vuint16m2_t v_buf16_accu;
-
-    /*
-     * We accumulate 8-bit data, and to prevent overflow, we have to use a 32-bit accumulator.
-     * However, adding 8-bit data into a 32-bit accumulator isn't efficient. We use 16-bit & 32-bit
-     * accumulators to boost performance.
-     *
-     * The block_size is the largest multiple of vl that <= 256, because overflow would occur when
-     * vl > 256 (255 * 256 <= UINT16_MAX).
-     *
-     * We accumulate 8-bit data into a 16-bit accumulator and then
-     * move the data into the 32-bit accumulator at the last iteration.
+    size_t vl = __riscv_vsetvlmax_e8m2();
+    const vuint16m4_t zero16 = __riscv_vmv_v_x_u16m4(0, vl);
+    vuint16m4_t a_sum = zero16;
+    vuint32m8_t b_sum = __riscv_vmv_v_x_u32m8(0, vl);
+
+    /* Deal with the part which is not a multiple of vl first; because it's
+     * easier to zero-stuff the beginning of the checksum than it is to tweak the
+     * multipliers and sums for odd lengths afterwards.
+     */
+    size_t head = len & (vl - 1);
+    if (head > 0) {
+        vuint8m2_t zero8 = __riscv_vmv_v_x_u8m2(0, vl);
+        vuint8m2_t in = __riscv_vle8_v_u8m2(buf, vl);
+        in = __riscv_vslideup(zero8, in, vl - head, vl);
+        vuint16m4_t in16 = __riscv_vwcvtu_x(in, vl);
+        a_sum = in16;
+        buf += head;
+    }
+
+    /* We have a 32-bit accumulator, and in each iteration we add 22-times a
+     * 16-bit value, plus another 16-bit value. We periodically subtract up to
+     * 65535 times BASE to avoid overflow. b_overflow estimates how often we
+     * need to do this subtraction.
+     */
+    const int b_overflow = BASE / 23;
+    int fixup = b_overflow;
+    ssize_t iters = (len - head) / vl;
+    while (iters > 0) {
+        const vuint16m4_t a_overflow = __riscv_vrsub(a_sum, BASE, vl);
+        int batch = iters < 22 ? iters : 22;
+        iters -= batch;
+        b_sum = __riscv_vwmaccu(b_sum, batch, a_sum, vl);
+        vuint16m4_t a_batch = zero16, b_batch = zero16;
+
+        /* Do a short batch, where neither a_sum nor b_sum can overflow a 16-bit
+         * register. Then add them back into the main accumulators.
          */
-    size_t block_size = (256 / vl) * vl;
-    size_t nmax_limit = (NMAX / block_size);
-    size_t cnt = 0;
-    while (left >= block_size) {
-        v_buf16_accu = __riscv_vmv_v_x_u16m2(0, vl);
-        size_t subprob = block_size;
-        while (subprob > 0) {
-            vuint8m1_t v_buf8 = __riscv_vle8_v_u8m1(buf, vl);
-            v_adler32_prev_accu = __riscv_vwaddu_wv_u32m4(v_adler32_prev_accu, v_buf16_accu, vl);
-            v_buf16_accu = __riscv_vwaddu_wv_u16m2(v_buf16_accu, v_buf8, vl);
-            buf += vl;
-            subprob -= vl;
-        }
-        v_adler32_prev_accu = __riscv_vmacc_vx_u32m4(v_adler32_prev_accu, block_size / vl, v_buf32_accu, vl);
-        v_buf32_accu = __riscv_vwaddu_wv_u32m4(v_buf32_accu, v_buf16_accu, vl);
-        left -= block_size;
-        /* do modulo once each block of NMAX size */
-        if (++cnt >= nmax_limit) {
-            v_adler32_prev_accu = __riscv_vremu_vx_u32m4(v_adler32_prev_accu, BASE, vl);
-            cnt = 0;
-        }
+        while (batch-- > 0) {
+            vuint8m2_t in8 = __riscv_vle8_v_u8m2(buf, vl);
+            buf += vl;
+            b_batch = __riscv_vadd(b_batch, a_batch, vl);
+            a_batch = __riscv_vwaddu_wv(a_batch, in8, vl);
         }
-    /* the left len <= 256 now, we can use 16-bit accum safely */
-    v_buf16_accu = __riscv_vmv_v_x_u16m2(0, vl);
-    size_t res = left;
-    while (left >= vl) {
-        vuint8m1_t v_buf8 = __riscv_vle8_v_u8m1(buf, vl);
-        v_adler32_prev_accu = __riscv_vwaddu_wv_u32m4(v_adler32_prev_accu, v_buf16_accu, vl);
-        v_buf16_accu = __riscv_vwaddu_wv_u16m2(v_buf16_accu, v_buf8, vl);
-        buf += vl;
-        left -= vl;
+        vbool4_t ov = __riscv_vmsgeu(a_batch, a_overflow, vl);
+        a_sum = __riscv_vadd(a_sum, a_batch, vl);
+        a_sum = __riscv_vadd_mu(ov, a_sum, a_sum, 65536 - BASE, vl);
+        b_sum = __riscv_vwaddu_wv(b_sum, b_batch, vl);
+        if (--fixup <= 0) {
+            b_sum = __riscv_vnmsac(b_sum, BASE, __riscv_vsrl(b_sum, 16, vl), vl);
+            fixup = b_overflow;
         }
-    v_adler32_prev_accu = __riscv_vmacc_vx_u32m4(v_adler32_prev_accu, res / vl, v_buf32_accu, vl);
-    v_adler32_prev_accu = __riscv_vremu_vx_u32m4(v_adler32_prev_accu, BASE, vl);
-    v_buf32_accu = __riscv_vwaddu_wv_u32m4(v_buf32_accu, v_buf16_accu, vl);
-
-    vuint32m4_t v_seq = __riscv_vid_v_u32m4(vl);
-    vuint32m4_t v_rev_seq = __riscv_vrsub_vx_u32m4(v_seq, vl, vl);
-    vuint32m4_t v_sum32_accu = __riscv_vmul_vv_u32m4(v_buf32_accu, v_rev_seq, vl);
-
-    v_sum32_accu = __riscv_vadd_vv_u32m4(v_sum32_accu, __riscv_vmul_vx_u32m4(v_adler32_prev_accu, vl, vl), vl);
-
-    vuint32m1_t v_sum2_sum = __riscv_vmv_s_x_u32m1(0, vl);
-    v_sum2_sum = __riscv_vredsum_vs_u32m4_u32m1(v_sum32_accu, v_sum2_sum, vl);
-    uint32_t sum2_sum = __riscv_vmv_x_s_u32m1_u32(v_sum2_sum);
-
-    sum2 += (sum2_sum + adler * (len - left));
-
-    vuint32m1_t v_adler_sum = __riscv_vmv_s_x_u32m1(0, vl);
-    v_adler_sum = __riscv_vredsum_vs_u32m4_u32m1(v_buf32_accu, v_adler_sum, vl);
-    uint32_t adler_sum = __riscv_vmv_x_s_u32m1_u32(v_adler_sum);
-
-    adler += adler_sum;
-
-    while (left--) {
-        adler += *buf++;
-        sum2 += adler;
-    }
-
-    sum2 %= BASE;
-    adler %= BASE;
-
-    return adler | (sum2 << 16);
+    }
+    /* Adjust per-lane sums to have appropriate offsets from the end of the
+     * buffer.
+     */
+    const vuint16m4_t off = __riscv_vrsub(__riscv_vid_v_u16m4(vl), vl, vl);
+    vuint16m4_t bsum16 = __riscv_vncvt_x(__riscv_vremu(b_sum, BASE, vl), vl);
+    b_sum = __riscv_vadd(__riscv_vwmulu(a_sum, off, vl),
+                         __riscv_vwmulu(bsum16, vl, vl), vl);
+    bsum16 = __riscv_vncvt_x(__riscv_vremu(b_sum, BASE, vl), vl);
+
+    /* And finally, do a horizontal sum across the registers for the final
+     * result.
+     */
+    uint32_t a = adler & 0xffff;
+    uint32_t b = ((adler >> 16) + a * (len % BASE)) % BASE;
+    vuint32m1_t sca = __riscv_vmv_v_x_u32m1(a, 1);
+    vuint32m1_t scb = __riscv_vmv_v_x_u32m1(b, 1);
+    sca = __riscv_vwredsumu(a_sum, sca, vl);
+    scb = __riscv_vwredsumu(bsum16, scb, vl);
+    a = __riscv_vmv_x(sca);
+    b = __riscv_vmv_x(scb);
+    a %= BASE;
+    b %= BASE;
+    return (b << 16) | a;
 }
 
 #endif /* ADLER32_SIMD_SSSE3 */
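
Both the removed and the added RVV paths compute the same checksum that zlib's scalar adler32() defines: a byte sum a and a sum-of-sums b, each reduced modulo BASE, with the reduction deferred across bounded blocks so the accumulators cannot overflow. A minimal scalar sketch of that arithmetic, not part of this patch (the helper name is invented here; BASE = 65521 and NMAX = 5552 are the constants from zlib's adler32.c):

#include <stddef.h>
#include <stdint.h>

#define BASE 65521u  /* largest prime below 65536 */
#define NMAX 5552    /* largest n with 255*n*(n+1)/2 + (n+1)*(BASE-1) <= 2^32-1 */

/* Hypothetical reference helper, for illustration only. */
static uint32_t adler32_scalar_sketch(uint32_t adler,
                                      const unsigned char *buf, size_t len)
{
    uint32_t a = adler & 0xffff;          /* running byte sum       */
    uint32_t b = (adler >> 16) & 0xffff;  /* running sum of the a's */

    while (len > 0) {
        size_t n = len < NMAX ? len : NMAX;
        len -= n;
        while (n--) {
            a += *buf++;
            b += a;
        }
        a %= BASE;  /* one reduction per NMAX-byte block is enough */
        b %= BASE;
    }
    return (b << 16) | a;
}

The same headroom argument is what the new code's fixup step leans on: even 65535 pending multiples of BASE (65535 * 65521 = 4,293,918,735) still fit in a 32-bit b_sum lane.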
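
The "Adjust per-lane sums to have appropriate offsets" step in the new code rests on an identity between serial and column-wise sums: if lane j accumulates A[j] (the bytes it saw) and B[j] (the running A[j], added once per row before the new byte), then the serial sum-of-sums over a block of ROWS*VL bytes equals the sum over j of (VL - j)*A[j] + VL*B[j], which is what the off vector and the two widening multiplies compute before the horizontal reduction (the carried-in adler value and the modulo-BASE reductions are folded in separately). A small standalone check of that identity, with toy sizes and data invented for illustration:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define VL   8   /* stand-in for the vector length   */
#define ROWS 5   /* number of VL-wide rows processed */
#define N    (VL * ROWS)

int main(void)
{
    unsigned char x[N];
    for (int i = 0; i < N; i++)
        x[i] = (unsigned char)(i * 37 + 11);   /* arbitrary test data */

    /* Serial sums: S = sum of bytes, T = sum over i of (N - i) * x[i],
     * i.e. the contribution these bytes make to Adler-32's b term. */
    uint64_t S = 0, T = 0;
    for (int i = 0; i < N; i++) {
        S += x[i];
        T += (uint64_t)(N - i) * x[i];
    }

    /* Column-wise per-lane sums, accumulated the way the vector loop does:
     * add the old A[j] into B[j] before folding in the new row. */
    uint64_t A[VL] = {0}, B[VL] = {0};
    for (int k = 0; k < ROWS; k++) {
        for (int j = 0; j < VL; j++) {
            B[j] += A[j];
            A[j] += x[k * VL + j];
        }
    }

    /* Recombine with the same per-lane offsets the patch uses:
     * off[j] = VL - j, so T = sum_j (off[j] * A[j] + VL * B[j]). */
    uint64_t S2 = 0, T2 = 0;
    for (int j = 0; j < VL; j++) {
        S2 += A[j];
        T2 += (uint64_t)(VL - j) * A[j] + (uint64_t)VL * B[j];
    }

    assert(S == S2 && T == T2);
    printf("serial S=%llu T=%llu match the lane recombination\n",
           (unsigned long long)S, (unsigned long long)T);
    return 0;
}

Building this with any C compiler and running it should trip no assertion; that equality is the property the off/vwmulu/vwredsumu sequence in the patch depends on.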