Skip to content

Commit

Permalink
deps: update zlib to 337322d
Browse files Browse the repository at this point in the history
PR-URL: #48218
Reviewed-By: James M Snell <jasnell@gmail.com>
Reviewed-By: Luigi Pinca <luigipinca@gmail.com>
  • Loading branch information
nodejs-github-bot authored and MoLow committed Jul 6, 2023
1 parent 7558ef3 commit 4d1c38b
Show file tree
Hide file tree
Showing 26 changed files with 1,273 additions and 28 deletions.
4 changes: 4 additions & 0 deletions deps/zlib/BUILD.gn
Expand Up @@ -515,6 +515,10 @@ if (build_with_chromium) {

data = [ "google/test/data/" ]

if (is_ios) {
bundle_deps = [ "google:zlib_pak_bundle_data" ]
}

deps = [
":zlib",
"google:compression_utils",
Expand Down
10 changes: 8 additions & 2 deletions deps/zlib/CMakeLists.txt
Expand Up @@ -3,7 +3,7 @@ set(CMAKE_ALLOW_LOOSE_LOOP_CONSTRUCTS ON)

project(zlib C)

set(VERSION "1.2.13")
set(VERSION "1.2.13.1")

set(INSTALL_BIN_DIR "${CMAKE_INSTALL_PREFIX}/bin" CACHE PATH "Installation directory for executables")
set(INSTALL_LIB_DIR "${CMAKE_INSTALL_PREFIX}/lib" CACHE PATH "Installation directory for libraries")
Expand All @@ -22,6 +22,7 @@ check_include_file(stdint.h HAVE_STDINT_H)
check_include_file(stddef.h HAVE_STDDEF_H)

option(ENABLE_SIMD_OPTIMIZATIONS "Enable all SIMD optimizations" OFF)
option(ENABLE_SIMD_AVX512 "Enable SIMD AXV512 optimizations" OFF)

# TODO(cavalcantii): add support for other OSes (e.g. Android, fuchsia, osx)
# and architectures (e.g. Arm).
Expand All @@ -30,8 +31,13 @@ if (ENABLE_SIMD_OPTIMIZATIONS)
add_definitions(-DADLER32_SIMD_SSSE3)
add_definitions(-DINFLATE_CHUNK_READ_64LE)
add_definitions(-DCRC32_SIMD_SSE42_PCLMUL)
if (ENABLE_SIMD_AVX512)
add_definitions(-DCRC32_SIMD_AVX512_PCLMUL)
add_compile_options(-mvpclmulqdq -msse2 -mavx512f -mpclmul)
else()
add_compile_options(-msse4.2 -mpclmul)
endif()
add_definitions(-DDEFLATE_SLIDE_HASH_SSE2)
add_compile_options(-msse4.2 -mpclmul)
# Required by CPU features detection code.
add_definitions(-DX86_NOT_WINDOWS)
# Apparently some environments (e.g. CentOS) require to explicitly link
Expand Down
2 changes: 2 additions & 0 deletions deps/zlib/contrib/optimizations/inflate.c
Expand Up @@ -257,6 +257,8 @@ int value;
struct inflate_state FAR *state;

if (inflateStateCheck(strm)) return Z_STREAM_ERROR;
if (bits == 0)
return Z_OK;
state = (struct inflate_state FAR *)strm->state;
if (bits < 0) {
state->hold = 0;
Expand Down
Expand Up @@ -12,7 +12,7 @@

#include <fuzzer/FuzzedDataProvider.h>

#include "third_party/zlib/zlib.h"
#include "zlib.h"

// Fuzzer builds often have NDEBUG set, so roll our own assert macro.
#define ASSERT(cond) \
Expand Down
9 changes: 9 additions & 0 deletions deps/zlib/cpu_features.c
Expand Up @@ -31,6 +31,7 @@ int ZLIB_INTERNAL arm_cpu_enable_pmull = 0;
int ZLIB_INTERNAL x86_cpu_enable_sse2 = 0;
int ZLIB_INTERNAL x86_cpu_enable_ssse3 = 0;
int ZLIB_INTERNAL x86_cpu_enable_simd = 0;
int ZLIB_INTERNAL x86_cpu_enable_avx512 = 0;

#ifndef CPU_NO_SIMD

Expand Down Expand Up @@ -138,6 +139,10 @@ static void _cpu_check_features(void)
/* On x86 we simply use an instruction to check the CPU features.
* (i.e. CPUID).
*/
#ifdef CRC32_SIMD_AVX512_PCLMUL
#include <immintrin.h>
#include <xsaveintrin.h>
#endif
static void _cpu_check_features(void)
{
int x86_cpu_has_sse2;
Expand All @@ -164,6 +169,10 @@ static void _cpu_check_features(void)
x86_cpu_enable_simd = x86_cpu_has_sse2 &&
x86_cpu_has_sse42 &&
x86_cpu_has_pclmulqdq;

#ifdef CRC32_SIMD_AVX512_PCLMUL
x86_cpu_enable_avx512 = _xgetbv(0) & 0x00000040;
#endif
}
#endif
#endif
Expand Down
1 change: 1 addition & 0 deletions deps/zlib/cpu_features.h
Expand Up @@ -14,5 +14,6 @@ extern int arm_cpu_enable_pmull;
extern int x86_cpu_enable_sse2;
extern int x86_cpu_enable_ssse3;
extern int x86_cpu_enable_simd;
extern int x86_cpu_enable_avx512;

void cpu_check_features(void);
14 changes: 13 additions & 1 deletion deps/zlib/crc32.c
Expand Up @@ -773,7 +773,19 @@ unsigned long ZEXPORT crc32_z(crc, buf, len)
}

#endif
#if defined(CRC32_SIMD_SSE42_PCLMUL)
#if defined(CRC32_SIMD_AVX512_PCLMUL)
if (x86_cpu_enable_avx512 && len >= Z_CRC32_AVX512_MINIMUM_LENGTH) {
/* crc32 64-byte chunks */
z_size_t chunk_size = len & ~Z_CRC32_AVX512_CHUNKSIZE_MASK;
crc = ~crc32_avx512_simd_(buf, chunk_size, ~(uint32_t)crc);
/* check remaining data */
len -= chunk_size;
if (!len)
return crc;
/* Fall into the default crc32 for the remaining data. */
buf += chunk_size;
}
#elif defined(CRC32_SIMD_SSE42_PCLMUL)
if (x86_cpu_enable_simd && len >= Z_CRC32_SSE42_MINIMUM_LENGTH) {
/* crc32 16-byte chunks */
z_size_t chunk_size = len & ~Z_CRC32_SSE42_CHUNKSIZE_MASK;
Expand Down
198 changes: 194 additions & 4 deletions deps/zlib/crc32_simd.c
Expand Up @@ -6,17 +6,207 @@
*/

#include "crc32_simd.h"

#if defined(CRC32_SIMD_SSE42_PCLMUL)
#if defined(CRC32_SIMD_AVX512_PCLMUL)

/*
* crc32_sse42_simd_(): compute the crc32 of the buffer, where the buffer
* length must be at least 64, and a multiple of 16. Based on:
* crc32_avx512_simd_(): compute the crc32 of the buffer, where the buffer
* length must be at least 256, and a multiple of 64. Based on:
*
* "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
* V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0
*/

#include <emmintrin.h>
#include <smmintrin.h>
#include <wmmintrin.h>
#include <immintrin.h>

/*
 * crc32_avx512_simd_(): fold `len` bytes of `buf` into the running `crc`
 * using 512-bit carry-less multiplies.
 *
 * buf - input data; always read with unaligned loads.
 * len - number of bytes to process. Must be at least 256 and a multiple of
 *       64: the first 256 bytes are loaded unconditionally, and a tail
 *       shorter than 64 bytes is never read.
 * crc - starting crc value, XORed into the first 32 bits of the data.
 *
 * Returns the 32-bit value left after the final Barrett reduction. This
 * function applies no bit inversion to `crc` or to the result.
 */
uint32_t ZLIB_INTERNAL crc32_avx512_simd_(  /* AVX512+PCLMUL */
        const unsigned char *buf,
        z_size_t len,
        uint32_t crc)
{
    /*
     * Definitions of the bit-reflected domain constants k1,k2,k3,k4
     * are similar to those given at the end of the paper, and remaining
     * constants and CRC32+Barrett polynomials remain unchanged.
     *
     * Replace the index of x from 128 to 512. As follows:
     * k1 = ( x ^ ( 512 * 4 + 32 ) mod P(x) << 32 )' << 1 = 0x011542778a
     * k2 = ( x ^ ( 512 * 4 - 32 ) mod P(x) << 32 )' << 1 = 0x01322d1430
     * k3 = ( x ^ ( 512 + 32 ) mod P(x) << 32 )' << 1 = 0x0154442bd4
     * k4 = ( x ^ ( 512 - 32 ) mod P(x) << 32 )' << 1 = 0x01c6e41596
     *
     * Each pair is repeated four times so that every 128-bit lane of a
     * 512-bit register sees the same two constants.
     */
    static const uint64_t zalign(64) k1k2[] = { 0x011542778a, 0x01322d1430,
                                                0x011542778a, 0x01322d1430,
                                                0x011542778a, 0x01322d1430,
                                                0x011542778a, 0x01322d1430 };
    static const uint64_t zalign(64) k3k4[] = { 0x0154442bd4, 0x01c6e41596,
                                                0x0154442bd4, 0x01c6e41596,
                                                0x0154442bd4, 0x01c6e41596,
                                                0x0154442bd4, 0x01c6e41596 };
    static const uint64_t zalign(16) k5k6[] = { 0x01751997d0, 0x00ccaa009e };
    static const uint64_t zalign(16) k7k8[] = { 0x0163cd6124, 0x0000000000 };
    static const uint64_t zalign(16) poly[] = { 0x01db710641, 0x01f7011641 };
    __m512i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
    __m128i a0, a1, a2, a3;

    /*
     * There's at least one block of 256.
     */
    x1 = _mm512_loadu_si512((const __m512i *)(buf + 0x00));
    x2 = _mm512_loadu_si512((const __m512i *)(buf + 0x40));
    x3 = _mm512_loadu_si512((const __m512i *)(buf + 0x80));
    x4 = _mm512_loadu_si512((const __m512i *)(buf + 0xC0));

    /*
     * XOR the incoming crc into the low 32 bits of the first block. The
     * widened vector is built on top of an explicitly zeroed register:
     * _mm512_castsi128_si512() leaves the upper 384 bits undefined, and
     * XORing undefined bits into the data would corrupt the checksum.
     */
    x1 = _mm512_xor_si512(
        x1,
        _mm512_inserti32x4(_mm512_setzero_si512(), _mm_cvtsi32_si128(crc), 0));

    x0 = _mm512_load_si512((const __m512i *)k1k2);

    buf += 256;
    len -= 256;

    /*
     * Parallel fold blocks of 256, if any.
     */
    while (len >= 256)
    {
        /* Low 64-bit halves of each lane times the low constant (k1). */
        x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
        x6 = _mm512_clmulepi64_epi128(x2, x0, 0x00);
        x7 = _mm512_clmulepi64_epi128(x3, x0, 0x00);
        x8 = _mm512_clmulepi64_epi128(x4, x0, 0x00);

        /* High 64-bit halves of each lane times the high constant (k2). */
        x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
        x2 = _mm512_clmulepi64_epi128(x2, x0, 0x11);
        x3 = _mm512_clmulepi64_epi128(x3, x0, 0x11);
        x4 = _mm512_clmulepi64_epi128(x4, x0, 0x11);

        y5 = _mm512_loadu_si512((const __m512i *)(buf + 0x00));
        y6 = _mm512_loadu_si512((const __m512i *)(buf + 0x40));
        y7 = _mm512_loadu_si512((const __m512i *)(buf + 0x80));
        y8 = _mm512_loadu_si512((const __m512i *)(buf + 0xC0));

        /* Combine both partial products with the next 256 bytes of data. */
        x1 = _mm512_xor_si512(x1, x5);
        x2 = _mm512_xor_si512(x2, x6);
        x3 = _mm512_xor_si512(x3, x7);
        x4 = _mm512_xor_si512(x4, x8);

        x1 = _mm512_xor_si512(x1, y5);
        x2 = _mm512_xor_si512(x2, y6);
        x3 = _mm512_xor_si512(x3, y7);
        x4 = _mm512_xor_si512(x4, y8);

        buf += 256;
        len -= 256;
    }

    /*
     * Fold into 512-bits: x1 is folded onto x2, then x3, then x4.
     */
    x0 = _mm512_load_si512((const __m512i *)k3k4);

    x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
    x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
    x1 = _mm512_xor_si512(x1, x2);
    x1 = _mm512_xor_si512(x1, x5);

    x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
    x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
    x1 = _mm512_xor_si512(x1, x3);
    x1 = _mm512_xor_si512(x1, x5);

    x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
    x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
    x1 = _mm512_xor_si512(x1, x4);
    x1 = _mm512_xor_si512(x1, x5);

    /*
     * Single fold blocks of 64, if any.
     */
    while (len >= 64)
    {
        x2 = _mm512_loadu_si512((const __m512i *)buf);

        x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
        x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
        x1 = _mm512_xor_si512(x1, x2);
        x1 = _mm512_xor_si512(x1, x5);

        buf += 64;
        len -= 64;
    }

    /*
     * Fold 512-bits to 384-bits: 128-bit lane 0 is folded onto lane 1.
     */
    a0 = _mm_load_si128((const __m128i *)k5k6);

    a1 = _mm512_extracti32x4_epi32(x1, 0);
    a2 = _mm512_extracti32x4_epi32(x1, 1);

    a3 = _mm_clmulepi64_si128(a1, a0, 0x00);
    a1 = _mm_clmulepi64_si128(a1, a0, 0x11);

    a1 = _mm_xor_si128(a1, a3);
    a1 = _mm_xor_si128(a1, a2);

    /*
     * Fold 384-bits to 256-bits.
     */
    a2 = _mm512_extracti32x4_epi32(x1, 2);
    a3 = _mm_clmulepi64_si128(a1, a0, 0x00);
    a1 = _mm_clmulepi64_si128(a1, a0, 0x11);
    a1 = _mm_xor_si128(a1, a3);
    a1 = _mm_xor_si128(a1, a2);

    /*
     * Fold 256-bits to 128-bits.
     */
    a2 = _mm512_extracti32x4_epi32(x1, 3);
    a3 = _mm_clmulepi64_si128(a1, a0, 0x00);
    a1 = _mm_clmulepi64_si128(a1, a0, 0x11);
    a1 = _mm_xor_si128(a1, a3);
    a1 = _mm_xor_si128(a1, a2);

    /*
     * Fold 128-bits to 64-bits.
     */
    a2 = _mm_clmulepi64_si128(a1, a0, 0x10);
    a3 = _mm_setr_epi32(~0, 0, ~0, 0);  /* keeps the low 32 bits of each half */
    a1 = _mm_srli_si128(a1, 8);
    a1 = _mm_xor_si128(a1, a2);

    a0 = _mm_loadl_epi64((const __m128i *)k7k8);
    a2 = _mm_srli_si128(a1, 4);
    a1 = _mm_and_si128(a1, a3);
    a1 = _mm_clmulepi64_si128(a1, a0, 0x00);
    a1 = _mm_xor_si128(a1, a2);

    /*
     * Barrett reduce to 32-bits.
     */
    a0 = _mm_load_si128((const __m128i *)poly);

    a2 = _mm_and_si128(a1, a3);
    a2 = _mm_clmulepi64_si128(a2, a0, 0x10);
    a2 = _mm_and_si128(a2, a3);
    a2 = _mm_clmulepi64_si128(a2, a0, 0x00);
    a1 = _mm_xor_si128(a1, a2);

    /*
     * Return the crc32.
     */
    return _mm_extract_epi32(a1, 1);
}

#elif defined(CRC32_SIMD_SSE42_PCLMUL)

/*
* crc32_sse42_simd_(): compute the crc32 of the buffer, where the buffer
* length must be at least 64, and a multiple of 16.
*/

#include <emmintrin.h>
#include <smmintrin.h>
#include <wmmintrin.h>
Expand Down
6 changes: 6 additions & 0 deletions deps/zlib/crc32_simd.h
Expand Up @@ -19,12 +19,18 @@ uint32_t ZLIB_INTERNAL crc32_sse42_simd_(const unsigned char* buf,
z_size_t len,
uint32_t crc);

uint32_t ZLIB_INTERNAL crc32_avx512_simd_(const unsigned char* buf,
z_size_t len,
uint32_t crc);

/*
* crc32_sse42_simd_ buffer size constraints: see the use in zlib/crc32.c
* for computing the crc32 of an arbitrary length buffer.
*/
#define Z_CRC32_SSE42_MINIMUM_LENGTH 64
#define Z_CRC32_SSE42_CHUNKSIZE_MASK 15
#define Z_CRC32_AVX512_MINIMUM_LENGTH 256
#define Z_CRC32_AVX512_CHUNKSIZE_MASK 63

/*
* CRC32 checksums using ARMv8-a crypto instructions.
Expand Down
6 changes: 4 additions & 2 deletions deps/zlib/crc_folding.c
Expand Up @@ -435,7 +435,10 @@ unsigned ZLIB_INTERNAL crc_fold_512to32(deflate_state *const s)
unsigned crc;
__m128i x_tmp0, x_tmp1, x_tmp2, crc_fold;

CRC_LOAD(s)
__m128i xmm_crc0 = _mm_loadu_si128((__m128i *)s->crc0 + 0);
__m128i xmm_crc1 = _mm_loadu_si128((__m128i *)s->crc0 + 1);
__m128i xmm_crc2 = _mm_loadu_si128((__m128i *)s->crc0 + 2);
__m128i xmm_crc3 = _mm_loadu_si128((__m128i *)s->crc0 + 3);

/*
* k1
Expand Down Expand Up @@ -491,7 +494,6 @@ unsigned ZLIB_INTERNAL crc_fold_512to32(deflate_state *const s)

crc = _mm_extract_epi32(xmm_crc3, 2);
return ~crc;
CRC_SAVE(s)
}

#endif /* CRC32_SIMD_SSE42_PCLMUL */
5 changes: 3 additions & 2 deletions deps/zlib/deflate.c
Expand Up @@ -65,7 +65,7 @@
#endif

const char deflate_copyright[] =
" deflate 1.2.13 Copyright 1995-2022 Jean-loup Gailly and Mark Adler ";
" deflate 1.2.13.1 Copyright 1995-2022 Jean-loup Gailly and Mark Adler ";
/*
If you use the zlib library in a product, an acknowledgment is welcome
in the documentation of your product. If for some reason you cannot
Expand Down Expand Up @@ -774,7 +774,8 @@ uLong ZEXPORT deflateBound(strm, sourceLen)

/* if not default parameters, return one of the conservative bounds */
if (s->w_bits != 15 || s->hash_bits != 8 + 7)
return (s->w_bits <= s->hash_bits ? fixedlen : storelen) + wraplen;
return (s->w_bits <= s->hash_bits && s->level ? fixedlen : storelen) +
wraplen;

/* default settings: return tight bound for that case -- ~0.03% overhead
plus a small constant */
Expand Down

0 comments on commit 4d1c38b

Please sign in to comment.