diff --git a/CHANGELOG.md b/CHANGELOG.md index 96f697c043ff..bcf6b104505a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,7 @@ * [6179](https://github.com/grafana/loki/pull/6179) **chaudum**: Add new HTTP endpoint to delete ingester ring token file and shutdown process gracefully * [5997](https://github.com/grafana/loki/pull/5997) **simonswine**: Querier: parallize label queries to both stores. * [5406](https://github.com/grafana/loki/pull/5406) **ctovena**: Revise the configuration parameters that configure the usage report to grafana.com. +* [7263](https://github.com/grafana/loki/pull/7263) **bboreham**: Dependencies: klauspost/compress package to v1.15.11; improves performance. ##### Fixes * [7040](https://github.com/grafana/loki/pull/7040) **bakunowski**: Remove duplicated `loki_boltdb_shipper` prefix from `tables_upload_operation_total` metric. diff --git a/go.mod b/go.mod index f0c5c81411cb..16df8aef85bd 100644 --- a/go.mod +++ b/go.mod @@ -63,7 +63,7 @@ require ( github.com/jmespath/go-jmespath v0.4.0 github.com/joncrlsn/dque v2.2.1-0.20200515025108-956d14155fa2+incompatible github.com/json-iterator/go v1.1.12 - github.com/klauspost/compress v1.14.1 + github.com/klauspost/compress v1.15.11 github.com/klauspost/pgzip v1.2.5 github.com/mattn/go-ieproxy v0.0.1 github.com/minio/minio-go/v7 v7.0.24 diff --git a/go.sum b/go.sum index ca3ee26c9765..f8d6eac78792 100644 --- a/go.sum +++ b/go.sum @@ -1082,8 +1082,8 @@ github.com/klauspost/compress v1.9.5/go.mod h1:RyIbtBH6LamlWaDj8nUwkbUhJ87Yi3uG0 github.com/klauspost/compress v1.11.0/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs= github.com/klauspost/compress v1.13.4/go.mod h1:8dP1Hq4DHOhN9w426knH3Rhby4rFm6D8eO+e+Dq5Gzg= github.com/klauspost/compress v1.13.6/go.mod h1:/3/Vjq9QcHkK5uEr5lBEmyoZ1iFhe47etQ6QUkpK6sk= -github.com/klauspost/compress v1.14.1 h1:hLQYb23E8/fO+1u53d02A97a8UnsddcvYzq4ERRU4ds= -github.com/klauspost/compress v1.14.1/go.mod h1:/3/Vjq9QcHkK5uEr5lBEmyoZ1iFhe47etQ6QUkpK6sk= +github.com/klauspost/compress v1.15.11 h1:Lcadnb3RKGin4FYM/orgq0qde+nc15E5Cbqg4B9Sx9c= +github.com/klauspost/compress v1.15.11/go.mod h1:QPwzmACJjUTFsnSHH934V6woptycfrDDJnH7hvFVbGM= github.com/klauspost/cpuid v1.2.3/go.mod h1:Pj4uuM528wm8OyEC2QMXAi2YiTZ96dNQPGgoMS4s3ek= github.com/klauspost/cpuid v1.3.1 h1:5JNjFYYQrZeKRJ0734q51WCEEn2huer72Dc7K+R/b6s= github.com/klauspost/cpuid v1.3.1/go.mod h1:bYW4mA6ZgKPob1/Dlai2LviZJO7KGI3uoWLd42rAQw4= diff --git a/vendor/github.com/klauspost/compress/.gitignore b/vendor/github.com/klauspost/compress/.gitignore index b35f8449bf28..d31b37815279 100644 --- a/vendor/github.com/klauspost/compress/.gitignore +++ b/vendor/github.com/klauspost/compress/.gitignore @@ -23,3 +23,10 @@ _testmain.go *.test *.prof /s2/cmd/_s2sx/sfx-exe + +# Linux perf files +perf.data +perf.data.old + +# gdb history +.gdb_history diff --git a/vendor/github.com/klauspost/compress/README.md b/vendor/github.com/klauspost/compress/README.md index e8ff994f8bcb..3c00c1af9688 100644 --- a/vendor/github.com/klauspost/compress/README.md +++ b/vendor/github.com/klauspost/compress/README.md @@ -17,13 +17,136 @@ This package provides various compression algorithms. 
# changelog +* Sept 16, 2022 (v1.15.10) + + * zstd: Add [WithDecodeAllCapLimit](https://pkg.go.dev/github.com/klauspost/compress@v1.15.10/zstd#WithDecodeAllCapLimit) https://github.com/klauspost/compress/pull/649 + * Add Go 1.19 - deprecate Go 1.16 https://github.com/klauspost/compress/pull/651 + * flate: Improve level 5+6 compression https://github.com/klauspost/compress/pull/656 + * zstd: Improve "better" compresssion https://github.com/klauspost/compress/pull/657 + * s2: Improve "best" compression https://github.com/klauspost/compress/pull/658 + * s2: Improve "better" compression. https://github.com/klauspost/compress/pull/635 + * s2: Slightly faster non-assembly decompression https://github.com/klauspost/compress/pull/646 + * Use arrays for constant size copies https://github.com/klauspost/compress/pull/659 + +* July 21, 2022 (v1.15.9) + + * zstd: Fix decoder crash on amd64 (no BMI) on invalid input https://github.com/klauspost/compress/pull/645 + * zstd: Disable decoder extended memory copies (amd64) due to possible crashes https://github.com/klauspost/compress/pull/644 + * zstd: Allow single segments up to "max decoded size" by @klauspost in https://github.com/klauspost/compress/pull/643 + +* July 13, 2022 (v1.15.8) + + * gzip: fix stack exhaustion bug in Reader.Read https://github.com/klauspost/compress/pull/641 + * s2: Add Index header trim/restore https://github.com/klauspost/compress/pull/638 + * zstd: Optimize seqdeq amd64 asm by @greatroar in https://github.com/klauspost/compress/pull/636 + * zstd: Improve decoder memcopy https://github.com/klauspost/compress/pull/637 + * huff0: Pass a single bitReader pointer to asm by @greatroar in https://github.com/klauspost/compress/pull/634 + * zstd: Branchless getBits for amd64 w/o BMI2 by @greatroar in https://github.com/klauspost/compress/pull/640 + * gzhttp: Remove header before writing https://github.com/klauspost/compress/pull/639 + +* June 29, 2022 (v1.15.7) + + * s2: Fix absolute forward seeks https://github.com/klauspost/compress/pull/633 + * zip: Merge upstream https://github.com/klauspost/compress/pull/631 + * zip: Re-add zip64 fix https://github.com/klauspost/compress/pull/624 + * zstd: translate fseDecoder.buildDtable into asm by @WojciechMula in https://github.com/klauspost/compress/pull/598 + * flate: Faster histograms https://github.com/klauspost/compress/pull/620 + * deflate: Use compound hcode https://github.com/klauspost/compress/pull/622 + +* June 3, 2022 (v1.15.6) + * s2: Improve coding for long, close matches https://github.com/klauspost/compress/pull/613 + * s2c: Add Snappy/S2 stream recompression https://github.com/klauspost/compress/pull/611 + * zstd: Always use configured block size https://github.com/klauspost/compress/pull/605 + * zstd: Fix incorrect hash table placement for dict encoding in default https://github.com/klauspost/compress/pull/606 + * zstd: Apply default config to ZipDecompressor without options https://github.com/klauspost/compress/pull/608 + * gzhttp: Exclude more common archive formats https://github.com/klauspost/compress/pull/612 + * s2: Add ReaderIgnoreCRC https://github.com/klauspost/compress/pull/609 + * s2: Remove sanity load on index creation https://github.com/klauspost/compress/pull/607 + * snappy: Use dedicated function for scoring https://github.com/klauspost/compress/pull/614 + * s2c+s2d: Use official snappy framed extension https://github.com/klauspost/compress/pull/610 + +* May 25, 2022 (v1.15.5) + * s2: Add concurrent stream decompression 
https://github.com/klauspost/compress/pull/602 + * s2: Fix final emit oob read crash on amd64 https://github.com/klauspost/compress/pull/601 + * huff0: asm implementation of Decompress1X by @WojciechMula https://github.com/klauspost/compress/pull/596 + * zstd: Use 1 less goroutine for stream decoding https://github.com/klauspost/compress/pull/588 + * zstd: Copy literal in 16 byte blocks when possible https://github.com/klauspost/compress/pull/592 + * zstd: Speed up when WithDecoderLowmem(false) https://github.com/klauspost/compress/pull/599 + * zstd: faster next state update in BMI2 version of decode by @WojciechMula in https://github.com/klauspost/compress/pull/593 + * huff0: Do not check max size when reading table. https://github.com/klauspost/compress/pull/586 + * flate: Inplace hashing for level 7-9 by @klauspost in https://github.com/klauspost/compress/pull/590 + + +* May 11, 2022 (v1.15.4) + * huff0: decompress directly into output by @WojciechMula in [#577](https://github.com/klauspost/compress/pull/577) + * inflate: Keep dict on stack [#581](https://github.com/klauspost/compress/pull/581) + * zstd: Faster decoding memcopy in asm [#583](https://github.com/klauspost/compress/pull/583) + * zstd: Fix ignored crc [#580](https://github.com/klauspost/compress/pull/580) + +* May 5, 2022 (v1.15.3) + * zstd: Allow to ignore checksum checking by @WojciechMula [#572](https://github.com/klauspost/compress/pull/572) + * s2: Fix incorrect seek for io.SeekEnd in [#575](https://github.com/klauspost/compress/pull/575) + +* Apr 26, 2022 (v1.15.2) + * zstd: Add x86-64 assembly for decompression on streams and blocks. Contributed by [@WojciechMula](https://github.com/WojciechMula). Typically 2x faster. [#528](https://github.com/klauspost/compress/pull/528) [#531](https://github.com/klauspost/compress/pull/531) [#545](https://github.com/klauspost/compress/pull/545) [#537](https://github.com/klauspost/compress/pull/537) + * zstd: Add options to ZipDecompressor and fixes [#539](https://github.com/klauspost/compress/pull/539) + * s2: Use sorted search for index [#555](https://github.com/klauspost/compress/pull/555) + * Minimum version is Go 1.16, added CI test on 1.18. + +* Mar 11, 2022 (v1.15.1) + * huff0: Add x86 assembly of Decode4X by @WojciechMula in [#512](https://github.com/klauspost/compress/pull/512) + * zstd: Reuse zip decoders in [#514](https://github.com/klauspost/compress/pull/514) + * zstd: Detect extra block data and report as corrupted in [#520](https://github.com/klauspost/compress/pull/520) + * zstd: Handle zero sized frame content size stricter in [#521](https://github.com/klauspost/compress/pull/521) + * zstd: Add stricter block size checks in [#523](https://github.com/klauspost/compress/pull/523) + +* Mar 3, 2022 (v1.15.0) + * zstd: Refactor decoder by @klauspost in [#498](https://github.com/klauspost/compress/pull/498) + * zstd: Add stream encoding without goroutines by @klauspost in [#505](https://github.com/klauspost/compress/pull/505) + * huff0: Prevent single blocks exceeding 16 bits by @klauspost in[#507](https://github.com/klauspost/compress/pull/507) + * flate: Inline literal emission by @klauspost in [#509](https://github.com/klauspost/compress/pull/509) + * gzhttp: Add zstd to transport by @klauspost in [#400](https://github.com/klauspost/compress/pull/400) + * gzhttp: Make content-type optional by @klauspost in [#510](https://github.com/klauspost/compress/pull/510) + +Both compression and decompression now supports "synchronous" stream operations. 
This means that whenever "concurrency" is set to 1, they will operate without spawning goroutines. + +Stream decompression is now faster on asynchronous, since the goroutine allocation much more effectively splits the workload. On typical streams this will typically use 2 cores fully for decompression. When a stream has finished decoding no goroutines will be left over, so decoders can now safely be pooled and still be garbage collected. + +While the release has been extensively tested, it is recommended to testing when upgrading. + +
+ See changes to v1.14.x + +* Feb 22, 2022 (v1.14.4) + * flate: Fix rare huffman only (-2) corruption. [#503](https://github.com/klauspost/compress/pull/503) + * zip: Update deprecated CreateHeaderRaw to correctly call CreateRaw by @saracen in [#502](https://github.com/klauspost/compress/pull/502) + * zip: don't read data descriptor early by @saracen in [#501](https://github.com/klauspost/compress/pull/501) #501 + * huff0: Use static decompression buffer up to 30% faster by @klauspost in [#499](https://github.com/klauspost/compress/pull/499) [#500](https://github.com/klauspost/compress/pull/500) + +* Feb 17, 2022 (v1.14.3) + * flate: Improve fastest levels compression speed ~10% more throughput. [#482](https://github.com/klauspost/compress/pull/482) [#489](https://github.com/klauspost/compress/pull/489) [#490](https://github.com/klauspost/compress/pull/490) [#491](https://github.com/klauspost/compress/pull/491) [#494](https://github.com/klauspost/compress/pull/494) [#478](https://github.com/klauspost/compress/pull/478) + * flate: Faster decompression speed, ~5-10%. [#483](https://github.com/klauspost/compress/pull/483) + * s2: Faster compression with Go v1.18 and amd64 microarch level 3+. [#484](https://github.com/klauspost/compress/pull/484) [#486](https://github.com/klauspost/compress/pull/486) + +* Jan 25, 2022 (v1.14.2) + * zstd: improve header decoder by @dsnet [#476](https://github.com/klauspost/compress/pull/476) + * zstd: Add bigger default blocks [#469](https://github.com/klauspost/compress/pull/469) + * zstd: Remove unused decompression buffer [#470](https://github.com/klauspost/compress/pull/470) + * zstd: Fix logically dead code by @ningmingxiao [#472](https://github.com/klauspost/compress/pull/472) + * flate: Improve level 7-9 [#471](https://github.com/klauspost/compress/pull/471) [#473](https://github.com/klauspost/compress/pull/473) + * zstd: Add noasm tag for xxhash [#475](https://github.com/klauspost/compress/pull/475) + * Jan 11, 2022 (v1.14.1) * s2: Add stream index in [#462](https://github.com/klauspost/compress/pull/462) * flate: Speed and efficiency improvements in [#439](https://github.com/klauspost/compress/pull/439) [#461](https://github.com/klauspost/compress/pull/461) [#455](https://github.com/klauspost/compress/pull/455) [#452](https://github.com/klauspost/compress/pull/452) [#458](https://github.com/klauspost/compress/pull/458) * zstd: Performance improvement in [#420]( https://github.com/klauspost/compress/pull/420) [#456](https://github.com/klauspost/compress/pull/456) [#437](https://github.com/klauspost/compress/pull/437) [#467](https://github.com/klauspost/compress/pull/467) [#468](https://github.com/klauspost/compress/pull/468) * zstd: add arm64 xxhash assembly in [#464](https://github.com/klauspost/compress/pull/464) * Add garbled for binaries for s2 in [#445](https://github.com/klauspost/compress/pull/445) +
+
+ See changes to v1.13.x + * Aug 30, 2021 (v1.13.5) * gz/zlib/flate: Alias stdlib errors [#425](https://github.com/klauspost/compress/pull/425) * s2: Add block support to commandline tools [#413](https://github.com/klauspost/compress/pull/413) @@ -52,7 +175,12 @@ This package provides various compression algorithms. * Added [gzhttp](https://github.com/klauspost/compress/tree/master/gzhttp#gzip-handler) which allows wrapping HTTP servers and clients with GZIP compressors. * zstd: Detect short invalid signatures [#382](https://github.com/klauspost/compress/pull/382) * zstd: Spawn decoder goroutine only if needed. [#380](https://github.com/klauspost/compress/pull/380) +
+ +
+ See changes to v1.12.x + * May 25, 2021 (v1.12.3) * deflate: Better/faster Huffman encoding [#374](https://github.com/klauspost/compress/pull/374) * deflate: Allocate less for history. [#375](https://github.com/klauspost/compress/pull/375) @@ -74,9 +202,10 @@ This package provides various compression algorithms. * s2c/s2d/s2sx: Always truncate when writing files [#352](https://github.com/klauspost/compress/pull/352) * zstd: Reduce memory usage further when using [WithLowerEncoderMem](https://pkg.go.dev/github.com/klauspost/compress/zstd#WithLowerEncoderMem) [#346](https://github.com/klauspost/compress/pull/346) * s2: Fix potential problem with amd64 assembly and profilers [#349](https://github.com/klauspost/compress/pull/349) +
- See changes prior to v1.12.1 + See changes to v1.11.x * Mar 26, 2021 (v1.11.13) * zstd: Big speedup on small dictionary encodes [#344](https://github.com/klauspost/compress/pull/344) [#345](https://github.com/klauspost/compress/pull/345) @@ -135,7 +264,7 @@ This package provides various compression algorithms.
- See changes prior to v1.11.0 + See changes to v1.10.x * July 8, 2020 (v1.10.11) * zstd: Fix extra block when compressing with ReadFrom. [#278](https://github.com/klauspost/compress/pull/278) @@ -297,11 +426,6 @@ This package provides various compression algorithms. # deflate usage -* [High Throughput Benchmark](http://blog.klauspost.com/go-gzipdeflate-benchmarks/). -* [Small Payload/Webserver Benchmarks](http://blog.klauspost.com/gzip-performance-for-go-webservers/). -* [Linear Time Compression](http://blog.klauspost.com/constant-time-gzipzip-compression/). -* [Re-balancing Deflate Compression Levels](https://blog.klauspost.com/rebalancing-deflate-compression-levels/) - The packages are drop-in replacements for standard libraries. Simply replace the import path to use them: | old import | new import | Documentation @@ -323,6 +447,8 @@ Memory usage is typically 1MB for a Writer. stdlib is in the same range. If you expect to have a lot of concurrently allocated Writers consider using the stateless compress described below. +For compression performance, see: [this spreadsheet](https://docs.google.com/spreadsheets/d/1nuNE2nPfuINCZJRMt6wFWhKpToF95I47XjSsc-1rbPQ/edit?usp=sharing). + # Stateless compression This package offers stateless compression as a special option for gzip/deflate. diff --git a/vendor/github.com/klauspost/compress/flate/deflate.go b/vendor/github.com/klauspost/compress/flate/deflate.go index b27f5a93bc2b..07265ddede80 100644 --- a/vendor/github.com/klauspost/compress/flate/deflate.go +++ b/vendor/github.com/klauspost/compress/flate/deflate.go @@ -10,9 +10,6 @@ import ( "fmt" "io" "math" - "math/bits" - - comp "github.com/klauspost/compress" ) const ( @@ -76,8 +73,8 @@ var levels = []compressionLevel{ {0, 0, 0, 0, 0, 6}, // Levels 7-9 use increasingly more lazy matching // and increasingly stringent conditions for "good enough". - {6, 10, 12, 16, skipNever, 7}, - {10, 24, 32, 64, skipNever, 8}, + {8, 12, 16, 24, skipNever, 7}, + {16, 30, 40, 64, skipNever, 8}, {32, 258, 258, 1024, skipNever, 9}, } @@ -87,29 +84,29 @@ type advancedState struct { length int offset int maxInsertIndex int + chainHead int + hashOffset int - // Input hash chains - // hashHead[hashValue] contains the largest inputIndex with the specified hash value - // If hashHead[hashValue] is within the current window, then - // hashPrev[hashHead[hashValue] & windowMask] contains the previous index - // with the same hash value. - chainHead int - hashHead [hashSize]uint32 - hashPrev [windowSize]uint32 - hashOffset int + ii uint16 // position of last match, intended to overflow to reset. // input window: unprocessed data is window[index:windowEnd] index int estBitsPerByte int hashMatch [maxMatchLength + minMatchLength]uint32 - hash uint32 - ii uint16 // position of last match, intended to overflow to reset. + // Input hash chains + // hashHead[hashValue] contains the largest inputIndex with the specified hash value + // If hashHead[hashValue] is within the current window, then + // hashPrev[hashHead[hashValue] & windowMask] contains the previous index + // with the same hash value. 
+ hashHead [hashSize]uint32 + hashPrev [windowSize]uint32 } type compressor struct { compressionLevel + h *huffmanEncoder w *huffmanBitWriter // compression algorithm @@ -134,7 +131,8 @@ func (d *compressor) fillDeflate(b []byte) int { s := d.state if s.index >= 2*windowSize-(minMatchLength+maxMatchLength) { // shift the window by windowSize - copy(d.window[:], d.window[windowSize:2*windowSize]) + //copy(d.window[:], d.window[windowSize:2*windowSize]) + *(*[windowSize]byte)(d.window) = *(*[windowSize]byte)(d.window[windowSize:]) s.index -= windowSize d.windowEnd -= windowSize if d.blockStart >= windowSize { @@ -261,7 +259,6 @@ func (d *compressor) fillWindow(b []byte) { // Set the head of the hash chain to us. s.hashHead[newH] = uint32(di + s.hashOffset) } - s.hash = newH } // Update window information. d.windowEnd += n @@ -271,7 +268,7 @@ func (d *compressor) fillWindow(b []byte) { // Try to find a match starting at index whose length is greater than prevSize. // We only look at chainCount possibilities before giving up. // pos = s.index, prevHead = s.chainHead-s.hashOffset, prevLength=minMatchLength-1, lookahead -func (d *compressor) findMatch(pos int, prevHead int, lookahead, bpb int) (length, offset int, ok bool) { +func (d *compressor) findMatch(pos int, prevHead int, lookahead int) (length, offset int, ok bool) { minMatchLook := maxMatchLength if lookahead < minMatchLook { minMatchLook = lookahead @@ -297,14 +294,46 @@ func (d *compressor) findMatch(pos int, prevHead int, lookahead, bpb int) (lengt } offset = 0 + cGain := 0 + if d.chain < 100 { + for i := prevHead; tries > 0; tries-- { + if wEnd == win[i+length] { + n := matchLen(win[i:i+minMatchLook], wPos) + if n > length { + length = n + offset = pos - i + ok = true + if n >= nice { + // The match is good enough that we don't try to find a better one. + break + } + wEnd = win[pos+n] + } + } + if i <= minIndex { + // hashPrev[i & windowMask] has already been overwritten, so stop now. + break + } + i = int(d.state.hashPrev[i&windowMask]) - d.state.hashOffset + if i < minIndex { + break + } + } + return + } + + // Some like it higher (CSV), some like it lower (JSON) + const baseCost = 6 // Base is 4 bytes at with an additional cost. // Matches must be better than this. - cGain := minMatchLength*bpb - 12 for i := prevHead; tries > 0; tries-- { if wEnd == win[i+length] { n := matchLen(win[i:i+minMatchLook], wPos) if n > length { - newGain := n*bpb - bits.Len32(uint32(pos-i)) + // Calculate gain. Estimate + newGain := d.h.bitLengthRaw(wPos[:n]) - int(offsetExtraBits[offsetCode(uint32(pos-i))]) - baseCost - int(lengthExtraBits[lengthCodes[(n-3)&255]]) + + //fmt.Println(n, "gain:", newGain, "prev:", cGain, "raw:", d.h.bitLengthRaw(wPos[:n])) if newGain > cGain { length = n offset = pos - i @@ -345,6 +374,12 @@ func hash4(b []byte) uint32 { return hash4u(binary.LittleEndian.Uint32(b), hashBits) } +// hash4 returns the hash of u to fit in a hash table with h bits. +// Preferably h should be a constant and should always be <32. 
+func hash4u(u uint32, h uint8) uint32 { + return (u * prime4bytes) >> (32 - h) +} + // bulkHash4 will compute hashes using the same // algorithm as hash4 func bulkHash4(b []byte, dst []uint32) { @@ -373,7 +408,6 @@ func (d *compressor) initDeflate() { s.hashOffset = 1 s.length = minMatchLength - 1 s.offset = 0 - s.hash = 0 s.chainHead = -1 } @@ -389,16 +423,19 @@ func (d *compressor) deflateLazy() { if d.windowEnd-s.index < minMatchLength+maxMatchLength && !d.sync { return } - s.estBitsPerByte = 8 - if !d.sync { - s.estBitsPerByte = comp.ShannonEntropyBits(d.window[s.index:d.windowEnd]) - s.estBitsPerByte = int(1 + float64(s.estBitsPerByte)/float64(d.windowEnd-s.index)) + if d.windowEnd != s.index && d.chain > 100 { + // Get literal huffman coder. + if d.h == nil { + d.h = newHuffmanEncoder(maxFlateBlockTokens) + } + var tmp [256]uint16 + for _, v := range d.window[s.index:d.windowEnd] { + tmp[v]++ + } + d.h.generate(tmp[:], 15) } s.maxInsertIndex = d.windowEnd - (minMatchLength - 1) - if s.index < s.maxInsertIndex { - s.hash = hash4(d.window[s.index:]) - } for { if sanity && s.index > d.windowEnd { @@ -430,11 +467,11 @@ func (d *compressor) deflateLazy() { } if s.index < s.maxInsertIndex { // Update the hash - s.hash = hash4(d.window[s.index:]) - ch := s.hashHead[s.hash&hashMask] + hash := hash4(d.window[s.index:]) + ch := s.hashHead[hash] s.chainHead = int(ch) s.hashPrev[s.index&windowMask] = ch - s.hashHead[s.hash&hashMask] = uint32(s.index + s.hashOffset) + s.hashHead[hash] = uint32(s.index + s.hashOffset) } prevLength := s.length prevOffset := s.offset @@ -446,7 +483,7 @@ func (d *compressor) deflateLazy() { } if s.chainHead-s.hashOffset >= minIndex && lookahead > prevLength && prevLength < d.lazy { - if newLength, newOffset, ok := d.findMatch(s.index, s.chainHead-s.hashOffset, lookahead, s.estBitsPerByte); ok { + if newLength, newOffset, ok := d.findMatch(s.index, s.chainHead-s.hashOffset, lookahead); ok { s.length = newLength s.offset = newOffset } @@ -467,7 +504,7 @@ func (d *compressor) deflateLazy() { end += prevIndex idx := prevIndex + prevLength - (4 - checkOff) h := hash4(d.window[idx:]) - ch2 := int(s.hashHead[h&hashMask]) - s.hashOffset - prevLength + (4 - checkOff) + ch2 := int(s.hashHead[h]) - s.hashOffset - prevLength + (4 - checkOff) if ch2 > minIndex { length := matchLen(d.window[prevIndex:end], d.window[ch2:]) // It seems like a pure length metric is best. @@ -511,7 +548,6 @@ func (d *compressor) deflateLazy() { // Set the head of the hash chain to us. s.hashHead[newH] = uint32(di + s.hashOffset) } - s.hash = newH } s.index = newIndex @@ -757,7 +793,6 @@ func (d *compressor) reset(w io.Writer) { d.tokens.Reset() s.length = minMatchLength - 1 s.offset = 0 - s.hash = 0 s.ii = 0 s.maxInsertIndex = 0 } diff --git a/vendor/github.com/klauspost/compress/flate/dict_decoder.go b/vendor/github.com/klauspost/compress/flate/dict_decoder.go index 71c75a065ea7..bb36351a5af3 100644 --- a/vendor/github.com/klauspost/compress/flate/dict_decoder.go +++ b/vendor/github.com/klauspost/compress/flate/dict_decoder.go @@ -7,19 +7,19 @@ package flate // dictDecoder implements the LZ77 sliding dictionary as used in decompression. // LZ77 decompresses data through sequences of two forms of commands: // -// * Literal insertions: Runs of one or more symbols are inserted into the data -// stream as is. This is accomplished through the writeByte method for a -// single symbol, or combinations of writeSlice/writeMark for multiple symbols. 
-// Any valid stream must start with a literal insertion if no preset dictionary -// is used. +// - Literal insertions: Runs of one or more symbols are inserted into the data +// stream as is. This is accomplished through the writeByte method for a +// single symbol, or combinations of writeSlice/writeMark for multiple symbols. +// Any valid stream must start with a literal insertion if no preset dictionary +// is used. // -// * Backward copies: Runs of one or more symbols are copied from previously -// emitted data. Backward copies come as the tuple (dist, length) where dist -// determines how far back in the stream to copy from and length determines how -// many bytes to copy. Note that it is valid for the length to be greater than -// the distance. Since LZ77 uses forward copies, that situation is used to -// perform a form of run-length encoding on repeated runs of symbols. -// The writeCopy and tryWriteCopy are used to implement this command. +// - Backward copies: Runs of one or more symbols are copied from previously +// emitted data. Backward copies come as the tuple (dist, length) where dist +// determines how far back in the stream to copy from and length determines how +// many bytes to copy. Note that it is valid for the length to be greater than +// the distance. Since LZ77 uses forward copies, that situation is used to +// perform a form of run-length encoding on repeated runs of symbols. +// The writeCopy and tryWriteCopy are used to implement this command. // // For performance reasons, this implementation performs little to no sanity // checks about the arguments. As such, the invariants documented for each diff --git a/vendor/github.com/klauspost/compress/flate/fast_encoder.go b/vendor/github.com/klauspost/compress/flate/fast_encoder.go index 0b2e54972cdd..24caf5f70b00 100644 --- a/vendor/github.com/klauspost/compress/flate/fast_encoder.go +++ b/vendor/github.com/klauspost/compress/flate/fast_encoder.go @@ -58,17 +58,6 @@ const ( prime8bytes = 0xcf1bbcdcb7a56463 ) -func load32(b []byte, i int) uint32 { - // Help the compiler eliminate bounds checks on the read so it can be done in a single read. - b = b[i:] - b = b[:4] - return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24 -} - -func load64(b []byte, i int) uint64 { - return binary.LittleEndian.Uint64(b[i:]) -} - func load3232(b []byte, i int32) uint32 { return binary.LittleEndian.Uint32(b[i:]) } @@ -77,10 +66,6 @@ func load6432(b []byte, i int32) uint64 { return binary.LittleEndian.Uint64(b[i:]) } -func hash(u uint32) uint32 { - return (u * 0x1e35a7bd) >> tableShift -} - type tableEntry struct { offset int32 } @@ -104,7 +89,8 @@ func (e *fastGen) addBlock(src []byte) int32 { } // Move down offset := int32(len(e.hist)) - maxMatchOffset - copy(e.hist[0:maxMatchOffset], e.hist[offset:]) + // copy(e.hist[0:maxMatchOffset], e.hist[offset:]) + *(*[maxMatchOffset]byte)(e.hist) = *(*[maxMatchOffset]byte)(e.hist[offset:]) e.cur += offset e.hist = e.hist[:maxMatchOffset] } @@ -114,39 +100,36 @@ func (e *fastGen) addBlock(src []byte) int32 { return s } -// hash4 returns the hash of u to fit in a hash table with h bits. -// Preferably h should be a constant and should always be <32. -func hash4u(u uint32, h uint8) uint32 { - return (u * prime4bytes) >> ((32 - h) & reg8SizeMask32) -} - type tableEntryPrev struct { Cur tableEntry Prev tableEntry } -// hash4x64 returns the hash of the lowest 4 bytes of u to fit in a hash table with h bits. -// Preferably h should be a constant and should always be <32. 
-func hash4x64(u uint64, h uint8) uint32 { - return (uint32(u) * prime4bytes) >> ((32 - h) & reg8SizeMask32) -} - // hash7 returns the hash of the lowest 7 bytes of u to fit in a hash table with h bits. // Preferably h should be a constant and should always be <64. func hash7(u uint64, h uint8) uint32 { return uint32(((u << (64 - 56)) * prime7bytes) >> ((64 - h) & reg8SizeMask64)) } -// hash8 returns the hash of u to fit in a hash table with h bits. -// Preferably h should be a constant and should always be <64. -func hash8(u uint64, h uint8) uint32 { - return uint32((u * prime8bytes) >> ((64 - h) & reg8SizeMask64)) -} - -// hash6 returns the hash of the lowest 6 bytes of u to fit in a hash table with h bits. -// Preferably h should be a constant and should always be <64. -func hash6(u uint64, h uint8) uint32 { - return uint32(((u << (64 - 48)) * prime6bytes) >> ((64 - h) & reg8SizeMask64)) +// hashLen returns a hash of the lowest mls bytes of with length output bits. +// mls must be >=3 and <=8. Any other value will return hash for 4 bytes. +// length should always be < 32. +// Preferably length and mls should be a constant for inlining. +func hashLen(u uint64, length, mls uint8) uint32 { + switch mls { + case 3: + return (uint32(u<<8) * prime3bytes) >> (32 - length) + case 5: + return uint32(((u << (64 - 40)) * prime5bytes) >> (64 - length)) + case 6: + return uint32(((u << (64 - 48)) * prime6bytes) >> (64 - length)) + case 7: + return uint32(((u << (64 - 56)) * prime7bytes) >> (64 - length)) + case 8: + return uint32((u * prime8bytes) >> (64 - length)) + default: + return (uint32(u) * prime4bytes) >> (32 - length) + } } // matchlen will return the match length between offsets and t in src. @@ -179,7 +162,7 @@ func (e *fastGen) matchlen(s, t int32, src []byte) int32 { // matchlenLong will return the match length between offsets and t in src. // It is assumed that s > t, that t >=0 and s < len(src). func (e *fastGen) matchlenLong(s, t int32, src []byte) int32 { - if debugDecode { + if debugDeflate { if t >= s { panic(fmt.Sprint("t >=s:", t, s)) } diff --git a/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go b/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go index fb1701eeccea..89a5dd89f983 100644 --- a/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go +++ b/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go @@ -8,6 +8,7 @@ import ( "encoding/binary" "fmt" "io" + "math" ) const ( @@ -24,6 +25,10 @@ const ( codegenCodeCount = 19 badCode = 255 + // maxPredefinedTokens is the maximum number of tokens + // where we check if fixed size is smaller. + maxPredefinedTokens = 250 + // bufferFlushSize indicates the buffer size // after which bytes are flushed to the writer. // Should preferably be a multiple of 6, since @@ -36,8 +41,11 @@ const ( bufferSize = bufferFlushSize + 8 ) +// Minimum length code that emits bits. +const lengthExtraBitsMinCode = 8 + // The number of extra bits needed by length code X - LENGTH_CODES_START. -var lengthExtraBits = [32]int8{ +var lengthExtraBits = [32]uint8{ /* 257 */ 0, 0, 0, /* 260 */ 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, /* 270 */ 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, @@ -51,19 +59,22 @@ var lengthBase = [32]uint8{ 64, 80, 96, 112, 128, 160, 192, 224, 255, } +// Minimum offset code that emits bits. +const offsetExtraBitsMinCode = 4 + // offset code word extra bits. 
-var offsetExtraBits = [64]int8{ +var offsetExtraBits = [32]int8{ 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, /* extended window */ - 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, + 14, 14, } var offsetCombined = [32]uint32{} func init() { - var offsetBase = [64]uint32{ + var offsetBase = [32]uint32{ /* normal deflate */ 0x000000, 0x000001, 0x000002, 0x000003, 0x000004, 0x000006, 0x000008, 0x00000c, 0x000010, 0x000018, @@ -73,17 +84,15 @@ func init() { 0x001800, 0x002000, 0x003000, 0x004000, 0x006000, /* extended window */ - 0x008000, 0x00c000, 0x010000, 0x018000, 0x020000, - 0x030000, 0x040000, 0x060000, 0x080000, 0x0c0000, - 0x100000, 0x180000, 0x200000, 0x300000, + 0x008000, 0x00c000, } for i := range offsetCombined[:] { // Don't use extended window values... - if offsetBase[i] > 0x006000 { + if offsetExtraBits[i] == 0 || offsetBase[i] > 0x006000 { continue } - offsetCombined[i] = uint32(offsetExtraBits[i])<<16 | (offsetBase[i]) + offsetCombined[i] = uint32(offsetExtraBits[i]) | (offsetBase[i] << 8) } } @@ -99,7 +108,7 @@ type huffmanBitWriter struct { // Data waiting to be written is bytes[0:nbytes] // and then the low nbits of bits. bits uint64 - nbits uint16 + nbits uint8 nbytes uint8 lastHuffMan bool literalEncoding *huffmanEncoder @@ -160,7 +169,7 @@ func (w *huffmanBitWriter) canReuse(t *tokens) (ok bool) { b := w.offsetEncoding.codes b = b[:len(a)] for i, v := range a { - if v != 0 && b[i].len == 0 { + if v != 0 && b[i].zero() { return false } } @@ -169,7 +178,7 @@ func (w *huffmanBitWriter) canReuse(t *tokens) (ok bool) { b = w.literalEncoding.codes[256:literalCount] b = b[:len(a)] for i, v := range a { - if v != 0 && b[i].len == 0 { + if v != 0 && b[i].zero() { return false } } @@ -177,7 +186,7 @@ func (w *huffmanBitWriter) canReuse(t *tokens) (ok bool) { a = t.litHist[:256] b = w.literalEncoding.codes[:len(a)] for i, v := range a { - if v != 0 && b[i].len == 0 { + if v != 0 && b[i].zero() { return false } } @@ -217,7 +226,7 @@ func (w *huffmanBitWriter) write(b []byte) { _, w.err = w.writer.Write(b) } -func (w *huffmanBitWriter) writeBits(b int32, nb uint16) { +func (w *huffmanBitWriter) writeBits(b int32, nb uint8) { w.bits |= uint64(b) << (w.nbits & 63) w.nbits += nb if w.nbits >= 48 { @@ -256,9 +265,9 @@ func (w *huffmanBitWriter) writeBytes(bytes []byte) { // Codes 0-15 are single byte codes. Codes 16-18 are followed by additional // information. Code badCode is an end marker // -// numLiterals The number of literals in literalEncoding -// numOffsets The number of offsets in offsetEncoding -// litenc, offenc The literal and offset encoder to use +// numLiterals The number of literals in literalEncoding +// numOffsets The number of offsets in offsetEncoding +// litenc, offenc The literal and offset encoder to use func (w *huffmanBitWriter) generateCodegen(numLiterals int, numOffsets int, litEnc, offEnc *huffmanEncoder) { for i := range w.codegenFreq { w.codegenFreq[i] = 0 @@ -271,12 +280,12 @@ func (w *huffmanBitWriter) generateCodegen(numLiterals int, numOffsets int, litE // Copy the concatenated code sizes to codegen. Put a marker at the end. 
cgnl := codegen[:numLiterals] for i := range cgnl { - cgnl[i] = uint8(litEnc.codes[i].len) + cgnl[i] = litEnc.codes[i].len() } cgnl = codegen[numLiterals : numLiterals+numOffsets] for i := range cgnl { - cgnl[i] = uint8(offEnc.codes[i].len) + cgnl[i] = offEnc.codes[i].len() } codegen[numLiterals+numOffsets] = badCode @@ -419,8 +428,8 @@ func (w *huffmanBitWriter) storedSize(in []byte) (int, bool) { func (w *huffmanBitWriter) writeCode(c hcode) { // The function does not get inlined if we "& 63" the shift. - w.bits |= uint64(c.code) << (w.nbits & 63) - w.nbits += c.len + w.bits |= c.code64() << (w.nbits & 63) + w.nbits += c.len() if w.nbits >= 48 { w.writeOutBits() } @@ -451,9 +460,9 @@ func (w *huffmanBitWriter) writeOutBits() { // Write the header of a dynamic Huffman block to the output stream. // -// numLiterals The number of literals specified in codegen -// numOffsets The number of offsets specified in codegen -// numCodegens The number of codegens used in codegen +// numLiterals The number of literals specified in codegen +// numOffsets The number of offsets specified in codegen +// numCodegens The number of codegens used in codegen func (w *huffmanBitWriter) writeDynamicHeader(numLiterals int, numOffsets int, numCodegens int, isEof bool) { if w.err != nil { return @@ -468,7 +477,7 @@ func (w *huffmanBitWriter) writeDynamicHeader(numLiterals int, numOffsets int, n w.writeBits(int32(numCodegens-4), 4) for i := 0; i < numCodegens; i++ { - value := uint(w.codegenEncoding.codes[codegenOrder[i]].len) + value := uint(w.codegenEncoding.codes[codegenOrder[i]].len()) w.writeBits(int32(value), 3) } @@ -573,7 +582,10 @@ func (w *huffmanBitWriter) writeBlock(tokens *tokens, eof bool, input []byte) { // Fixed Huffman baseline. var literalEncoding = fixedLiteralEncoding var offsetEncoding = fixedOffsetEncoding - var size = w.fixedSize(extraBits) + var size = math.MaxInt32 + if tokens.n < maxPredefinedTokens { + size = w.fixedSize(extraBits) + } // Dynamic Huffman? var numCodegens int @@ -658,7 +670,7 @@ func (w *huffmanBitWriter) writeBlockDynamic(tokens *tokens, eof bool, input []b // Estimate size for using a new table. // Use the previous header size as the best estimate. newSize := w.lastHeader + tokens.EstimatedBits() - newSize += int(w.literalEncoding.codes[endBlockMarker].len) + newSize>>w.logNewTablePenalty + newSize += int(w.literalEncoding.codes[endBlockMarker].len()) + newSize>>w.logNewTablePenalty // The estimated size is calculated as an optimal table. // We add a penalty to make it more realistic and re-use a bit more. @@ -674,19 +686,21 @@ func (w *huffmanBitWriter) writeBlockDynamic(tokens *tokens, eof bool, input []b size = reuseSize } - if preSize := w.fixedSize(extraBits) + 7; usePrefs && preSize < size { - // Check if we get a reasonable size decrease. - if storable && ssize <= size { - w.writeStoredHeader(len(input), eof) - w.writeBytes(input) + if tokens.n < maxPredefinedTokens { + if preSize := w.fixedSize(extraBits) + 7; usePrefs && preSize < size { + // Check if we get a reasonable size decrease. + if storable && ssize <= size { + w.writeStoredHeader(len(input), eof) + w.writeBytes(input) + return + } + w.writeFixedHeader(eof) + if !sync { + tokens.AddEOB() + } + w.writeTokens(tokens.Slice(), fixedLiteralEncoding.codes, fixedOffsetEncoding.codes) return } - w.writeFixedHeader(eof) - if !sync { - tokens.AddEOB() - } - w.writeTokens(tokens.Slice(), fixedLiteralEncoding.codes, fixedOffsetEncoding.codes) - return } // Check if we get a reasonable size decrease. 
if storable && ssize <= size { @@ -719,19 +733,21 @@ func (w *huffmanBitWriter) writeBlockDynamic(tokens *tokens, eof bool, input []b size, numCodegens = w.dynamicSize(w.literalEncoding, w.offsetEncoding, extraBits) // Store predefined, if we don't get a reasonable improvement. - if preSize := w.fixedSize(extraBits); usePrefs && preSize <= size { - // Store bytes, if we don't get an improvement. - if storable && ssize <= preSize { - w.writeStoredHeader(len(input), eof) - w.writeBytes(input) + if tokens.n < maxPredefinedTokens { + if preSize := w.fixedSize(extraBits); usePrefs && preSize <= size { + // Store bytes, if we don't get an improvement. + if storable && ssize <= preSize { + w.writeStoredHeader(len(input), eof) + w.writeBytes(input) + return + } + w.writeFixedHeader(eof) + if !sync { + tokens.AddEOB() + } + w.writeTokens(tokens.Slice(), fixedLiteralEncoding.codes, fixedOffsetEncoding.codes) return } - w.writeFixedHeader(eof) - if !sync { - tokens.AddEOB() - } - w.writeTokens(tokens.Slice(), fixedLiteralEncoding.codes, fixedOffsetEncoding.codes) - return } if storable && ssize <= size { @@ -774,9 +790,11 @@ func (w *huffmanBitWriter) fillTokens() { // and offsetEncoding. // The number of literal and offset tokens is returned. func (w *huffmanBitWriter) indexTokens(t *tokens, filled bool) (numLiterals, numOffsets int) { - copy(w.literalFreq[:], t.litHist[:]) - copy(w.literalFreq[256:], t.extraHist[:]) - copy(w.offsetFreq[:], t.offHist[:offsetCodeCount]) + //copy(w.literalFreq[:], t.litHist[:]) + *(*[256]uint16)(w.literalFreq[:]) = t.litHist + //copy(w.literalFreq[256:], t.extraHist[:]) + *(*[32]uint16)(w.literalFreq[256:]) = t.extraHist + w.offsetFreq = t.offHist if t.n == 0 { return @@ -835,11 +853,11 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode) bits, nbits, nbytes := w.bits, w.nbits, w.nbytes for _, t := range tokens { - if t < matchType { + if t < 256 { //w.writeCode(lits[t.literal()]) - c := lits[t.literal()] - bits |= uint64(c.code) << (nbits & 63) - nbits += c.len + c := lits[t] + bits |= c.code64() << (nbits & 63) + nbits += c.len() if nbits >= 48 { binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits) //*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits @@ -860,14 +878,14 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode) // Write the length length := t.length() - lengthCode := lengthCode(length) + lengthCode := lengthCode(length) & 31 if false { - w.writeCode(lengths[lengthCode&31]) + w.writeCode(lengths[lengthCode]) } else { // inlined - c := lengths[lengthCode&31] - bits |= uint64(c.code) << (nbits & 63) - nbits += c.len + c := lengths[lengthCode] + bits |= c.code64() << (nbits & 63) + nbits += c.len() if nbits >= 48 { binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits) //*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits @@ -885,10 +903,10 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode) } } - extraLengthBits := uint16(lengthExtraBits[lengthCode&31]) - if extraLengthBits > 0 { + if lengthCode >= lengthExtraBitsMinCode { + extraLengthBits := lengthExtraBits[lengthCode] //w.writeBits(extraLength, extraLengthBits) - extraLength := int32(length - lengthBase[lengthCode&31]) + extraLength := int32(length - lengthBase[lengthCode]) bits |= uint64(extraLength) << (nbits & 63) nbits += extraLengthBits if nbits >= 48 { @@ -909,15 +927,14 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode) } // Write the offset offset := t.offset() - 
offsetCode := offset >> 16 - offset &= matchOffsetOnlyMask + offsetCode := (offset >> 16) & 31 if false { - w.writeCode(offs[offsetCode&31]) + w.writeCode(offs[offsetCode]) } else { // inlined c := offs[offsetCode] - bits |= uint64(c.code) << (nbits & 63) - nbits += c.len + bits |= c.code64() << (nbits & 63) + nbits += c.len() if nbits >= 48 { binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits) //*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits @@ -934,11 +951,12 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode) } } } - offsetComb := offsetCombined[offsetCode] - if offsetComb > 1<<16 { + + if offsetCode >= offsetExtraBitsMinCode { + offsetComb := offsetCombined[offsetCode] //w.writeBits(extraOffset, extraOffsetBits) - bits |= uint64(offset-(offsetComb&0xffff)) << (nbits & 63) - nbits += uint16(offsetComb >> 16) + bits |= uint64((offset-(offsetComb>>8))&matchOffsetOnlyMask) << (nbits & 63) + nbits += uint8(offsetComb) if nbits >= 48 { binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits) //*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits @@ -993,8 +1011,6 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) { } } - // Fill is rarely better... - const fill = false const numLiterals = endBlockMarker + 1 const numOffsets = 1 @@ -1003,26 +1019,46 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) { // Assume header is around 70 bytes: // https://stackoverflow.com/a/25454430 const guessHeaderSizeBits = 70 * 8 - histogram(input, w.literalFreq[:numLiterals], fill) - w.literalFreq[endBlockMarker] = 1 - w.tmpLitEncoding.generate(w.literalFreq[:numLiterals], 15) - if fill { - // Clear fill... - for i := range w.literalFreq[:numLiterals] { - w.literalFreq[i] = 0 + histogram(input, w.literalFreq[:numLiterals]) + ssize, storable := w.storedSize(input) + if storable && len(input) > 1024 { + // Quick check for incompressible content. + abs := float64(0) + avg := float64(len(input)) / 256 + max := float64(len(input) * 2) + for _, v := range w.literalFreq[:256] { + diff := float64(v) - avg + abs += diff * diff + if abs > max { + break + } + } + if abs < max { + if debugDeflate { + fmt.Println("stored", abs, "<", max) + } + // No chance we can compress this... + w.writeStoredHeader(len(input), eof) + w.writeBytes(input) + return } - histogram(input, w.literalFreq[:numLiterals], false) } + w.literalFreq[endBlockMarker] = 1 + w.tmpLitEncoding.generate(w.literalFreq[:numLiterals], 15) estBits := w.tmpLitEncoding.canReuseBits(w.literalFreq[:numLiterals]) - estBits += w.lastHeader - if w.lastHeader == 0 { - estBits += guessHeaderSizeBits + if estBits < math.MaxInt32 { + estBits += w.lastHeader + if w.lastHeader == 0 { + estBits += guessHeaderSizeBits + } + estBits += estBits >> w.logNewTablePenalty } - estBits += estBits >> w.logNewTablePenalty // Store bytes, if we don't get a reasonable improvement. 
- ssize, storable := w.storedSize(input) if storable && ssize <= estBits { + if debugDeflate { + fmt.Println("stored,", ssize, "<=", estBits) + } w.writeStoredHeader(len(input), eof) w.writeBytes(input) return @@ -1033,7 +1069,7 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) { if estBits < reuseSize { if debugDeflate { - //fmt.Println("not reusing, reuse:", reuseSize/8, "> new:", estBits/8, "- header est:", w.lastHeader/8) + fmt.Println("NOT reusing, reuse:", reuseSize/8, "> new:", estBits/8, "header est:", w.lastHeader/8, "bytes") } // We owe an EOB w.writeCode(w.literalEncoding.codes[endBlockMarker]) @@ -1067,6 +1103,9 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) { // Go 1.16 LOVES having these on stack. At least 1.5x the speed. bits, nbits, nbytes := w.bits, w.nbits, w.nbytes + if debugDeflate { + count -= int(nbytes)*8 + int(nbits) + } // Unroll, write 3 codes/loop. // Fastest number of unrolls. for len(input) > 3 { @@ -1076,35 +1115,31 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) { binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits) bits >>= (n * 8) & 63 nbits -= n * 8 - nbytes += uint8(n) + nbytes += n } if nbytes >= bufferFlushSize { if w.err != nil { nbytes = 0 return } + if debugDeflate { + count += int(nbytes) * 8 + } _, w.err = w.writer.Write(w.bytes[:nbytes]) nbytes = 0 } a, b := encoding[input[0]], encoding[input[1]] - bits |= uint64(a.code) << (nbits & 63) - bits |= uint64(b.code) << ((nbits + a.len) & 63) + bits |= a.code64() << (nbits & 63) + bits |= b.code64() << ((nbits + a.len()) & 63) c := encoding[input[2]] - nbits += b.len + a.len - bits |= uint64(c.code) << (nbits & 63) - nbits += c.len + nbits += b.len() + a.len() + bits |= c.code64() << (nbits & 63) + nbits += c.len() input = input[3:] } // Remaining... for _, t := range input { - // Bitwriting inlined, ~30% speedup - c := encoding[t] - bits |= uint64(c.code) << (nbits & 63) - nbits += c.len - if debugDeflate { - count += int(c.len) - } if nbits >= 48 { binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits) //*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits @@ -1116,17 +1151,34 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) { nbytes = 0 return } + if debugDeflate { + count += int(nbytes) * 8 + } _, w.err = w.writer.Write(w.bytes[:nbytes]) nbytes = 0 } } + // Bitwriting inlined, ~30% speedup + c := encoding[t] + bits |= c.code64() << (nbits & 63) + + nbits += c.len() + if debugDeflate { + count += int(c.len()) + } } // Restore... w.bits, w.nbits, w.nbytes = bits, nbits, nbytes if debugDeflate { - fmt.Println("wrote", count/8, "bytes") + nb := count + int(nbytes)*8 + int(nbits) + fmt.Println("wrote", nb, "bits,", nb/8, "bytes.") + } + // Flush if needed to have space. + if w.nbits >= 48 { + w.writeOutBits() } + if eof || sync { w.writeCode(w.literalEncoding.codes[endBlockMarker]) w.lastHeader = 0 diff --git a/vendor/github.com/klauspost/compress/flate/huffman_code.go b/vendor/github.com/klauspost/compress/flate/huffman_code.go index 67b2b3872843..be7b58b473f9 100644 --- a/vendor/github.com/klauspost/compress/flate/huffman_code.go +++ b/vendor/github.com/klauspost/compress/flate/huffman_code.go @@ -16,8 +16,18 @@ const ( ) // hcode is a huffman code with a bit code and bit length. 
-type hcode struct { - code, len uint16 +type hcode uint32 + +func (h hcode) len() uint8 { + return uint8(h) +} + +func (h hcode) code64() uint64 { + return uint64(h >> 8) +} + +func (h hcode) zero() bool { + return h == 0 } type huffmanEncoder struct { @@ -56,9 +66,12 @@ type levelInfo struct { } // set sets the code and length of an hcode. -func (h *hcode) set(code uint16, length uint16) { - h.len = length - h.code = code +func (h *hcode) set(code uint16, length uint8) { + *h = hcode(length) | (hcode(code) << 8) +} + +func newhcode(code uint16, length uint8) hcode { + return hcode(length) | (hcode(code) << 8) } func reverseBits(number uint16, bitLength byte) uint16 { @@ -80,7 +93,7 @@ func generateFixedLiteralEncoding() *huffmanEncoder { var ch uint16 for ch = 0; ch < literalCount; ch++ { var bits uint16 - var size uint16 + var size uint8 switch { case ch < 144: // size 8, 000110000 .. 10111111 @@ -99,7 +112,7 @@ func generateFixedLiteralEncoding() *huffmanEncoder { bits = ch + 192 - 280 size = 8 } - codes[ch] = hcode{code: reverseBits(bits, byte(size)), len: size} + codes[ch] = newhcode(reverseBits(bits, size), size) } return h } @@ -108,7 +121,7 @@ func generateFixedOffsetEncoding() *huffmanEncoder { h := newHuffmanEncoder(30) codes := h.codes for ch := range codes { - codes[ch] = hcode{code: reverseBits(uint16(ch), 5), len: 5} + codes[ch] = newhcode(reverseBits(uint16(ch), 5), 5) } return h } @@ -120,7 +133,7 @@ func (h *huffmanEncoder) bitLength(freq []uint16) int { var total int for i, f := range freq { if f != 0 { - total += int(f) * int(h.codes[i].len) + total += int(f) * int(h.codes[i].len()) } } return total @@ -129,9 +142,7 @@ func (h *huffmanEncoder) bitLength(freq []uint16) int { func (h *huffmanEncoder) bitLengthRaw(b []byte) int { var total int for _, f := range b { - if f != 0 { - total += int(h.codes[f].len) - } + total += int(h.codes[f].len()) } return total } @@ -142,10 +153,10 @@ func (h *huffmanEncoder) canReuseBits(freq []uint16) int { for i, f := range freq { if f != 0 { code := h.codes[i] - if code.len == 0 { + if code.zero() { return math.MaxInt32 } - total += int(f) * int(code.len) + total += int(f) * int(code.len()) } } return total @@ -157,13 +168,18 @@ func (h *huffmanEncoder) canReuseBits(freq []uint16) int { // The cases of 0, 1, and 2 literals are handled by special case code. // // list An array of the literals with non-zero frequencies -// and their associated frequencies. The array is in order of increasing -// frequency, and has as its last element a special element with frequency -// MaxInt32 +// +// and their associated frequencies. The array is in order of increasing +// frequency, and has as its last element a special element with frequency +// MaxInt32 +// // maxBits The maximum number of bits that should be used to encode any literal. -// Must be less than 16. +// +// Must be less than 16. +// // return An integer array in which array[i] indicates the number of literals -// that should be encoded in i bits. +// +// that should be encoded in i bits. func (h *huffmanEncoder) bitCounts(list []literalNode, maxBits int32) []int32 { if maxBits >= maxBitsLimit { panic("flate: maxBits too large") @@ -189,14 +205,19 @@ func (h *huffmanEncoder) bitCounts(list []literalNode, maxBits int32) []int32 { // of the level j ancestor. var leafCounts [maxBitsLimit][maxBitsLimit]int32 + // Descending to only have 1 bounds check. 
+ l2f := int32(list[2].freq) + l1f := int32(list[1].freq) + l0f := int32(list[0].freq) + int32(list[1].freq) + for level := int32(1); level <= maxBits; level++ { // For every level, the first two items are the first two characters. // We initialize the levels as if we had already figured this out. levels[level] = levelInfo{ level: level, - lastFreq: int32(list[1].freq), - nextCharFreq: int32(list[2].freq), - nextPairFreq: int32(list[0].freq) + int32(list[1].freq), + lastFreq: l1f, + nextCharFreq: l2f, + nextPairFreq: l0f, } leafCounts[level][level] = 2 if level == 1 { @@ -207,8 +228,8 @@ func (h *huffmanEncoder) bitCounts(list []literalNode, maxBits int32) []int32 { // We need a total of 2*n - 2 items at top level and have already generated 2. levels[maxBits].needed = 2*n - 4 - level := maxBits - for { + level := uint32(maxBits) + for level < 16 { l := &levels[level] if l.nextPairFreq == math.MaxInt32 && l.nextCharFreq == math.MaxInt32 { // We've run out of both leafs and pairs. @@ -240,7 +261,13 @@ func (h *huffmanEncoder) bitCounts(list []literalNode, maxBits int32) []int32 { // more values in the level below l.lastFreq = l.nextPairFreq // Take leaf counts from the lower level, except counts[level] remains the same. - copy(leafCounts[level][:level], leafCounts[level-1][:level]) + if true { + save := leafCounts[level][level] + leafCounts[level] = leafCounts[level-1] + leafCounts[level][level] = save + } else { + copy(leafCounts[level][:level], leafCounts[level-1][:level]) + } levels[l.level-1].needed = 2 } @@ -298,7 +325,7 @@ func (h *huffmanEncoder) assignEncodingAndSize(bitCount []int32, list []literalN sortByLiteral(chunk) for _, node := range chunk { - h.codes[node.literal] = hcode{code: reverseBits(code, uint8(n)), len: uint16(n)} + h.codes[node.literal] = newhcode(reverseBits(code, uint8(n)), uint8(n)) code++ } list = list[0 : len(list)-int(bits)] @@ -311,6 +338,7 @@ func (h *huffmanEncoder) assignEncodingAndSize(bitCount []int32, list []literalN // maxBits The maximum number of bits to use for any literal. func (h *huffmanEncoder) generate(freq []uint16, maxBits int32) { list := h.freqcache[:len(freq)+1] + codes := h.codes[:len(freq)] // Number of non-zero literals count := 0 // Set list to be the set of all non-zero literals and their frequencies @@ -319,11 +347,10 @@ func (h *huffmanEncoder) generate(freq []uint16, maxBits int32) { list[count] = literalNode{uint16(i), f} count++ } else { - list[count] = literalNode{} - h.codes[i].len = 0 + codes[i] = 0 } } - list[len(freq)] = literalNode{} + list[count] = literalNode{} list = list[:count] if count <= 2 { @@ -354,21 +381,37 @@ func atLeastOne(v float32) float32 { return v } -// Unassigned values are assigned '1' in the histogram. -func fillHist(b []uint16) { - for i, v := range b { - if v == 0 { - b[i] = 1 +func histogram(b []byte, h []uint16) { + if true && len(b) >= 8<<10 { + // Split for bigger inputs + histogramSplit(b, h) + } else { + h = h[:256] + for _, t := range b { + h[t]++ } } } -func histogram(b []byte, h []uint16, fill bool) { +func histogramSplit(b []byte, h []uint16) { + // Tested, and slightly faster than 2-way. + // Writing to separate arrays and combining is also slightly slower. 
h = h[:256] - for _, t := range b { - h[t]++ + for len(b)&3 != 0 { + h[b[0]]++ + b = b[1:] } - if fill { - fillHist(h) + n := len(b) / 4 + x, y, z, w := b[:n], b[n:], b[n+n:], b[n+n+n:] + y, z, w = y[:len(x)], z[:len(x)], w[:len(x)] + for i, t := range x { + v0 := &h[t] + v1 := &h[y[i]] + v3 := &h[w[i]] + v2 := &h[z[i]] + *v0++ + *v1++ + *v2++ + *v3++ } } diff --git a/vendor/github.com/klauspost/compress/flate/inflate.go b/vendor/github.com/klauspost/compress/flate/inflate.go index d5f62f6a2ca4..414c0bea9fa9 100644 --- a/vendor/github.com/klauspost/compress/flate/inflate.go +++ b/vendor/github.com/klauspost/compress/flate/inflate.go @@ -36,6 +36,13 @@ type lengthExtra struct { var decCodeToLen = [32]lengthExtra{{length: 0x0, extra: 0x0}, {length: 0x1, extra: 0x0}, {length: 0x2, extra: 0x0}, {length: 0x3, extra: 0x0}, {length: 0x4, extra: 0x0}, {length: 0x5, extra: 0x0}, {length: 0x6, extra: 0x0}, {length: 0x7, extra: 0x0}, {length: 0x8, extra: 0x1}, {length: 0xa, extra: 0x1}, {length: 0xc, extra: 0x1}, {length: 0xe, extra: 0x1}, {length: 0x10, extra: 0x2}, {length: 0x14, extra: 0x2}, {length: 0x18, extra: 0x2}, {length: 0x1c, extra: 0x2}, {length: 0x20, extra: 0x3}, {length: 0x28, extra: 0x3}, {length: 0x30, extra: 0x3}, {length: 0x38, extra: 0x3}, {length: 0x40, extra: 0x4}, {length: 0x50, extra: 0x4}, {length: 0x60, extra: 0x4}, {length: 0x70, extra: 0x4}, {length: 0x80, extra: 0x5}, {length: 0xa0, extra: 0x5}, {length: 0xc0, extra: 0x5}, {length: 0xe0, extra: 0x5}, {length: 0xff, extra: 0x0}, {length: 0x0, extra: 0x0}, {length: 0x0, extra: 0x0}, {length: 0x0, extra: 0x0}} +var bitMask32 = [32]uint32{ + 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF, + 0x1FF, 0x3FF, 0x7FF, 0xFFF, 0x1FFF, 0x3FFF, 0x7FFF, 0xFFFF, + 0x1ffff, 0x3ffff, 0x7FFFF, 0xfFFFF, 0x1fFFFF, 0x3fFFFF, 0x7fFFFF, 0xffFFFF, + 0x1ffFFFF, 0x3ffFFFF, 0x7ffFFFF, 0xfffFFFF, 0x1fffFFFF, 0x3fffFFFF, 0x7fffFFFF, +} // up to 32 bits + // Initialize the fixedHuffmanDecoder only once upon first use. var fixedOnce sync.Once var fixedHuffmanDecoder huffmanDecoder @@ -559,221 +566,6 @@ func (f *decompressor) readHuffman() error { return nil } -// Decode a single Huffman block from f. -// hl and hd are the Huffman states for the lit/length values -// and the distance values, respectively. If hd == nil, using the -// fixed distance encoding associated with fixed Huffman blocks. -func (f *decompressor) huffmanBlockGeneric() { - const ( - stateInit = iota // Zero value must be stateInit - stateDict - ) - - switch f.stepState { - case stateInit: - goto readLiteral - case stateDict: - goto copyHistory - } - -readLiteral: - // Read literal and/or (length, distance) according to RFC section 3.2.3. - { - var v int - { - // Inlined v, err := f.huffSym(f.hl) - // Since a huffmanDecoder can be empty or be composed of a degenerate tree - // with single element, huffSym must error on these two edge cases. In both - // cases, the chunks slice will be 0 for the invalid sequence, leading it - // satisfy the n == 0 check below. - n := uint(f.hl.maxRead) - // Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers, - // but is smart enough to keep local variables in registers, so use nb and b, - // inline call to moreBits and reassign b,nb back to f on return. 
- nb, b := f.nb, f.b - for { - for nb < n { - c, err := f.r.ReadByte() - if err != nil { - f.b = b - f.nb = nb - f.err = noEOF(err) - return - } - f.roffset++ - b |= uint32(c) << (nb & regSizeMaskUint32) - nb += 8 - } - chunk := f.hl.chunks[b&(huffmanNumChunks-1)] - n = uint(chunk & huffmanCountMask) - if n > huffmanChunkBits { - chunk = f.hl.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hl.linkMask] - n = uint(chunk & huffmanCountMask) - } - if n <= nb { - if n == 0 { - f.b = b - f.nb = nb - if debugDecode { - fmt.Println("huffsym: n==0") - } - f.err = CorruptInputError(f.roffset) - return - } - f.b = b >> (n & regSizeMaskUint32) - f.nb = nb - n - v = int(chunk >> huffmanValueShift) - break - } - } - } - - var n uint // number of bits extra - var length int - var err error - switch { - case v < 256: - f.dict.writeByte(byte(v)) - if f.dict.availWrite() == 0 { - f.toRead = f.dict.readFlush() - f.step = (*decompressor).huffmanBlockGeneric - f.stepState = stateInit - return - } - goto readLiteral - case v == 256: - f.finishBlock() - return - // otherwise, reference to older data - case v < 265: - length = v - (257 - 3) - n = 0 - case v < 269: - length = v*2 - (265*2 - 11) - n = 1 - case v < 273: - length = v*4 - (269*4 - 19) - n = 2 - case v < 277: - length = v*8 - (273*8 - 35) - n = 3 - case v < 281: - length = v*16 - (277*16 - 67) - n = 4 - case v < 285: - length = v*32 - (281*32 - 131) - n = 5 - case v < maxNumLit: - length = 258 - n = 0 - default: - if debugDecode { - fmt.Println(v, ">= maxNumLit") - } - f.err = CorruptInputError(f.roffset) - return - } - if n > 0 { - for f.nb < n { - if err = f.moreBits(); err != nil { - if debugDecode { - fmt.Println("morebits n>0:", err) - } - f.err = err - return - } - } - length += int(f.b & uint32(1<<(n®SizeMaskUint32)-1)) - f.b >>= n & regSizeMaskUint32 - f.nb -= n - } - - var dist uint32 - if f.hd == nil { - for f.nb < 5 { - if err = f.moreBits(); err != nil { - if debugDecode { - fmt.Println("morebits f.nb<5:", err) - } - f.err = err - return - } - } - dist = uint32(bits.Reverse8(uint8(f.b & 0x1F << 3))) - f.b >>= 5 - f.nb -= 5 - } else { - sym, err := f.huffSym(f.hd) - if err != nil { - if debugDecode { - fmt.Println("huffsym:", err) - } - f.err = err - return - } - dist = uint32(sym) - } - - switch { - case dist < 4: - dist++ - case dist < maxNumDist: - nb := uint(dist-2) >> 1 - // have 1 bit in bottom of dist, need nb more. - extra := (dist & 1) << (nb & regSizeMaskUint32) - for f.nb < nb { - if err = f.moreBits(); err != nil { - if debugDecode { - fmt.Println("morebits f.nb>= nb & regSizeMaskUint32 - f.nb -= nb - dist = 1<<((nb+1)®SizeMaskUint32) + 1 + extra - default: - if debugDecode { - fmt.Println("dist too big:", dist, maxNumDist) - } - f.err = CorruptInputError(f.roffset) - return - } - - // No check on length; encoding can be prescient. - if dist > uint32(f.dict.histSize()) { - if debugDecode { - fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize()) - } - f.err = CorruptInputError(f.roffset) - return - } - - f.copyLen, f.copyDist = length, int(dist) - goto copyHistory - } - -copyHistory: - // Perform a backwards copy according to RFC section 3.2.3. 
- { - cnt := f.dict.tryWriteCopy(f.copyDist, f.copyLen) - if cnt == 0 { - cnt = f.dict.writeCopy(f.copyDist, f.copyLen) - } - f.copyLen -= cnt - - if f.dict.availWrite() == 0 || f.copyLen > 0 { - f.toRead = f.dict.readFlush() - f.step = (*decompressor).huffmanBlockGeneric // We need to continue this work - f.stepState = stateDict - return - } - goto readLiteral - } -} - // Copy a single uncompressed data block from input to output. func (f *decompressor) dataBlock() { // Uncompressed. diff --git a/vendor/github.com/klauspost/compress/flate/inflate_gen.go b/vendor/github.com/klauspost/compress/flate/inflate_gen.go index cc6db27925ce..61342b6b88fb 100644 --- a/vendor/github.com/klauspost/compress/flate/inflate_gen.go +++ b/vendor/github.com/klauspost/compress/flate/inflate_gen.go @@ -21,6 +21,11 @@ func (f *decompressor) huffmanBytesBuffer() { ) fr := f.r.(*bytes.Buffer) + // Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers, + // but is smart enough to keep local variables in registers, so use nb and b, + // inline call to moreBits and reassign b,nb back to f on return. + fnb, fb, dict := f.nb, f.b, &f.dict + switch f.stepState { case stateInit: goto readLiteral @@ -39,41 +44,35 @@ readLiteral: // cases, the chunks slice will be 0 for the invalid sequence, leading it // satisfy the n == 0 check below. n := uint(f.hl.maxRead) - // Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers, - // but is smart enough to keep local variables in registers, so use nb and b, - // inline call to moreBits and reassign b,nb back to f on return. - nb, b := f.nb, f.b for { - for nb < n { + for fnb < n { c, err := fr.ReadByte() if err != nil { - f.b = b - f.nb = nb + f.b, f.nb = fb, fnb f.err = noEOF(err) return } f.roffset++ - b |= uint32(c) << (nb & regSizeMaskUint32) - nb += 8 + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 } - chunk := f.hl.chunks[b&(huffmanNumChunks-1)] + chunk := f.hl.chunks[fb&(huffmanNumChunks-1)] n = uint(chunk & huffmanCountMask) if n > huffmanChunkBits { - chunk = f.hl.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hl.linkMask] + chunk = f.hl.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hl.linkMask] n = uint(chunk & huffmanCountMask) } - if n <= nb { + if n <= fnb { if n == 0 { - f.b = b - f.nb = nb + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("huffsym: n==0") } f.err = CorruptInputError(f.roffset) return } - f.b = b >> (n & regSizeMaskUint32) - f.nb = nb - n + fb = fb >> (n & regSizeMaskUint32) + fnb = fnb - n v = int(chunk >> huffmanValueShift) break } @@ -83,15 +82,17 @@ readLiteral: var length int switch { case v < 256: - f.dict.writeByte(byte(v)) - if f.dict.availWrite() == 0 { - f.toRead = f.dict.readFlush() + dict.writeByte(byte(v)) + if dict.availWrite() == 0 { + f.toRead = dict.readFlush() f.step = (*decompressor).huffmanBytesBuffer f.stepState = stateInit + f.b, f.nb = fb, fnb return } goto readLiteral case v == 256: + f.b, f.nb = fb, fnb f.finishBlock() return // otherwise, reference to older data @@ -101,9 +102,10 @@ readLiteral: val := decCodeToLen[(v - 257)] length = int(val.length) + 3 n := uint(val.extra) - for f.nb < n { + for fnb < n { c, err := fr.ReadByte() if err != nil { + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("morebits n>0:", err) } @@ -111,25 +113,27 @@ readLiteral: return } f.roffset++ - f.b |= uint32(c) << f.nb - f.nb += 8 + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 } - length += int(f.b & uint32(1<<(n®SizeMaskUint32)-1)) - f.b >>= n & regSizeMaskUint32 - 
f.nb -= n + length += int(fb & bitMask32[n]) + fb >>= n & regSizeMaskUint32 + fnb -= n default: if debugDecode { fmt.Println(v, ">= maxNumLit") } f.err = CorruptInputError(f.roffset) + f.b, f.nb = fb, fnb return } var dist uint32 if f.hd == nil { - for f.nb < 5 { + for fnb < 5 { c, err := fr.ReadByte() if err != nil { + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("morebits f.nb<5:", err) } @@ -137,12 +141,12 @@ readLiteral: return } f.roffset++ - f.b |= uint32(c) << f.nb - f.nb += 8 + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 } - dist = uint32(bits.Reverse8(uint8(f.b & 0x1F << 3))) - f.b >>= 5 - f.nb -= 5 + dist = uint32(bits.Reverse8(uint8(fb & 0x1F << 3))) + fb >>= 5 + fnb -= 5 } else { // Since a huffmanDecoder can be empty or be composed of a degenerate tree // with single element, huffSym must error on these two edge cases. In both @@ -152,38 +156,35 @@ readLiteral: // Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers, // but is smart enough to keep local variables in registers, so use nb and b, // inline call to moreBits and reassign b,nb back to f on return. - nb, b := f.nb, f.b for { - for nb < n { + for fnb < n { c, err := fr.ReadByte() if err != nil { - f.b = b - f.nb = nb + f.b, f.nb = fb, fnb f.err = noEOF(err) return } f.roffset++ - b |= uint32(c) << (nb & regSizeMaskUint32) - nb += 8 + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 } - chunk := f.hd.chunks[b&(huffmanNumChunks-1)] + chunk := f.hd.chunks[fb&(huffmanNumChunks-1)] n = uint(chunk & huffmanCountMask) if n > huffmanChunkBits { - chunk = f.hd.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hd.linkMask] + chunk = f.hd.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hd.linkMask] n = uint(chunk & huffmanCountMask) } - if n <= nb { + if n <= fnb { if n == 0 { - f.b = b - f.nb = nb + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("huffsym: n==0") } f.err = CorruptInputError(f.roffset) return } - f.b = b >> (n & regSizeMaskUint32) - f.nb = nb - n + fb = fb >> (n & regSizeMaskUint32) + fnb = fnb - n dist = uint32(chunk >> huffmanValueShift) break } @@ -197,9 +198,10 @@ readLiteral: nb := uint(dist-2) >> 1 // have 1 bit in bottom of dist, need nb more. extra := (dist & 1) << (nb & regSizeMaskUint32) - for f.nb < nb { + for fnb < nb { c, err := fr.ReadByte() if err != nil { + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("morebits f.nb>= nb & regSizeMaskUint32 - f.nb -= nb + extra |= fb & bitMask32[nb] + fb >>= nb & regSizeMaskUint32 + fnb -= nb dist = 1<<((nb+1)®SizeMaskUint32) + 1 + extra + // slower: dist = bitMask32[nb+1] + 2 + extra default: + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("dist too big:", dist, maxNumDist) } @@ -223,9 +227,10 @@ readLiteral: } // No check on length; encoding can be prescient. - if dist > uint32(f.dict.histSize()) { + if dist > uint32(dict.histSize()) { + f.b, f.nb = fb, fnb if debugDecode { - fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize()) + fmt.Println("dist > dict.histSize():", dist, dict.histSize()) } f.err = CorruptInputError(f.roffset) return @@ -238,20 +243,22 @@ readLiteral: copyHistory: // Perform a backwards copy according to RFC section 3.2.3. 
{ - cnt := f.dict.tryWriteCopy(f.copyDist, f.copyLen) + cnt := dict.tryWriteCopy(f.copyDist, f.copyLen) if cnt == 0 { - cnt = f.dict.writeCopy(f.copyDist, f.copyLen) + cnt = dict.writeCopy(f.copyDist, f.copyLen) } f.copyLen -= cnt - if f.dict.availWrite() == 0 || f.copyLen > 0 { - f.toRead = f.dict.readFlush() + if dict.availWrite() == 0 || f.copyLen > 0 { + f.toRead = dict.readFlush() f.step = (*decompressor).huffmanBytesBuffer // We need to continue this work f.stepState = stateDict + f.b, f.nb = fb, fnb return } goto readLiteral } + // Not reached } // Decode a single Huffman block from f. @@ -265,6 +272,11 @@ func (f *decompressor) huffmanBytesReader() { ) fr := f.r.(*bytes.Reader) + // Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers, + // but is smart enough to keep local variables in registers, so use nb and b, + // inline call to moreBits and reassign b,nb back to f on return. + fnb, fb, dict := f.nb, f.b, &f.dict + switch f.stepState { case stateInit: goto readLiteral @@ -283,41 +295,35 @@ readLiteral: // cases, the chunks slice will be 0 for the invalid sequence, leading it // satisfy the n == 0 check below. n := uint(f.hl.maxRead) - // Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers, - // but is smart enough to keep local variables in registers, so use nb and b, - // inline call to moreBits and reassign b,nb back to f on return. - nb, b := f.nb, f.b for { - for nb < n { + for fnb < n { c, err := fr.ReadByte() if err != nil { - f.b = b - f.nb = nb + f.b, f.nb = fb, fnb f.err = noEOF(err) return } f.roffset++ - b |= uint32(c) << (nb & regSizeMaskUint32) - nb += 8 + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 } - chunk := f.hl.chunks[b&(huffmanNumChunks-1)] + chunk := f.hl.chunks[fb&(huffmanNumChunks-1)] n = uint(chunk & huffmanCountMask) if n > huffmanChunkBits { - chunk = f.hl.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hl.linkMask] + chunk = f.hl.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hl.linkMask] n = uint(chunk & huffmanCountMask) } - if n <= nb { + if n <= fnb { if n == 0 { - f.b = b - f.nb = nb + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("huffsym: n==0") } f.err = CorruptInputError(f.roffset) return } - f.b = b >> (n & regSizeMaskUint32) - f.nb = nb - n + fb = fb >> (n & regSizeMaskUint32) + fnb = fnb - n v = int(chunk >> huffmanValueShift) break } @@ -327,15 +333,17 @@ readLiteral: var length int switch { case v < 256: - f.dict.writeByte(byte(v)) - if f.dict.availWrite() == 0 { - f.toRead = f.dict.readFlush() + dict.writeByte(byte(v)) + if dict.availWrite() == 0 { + f.toRead = dict.readFlush() f.step = (*decompressor).huffmanBytesReader f.stepState = stateInit + f.b, f.nb = fb, fnb return } goto readLiteral case v == 256: + f.b, f.nb = fb, fnb f.finishBlock() return // otherwise, reference to older data @@ -345,9 +353,10 @@ readLiteral: val := decCodeToLen[(v - 257)] length = int(val.length) + 3 n := uint(val.extra) - for f.nb < n { + for fnb < n { c, err := fr.ReadByte() if err != nil { + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("morebits n>0:", err) } @@ -355,25 +364,27 @@ readLiteral: return } f.roffset++ - f.b |= uint32(c) << f.nb - f.nb += 8 + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 } - length += int(f.b & uint32(1<<(n®SizeMaskUint32)-1)) - f.b >>= n & regSizeMaskUint32 - f.nb -= n + length += int(fb & bitMask32[n]) + fb >>= n & regSizeMaskUint32 + fnb -= n default: if debugDecode { fmt.Println(v, ">= maxNumLit") } f.err = CorruptInputError(f.roffset) 
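The `fnb, fb, dict := f.nb, f.b, &f.dict` pattern the generated readers now share is a register-allocation workaround: the compiler keeps plain locals in registers but not fields loaded through `f`, so the bit buffer is copied into locals on entry and flushed back with `f.b, f.nb = fb, fnb` on every exit path. Together with the `bitMask32` table it also avoids recomputing `1<<n - 1` in the hot loop. A minimal sketch of that shape, using an invented `bitReader` type in place of the decompressor:

```go
package main

import (
	"fmt"
	"strings"
)

var bitMask32 = [32]uint32{
	0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF,
	0x1FF, 0x3FF, 0x7FF, 0xFFF, 0x1FFF, 0x3FFF, 0x7FFF, 0xFFFF,
	0x1FFFF, 0x3FFFF, 0x7FFFF, 0xFFFFF, 0x1FFFFF, 0x3FFFFF, 0x7FFFFF, 0xFFFFFF,
	0x1FFFFFF, 0x3FFFFFF, 0x7FFFFFF, 0xFFFFFFF, 0x1FFFFFFF, 0x3FFFFFFF, 0x7FFFFFFF,
}

// bitReader is an illustrative stand-in for the decompressor's b/nb fields.
type bitReader struct {
	r  *strings.Reader
	b  uint32 // bit buffer, LSB-first
	nb uint   // number of valid bits in b
}

// readBits keeps the buffer in locals for the whole call and writes it back
// to the struct before every return, mirroring the fb/fnb pattern above.
// n must be below 32.
func (br *bitReader) readBits(n uint) (uint32, error) {
	fb, fnb := br.b, br.nb
	for fnb < n {
		c, err := br.r.ReadByte()
		if err != nil {
			br.b, br.nb = fb, fnb
			return 0, err
		}
		fb |= uint32(c) << fnb
		fnb += 8
	}
	v := fb & bitMask32[n]
	fb >>= n
	fnb -= n
	br.b, br.nb = fb, fnb
	return v, nil
}

func main() {
	br := &bitReader{r: strings.NewReader("\xAB\xCD")}
	v, _ := br.readBits(12)
	fmt.Printf("%#x\n", v) // 0xdab
}
```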
+ f.b, f.nb = fb, fnb return } var dist uint32 if f.hd == nil { - for f.nb < 5 { + for fnb < 5 { c, err := fr.ReadByte() if err != nil { + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("morebits f.nb<5:", err) } @@ -381,12 +392,12 @@ readLiteral: return } f.roffset++ - f.b |= uint32(c) << f.nb - f.nb += 8 + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 } - dist = uint32(bits.Reverse8(uint8(f.b & 0x1F << 3))) - f.b >>= 5 - f.nb -= 5 + dist = uint32(bits.Reverse8(uint8(fb & 0x1F << 3))) + fb >>= 5 + fnb -= 5 } else { // Since a huffmanDecoder can be empty or be composed of a degenerate tree // with single element, huffSym must error on these two edge cases. In both @@ -396,38 +407,35 @@ readLiteral: // Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers, // but is smart enough to keep local variables in registers, so use nb and b, // inline call to moreBits and reassign b,nb back to f on return. - nb, b := f.nb, f.b for { - for nb < n { + for fnb < n { c, err := fr.ReadByte() if err != nil { - f.b = b - f.nb = nb + f.b, f.nb = fb, fnb f.err = noEOF(err) return } f.roffset++ - b |= uint32(c) << (nb & regSizeMaskUint32) - nb += 8 + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 } - chunk := f.hd.chunks[b&(huffmanNumChunks-1)] + chunk := f.hd.chunks[fb&(huffmanNumChunks-1)] n = uint(chunk & huffmanCountMask) if n > huffmanChunkBits { - chunk = f.hd.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hd.linkMask] + chunk = f.hd.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hd.linkMask] n = uint(chunk & huffmanCountMask) } - if n <= nb { + if n <= fnb { if n == 0 { - f.b = b - f.nb = nb + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("huffsym: n==0") } f.err = CorruptInputError(f.roffset) return } - f.b = b >> (n & regSizeMaskUint32) - f.nb = nb - n + fb = fb >> (n & regSizeMaskUint32) + fnb = fnb - n dist = uint32(chunk >> huffmanValueShift) break } @@ -441,9 +449,10 @@ readLiteral: nb := uint(dist-2) >> 1 // have 1 bit in bottom of dist, need nb more. extra := (dist & 1) << (nb & regSizeMaskUint32) - for f.nb < nb { + for fnb < nb { c, err := fr.ReadByte() if err != nil { + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("morebits f.nb>= nb & regSizeMaskUint32 - f.nb -= nb + extra |= fb & bitMask32[nb] + fb >>= nb & regSizeMaskUint32 + fnb -= nb dist = 1<<((nb+1)®SizeMaskUint32) + 1 + extra + // slower: dist = bitMask32[nb+1] + 2 + extra default: + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("dist too big:", dist, maxNumDist) } @@ -467,9 +478,10 @@ readLiteral: } // No check on length; encoding can be prescient. - if dist > uint32(f.dict.histSize()) { + if dist > uint32(dict.histSize()) { + f.b, f.nb = fb, fnb if debugDecode { - fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize()) + fmt.Println("dist > dict.histSize():", dist, dict.histSize()) } f.err = CorruptInputError(f.roffset) return @@ -482,20 +494,22 @@ readLiteral: copyHistory: // Perform a backwards copy according to RFC section 3.2.3. 
{ - cnt := f.dict.tryWriteCopy(f.copyDist, f.copyLen) + cnt := dict.tryWriteCopy(f.copyDist, f.copyLen) if cnt == 0 { - cnt = f.dict.writeCopy(f.copyDist, f.copyLen) + cnt = dict.writeCopy(f.copyDist, f.copyLen) } f.copyLen -= cnt - if f.dict.availWrite() == 0 || f.copyLen > 0 { - f.toRead = f.dict.readFlush() + if dict.availWrite() == 0 || f.copyLen > 0 { + f.toRead = dict.readFlush() f.step = (*decompressor).huffmanBytesReader // We need to continue this work f.stepState = stateDict + f.b, f.nb = fb, fnb return } goto readLiteral } + // Not reached } // Decode a single Huffman block from f. @@ -509,6 +523,11 @@ func (f *decompressor) huffmanBufioReader() { ) fr := f.r.(*bufio.Reader) + // Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers, + // but is smart enough to keep local variables in registers, so use nb and b, + // inline call to moreBits and reassign b,nb back to f on return. + fnb, fb, dict := f.nb, f.b, &f.dict + switch f.stepState { case stateInit: goto readLiteral @@ -527,41 +546,35 @@ readLiteral: // cases, the chunks slice will be 0 for the invalid sequence, leading it // satisfy the n == 0 check below. n := uint(f.hl.maxRead) - // Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers, - // but is smart enough to keep local variables in registers, so use nb and b, - // inline call to moreBits and reassign b,nb back to f on return. - nb, b := f.nb, f.b for { - for nb < n { + for fnb < n { c, err := fr.ReadByte() if err != nil { - f.b = b - f.nb = nb + f.b, f.nb = fb, fnb f.err = noEOF(err) return } f.roffset++ - b |= uint32(c) << (nb & regSizeMaskUint32) - nb += 8 + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 } - chunk := f.hl.chunks[b&(huffmanNumChunks-1)] + chunk := f.hl.chunks[fb&(huffmanNumChunks-1)] n = uint(chunk & huffmanCountMask) if n > huffmanChunkBits { - chunk = f.hl.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hl.linkMask] + chunk = f.hl.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hl.linkMask] n = uint(chunk & huffmanCountMask) } - if n <= nb { + if n <= fnb { if n == 0 { - f.b = b - f.nb = nb + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("huffsym: n==0") } f.err = CorruptInputError(f.roffset) return } - f.b = b >> (n & regSizeMaskUint32) - f.nb = nb - n + fb = fb >> (n & regSizeMaskUint32) + fnb = fnb - n v = int(chunk >> huffmanValueShift) break } @@ -571,15 +584,17 @@ readLiteral: var length int switch { case v < 256: - f.dict.writeByte(byte(v)) - if f.dict.availWrite() == 0 { - f.toRead = f.dict.readFlush() + dict.writeByte(byte(v)) + if dict.availWrite() == 0 { + f.toRead = dict.readFlush() f.step = (*decompressor).huffmanBufioReader f.stepState = stateInit + f.b, f.nb = fb, fnb return } goto readLiteral case v == 256: + f.b, f.nb = fb, fnb f.finishBlock() return // otherwise, reference to older data @@ -589,9 +604,10 @@ readLiteral: val := decCodeToLen[(v - 257)] length = int(val.length) + 3 n := uint(val.extra) - for f.nb < n { + for fnb < n { c, err := fr.ReadByte() if err != nil { + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("morebits n>0:", err) } @@ -599,25 +615,27 @@ readLiteral: return } f.roffset++ - f.b |= uint32(c) << f.nb - f.nb += 8 + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 } - length += int(f.b & uint32(1<<(n®SizeMaskUint32)-1)) - f.b >>= n & regSizeMaskUint32 - f.nb -= n + length += int(fb & bitMask32[n]) + fb >>= n & regSizeMaskUint32 + fnb -= n default: if debugDecode { fmt.Println(v, ">= maxNumLit") } f.err = CorruptInputError(f.roffset) 
+ f.b, f.nb = fb, fnb return } var dist uint32 if f.hd == nil { - for f.nb < 5 { + for fnb < 5 { c, err := fr.ReadByte() if err != nil { + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("morebits f.nb<5:", err) } @@ -625,12 +643,12 @@ readLiteral: return } f.roffset++ - f.b |= uint32(c) << f.nb - f.nb += 8 + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 } - dist = uint32(bits.Reverse8(uint8(f.b & 0x1F << 3))) - f.b >>= 5 - f.nb -= 5 + dist = uint32(bits.Reverse8(uint8(fb & 0x1F << 3))) + fb >>= 5 + fnb -= 5 } else { // Since a huffmanDecoder can be empty or be composed of a degenerate tree // with single element, huffSym must error on these two edge cases. In both @@ -640,38 +658,35 @@ readLiteral: // Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers, // but is smart enough to keep local variables in registers, so use nb and b, // inline call to moreBits and reassign b,nb back to f on return. - nb, b := f.nb, f.b for { - for nb < n { + for fnb < n { c, err := fr.ReadByte() if err != nil { - f.b = b - f.nb = nb + f.b, f.nb = fb, fnb f.err = noEOF(err) return } f.roffset++ - b |= uint32(c) << (nb & regSizeMaskUint32) - nb += 8 + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 } - chunk := f.hd.chunks[b&(huffmanNumChunks-1)] + chunk := f.hd.chunks[fb&(huffmanNumChunks-1)] n = uint(chunk & huffmanCountMask) if n > huffmanChunkBits { - chunk = f.hd.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hd.linkMask] + chunk = f.hd.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hd.linkMask] n = uint(chunk & huffmanCountMask) } - if n <= nb { + if n <= fnb { if n == 0 { - f.b = b - f.nb = nb + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("huffsym: n==0") } f.err = CorruptInputError(f.roffset) return } - f.b = b >> (n & regSizeMaskUint32) - f.nb = nb - n + fb = fb >> (n & regSizeMaskUint32) + fnb = fnb - n dist = uint32(chunk >> huffmanValueShift) break } @@ -685,9 +700,10 @@ readLiteral: nb := uint(dist-2) >> 1 // have 1 bit in bottom of dist, need nb more. extra := (dist & 1) << (nb & regSizeMaskUint32) - for f.nb < nb { + for fnb < nb { c, err := fr.ReadByte() if err != nil { + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("morebits f.nb>= nb & regSizeMaskUint32 - f.nb -= nb + extra |= fb & bitMask32[nb] + fb >>= nb & regSizeMaskUint32 + fnb -= nb dist = 1<<((nb+1)®SizeMaskUint32) + 1 + extra + // slower: dist = bitMask32[nb+1] + 2 + extra default: + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("dist too big:", dist, maxNumDist) } @@ -711,9 +729,10 @@ readLiteral: } // No check on length; encoding can be prescient. - if dist > uint32(f.dict.histSize()) { + if dist > uint32(dict.histSize()) { + f.b, f.nb = fb, fnb if debugDecode { - fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize()) + fmt.Println("dist > dict.histSize():", dist, dict.histSize()) } f.err = CorruptInputError(f.roffset) return @@ -726,20 +745,22 @@ readLiteral: copyHistory: // Perform a backwards copy according to RFC section 3.2.3. 
{ - cnt := f.dict.tryWriteCopy(f.copyDist, f.copyLen) + cnt := dict.tryWriteCopy(f.copyDist, f.copyLen) if cnt == 0 { - cnt = f.dict.writeCopy(f.copyDist, f.copyLen) + cnt = dict.writeCopy(f.copyDist, f.copyLen) } f.copyLen -= cnt - if f.dict.availWrite() == 0 || f.copyLen > 0 { - f.toRead = f.dict.readFlush() + if dict.availWrite() == 0 || f.copyLen > 0 { + f.toRead = dict.readFlush() f.step = (*decompressor).huffmanBufioReader // We need to continue this work f.stepState = stateDict + f.b, f.nb = fb, fnb return } goto readLiteral } + // Not reached } // Decode a single Huffman block from f. @@ -753,6 +774,11 @@ func (f *decompressor) huffmanStringsReader() { ) fr := f.r.(*strings.Reader) + // Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers, + // but is smart enough to keep local variables in registers, so use nb and b, + // inline call to moreBits and reassign b,nb back to f on return. + fnb, fb, dict := f.nb, f.b, &f.dict + switch f.stepState { case stateInit: goto readLiteral @@ -771,41 +797,286 @@ readLiteral: // cases, the chunks slice will be 0 for the invalid sequence, leading it // satisfy the n == 0 check below. n := uint(f.hl.maxRead) + for { + for fnb < n { + c, err := fr.ReadByte() + if err != nil { + f.b, f.nb = fb, fnb + f.err = noEOF(err) + return + } + f.roffset++ + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 + } + chunk := f.hl.chunks[fb&(huffmanNumChunks-1)] + n = uint(chunk & huffmanCountMask) + if n > huffmanChunkBits { + chunk = f.hl.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hl.linkMask] + n = uint(chunk & huffmanCountMask) + } + if n <= fnb { + if n == 0 { + f.b, f.nb = fb, fnb + if debugDecode { + fmt.Println("huffsym: n==0") + } + f.err = CorruptInputError(f.roffset) + return + } + fb = fb >> (n & regSizeMaskUint32) + fnb = fnb - n + v = int(chunk >> huffmanValueShift) + break + } + } + } + + var length int + switch { + case v < 256: + dict.writeByte(byte(v)) + if dict.availWrite() == 0 { + f.toRead = dict.readFlush() + f.step = (*decompressor).huffmanStringsReader + f.stepState = stateInit + f.b, f.nb = fb, fnb + return + } + goto readLiteral + case v == 256: + f.b, f.nb = fb, fnb + f.finishBlock() + return + // otherwise, reference to older data + case v < 265: + length = v - (257 - 3) + case v < maxNumLit: + val := decCodeToLen[(v - 257)] + length = int(val.length) + 3 + n := uint(val.extra) + for fnb < n { + c, err := fr.ReadByte() + if err != nil { + f.b, f.nb = fb, fnb + if debugDecode { + fmt.Println("morebits n>0:", err) + } + f.err = err + return + } + f.roffset++ + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 + } + length += int(fb & bitMask32[n]) + fb >>= n & regSizeMaskUint32 + fnb -= n + default: + if debugDecode { + fmt.Println(v, ">= maxNumLit") + } + f.err = CorruptInputError(f.roffset) + f.b, f.nb = fb, fnb + return + } + + var dist uint32 + if f.hd == nil { + for fnb < 5 { + c, err := fr.ReadByte() + if err != nil { + f.b, f.nb = fb, fnb + if debugDecode { + fmt.Println("morebits f.nb<5:", err) + } + f.err = err + return + } + f.roffset++ + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 + } + dist = uint32(bits.Reverse8(uint8(fb & 0x1F << 3))) + fb >>= 5 + fnb -= 5 + } else { + // Since a huffmanDecoder can be empty or be composed of a degenerate tree + // with single element, huffSym must error on these two edge cases. In both + // cases, the chunks slice will be 0 for the invalid sequence, leading it + // satisfy the n == 0 check below. 
+ n := uint(f.hd.maxRead) // Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers, // but is smart enough to keep local variables in registers, so use nb and b, // inline call to moreBits and reassign b,nb back to f on return. - nb, b := f.nb, f.b for { - for nb < n { + for fnb < n { c, err := fr.ReadByte() if err != nil { - f.b = b - f.nb = nb + f.b, f.nb = fb, fnb f.err = noEOF(err) return } f.roffset++ - b |= uint32(c) << (nb & regSizeMaskUint32) - nb += 8 + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 } - chunk := f.hl.chunks[b&(huffmanNumChunks-1)] + chunk := f.hd.chunks[fb&(huffmanNumChunks-1)] n = uint(chunk & huffmanCountMask) if n > huffmanChunkBits { - chunk = f.hl.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hl.linkMask] + chunk = f.hd.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hd.linkMask] n = uint(chunk & huffmanCountMask) } - if n <= nb { + if n <= fnb { if n == 0 { - f.b = b - f.nb = nb + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("huffsym: n==0") } f.err = CorruptInputError(f.roffset) return } - f.b = b >> (n & regSizeMaskUint32) - f.nb = nb - n + fb = fb >> (n & regSizeMaskUint32) + fnb = fnb - n + dist = uint32(chunk >> huffmanValueShift) + break + } + } + } + + switch { + case dist < 4: + dist++ + case dist < maxNumDist: + nb := uint(dist-2) >> 1 + // have 1 bit in bottom of dist, need nb more. + extra := (dist & 1) << (nb & regSizeMaskUint32) + for fnb < nb { + c, err := fr.ReadByte() + if err != nil { + f.b, f.nb = fb, fnb + if debugDecode { + fmt.Println("morebits f.nb>= nb & regSizeMaskUint32 + fnb -= nb + dist = 1<<((nb+1)®SizeMaskUint32) + 1 + extra + // slower: dist = bitMask32[nb+1] + 2 + extra + default: + f.b, f.nb = fb, fnb + if debugDecode { + fmt.Println("dist too big:", dist, maxNumDist) + } + f.err = CorruptInputError(f.roffset) + return + } + + // No check on length; encoding can be prescient. + if dist > uint32(dict.histSize()) { + f.b, f.nb = fb, fnb + if debugDecode { + fmt.Println("dist > dict.histSize():", dist, dict.histSize()) + } + f.err = CorruptInputError(f.roffset) + return + } + + f.copyLen, f.copyDist = length, int(dist) + goto copyHistory + } + +copyHistory: + // Perform a backwards copy according to RFC section 3.2.3. + { + cnt := dict.tryWriteCopy(f.copyDist, f.copyLen) + if cnt == 0 { + cnt = dict.writeCopy(f.copyDist, f.copyLen) + } + f.copyLen -= cnt + + if dict.availWrite() == 0 || f.copyLen > 0 { + f.toRead = dict.readFlush() + f.step = (*decompressor).huffmanStringsReader // We need to continue this work + f.stepState = stateDict + f.b, f.nb = fb, fnb + return + } + goto readLiteral + } + // Not reached +} + +// Decode a single Huffman block from f. +// hl and hd are the Huffman states for the lit/length values +// and the distance values, respectively. If hd == nil, using the +// fixed distance encoding associated with fixed Huffman blocks. +func (f *decompressor) huffmanGenericReader() { + const ( + stateInit = iota // Zero value must be stateInit + stateDict + ) + fr := f.r.(Reader) + + // Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers, + // but is smart enough to keep local variables in registers, so use nb and b, + // inline call to moreBits and reassign b,nb back to f on return. + fnb, fb, dict := f.nb, f.b, &f.dict + + switch f.stepState { + case stateInit: + goto readLiteral + case stateDict: + goto copyHistory + } + +readLiteral: + // Read literal and/or (length, distance) according to RFC section 3.2.3. 
+ { + var v int + { + // Inlined v, err := f.huffSym(f.hl) + // Since a huffmanDecoder can be empty or be composed of a degenerate tree + // with single element, huffSym must error on these two edge cases. In both + // cases, the chunks slice will be 0 for the invalid sequence, leading it + // satisfy the n == 0 check below. + n := uint(f.hl.maxRead) + for { + for fnb < n { + c, err := fr.ReadByte() + if err != nil { + f.b, f.nb = fb, fnb + f.err = noEOF(err) + return + } + f.roffset++ + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 + } + chunk := f.hl.chunks[fb&(huffmanNumChunks-1)] + n = uint(chunk & huffmanCountMask) + if n > huffmanChunkBits { + chunk = f.hl.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hl.linkMask] + n = uint(chunk & huffmanCountMask) + } + if n <= fnb { + if n == 0 { + f.b, f.nb = fb, fnb + if debugDecode { + fmt.Println("huffsym: n==0") + } + f.err = CorruptInputError(f.roffset) + return + } + fb = fb >> (n & regSizeMaskUint32) + fnb = fnb - n v = int(chunk >> huffmanValueShift) break } @@ -815,15 +1086,17 @@ readLiteral: var length int switch { case v < 256: - f.dict.writeByte(byte(v)) - if f.dict.availWrite() == 0 { - f.toRead = f.dict.readFlush() - f.step = (*decompressor).huffmanStringsReader + dict.writeByte(byte(v)) + if dict.availWrite() == 0 { + f.toRead = dict.readFlush() + f.step = (*decompressor).huffmanGenericReader f.stepState = stateInit + f.b, f.nb = fb, fnb return } goto readLiteral case v == 256: + f.b, f.nb = fb, fnb f.finishBlock() return // otherwise, reference to older data @@ -833,9 +1106,10 @@ readLiteral: val := decCodeToLen[(v - 257)] length = int(val.length) + 3 n := uint(val.extra) - for f.nb < n { + for fnb < n { c, err := fr.ReadByte() if err != nil { + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("morebits n>0:", err) } @@ -843,25 +1117,27 @@ readLiteral: return } f.roffset++ - f.b |= uint32(c) << f.nb - f.nb += 8 + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 } - length += int(f.b & uint32(1<<(n®SizeMaskUint32)-1)) - f.b >>= n & regSizeMaskUint32 - f.nb -= n + length += int(fb & bitMask32[n]) + fb >>= n & regSizeMaskUint32 + fnb -= n default: if debugDecode { fmt.Println(v, ">= maxNumLit") } f.err = CorruptInputError(f.roffset) + f.b, f.nb = fb, fnb return } var dist uint32 if f.hd == nil { - for f.nb < 5 { + for fnb < 5 { c, err := fr.ReadByte() if err != nil { + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("morebits f.nb<5:", err) } @@ -869,12 +1145,12 @@ readLiteral: return } f.roffset++ - f.b |= uint32(c) << f.nb - f.nb += 8 + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 } - dist = uint32(bits.Reverse8(uint8(f.b & 0x1F << 3))) - f.b >>= 5 - f.nb -= 5 + dist = uint32(bits.Reverse8(uint8(fb & 0x1F << 3))) + fb >>= 5 + fnb -= 5 } else { // Since a huffmanDecoder can be empty or be composed of a degenerate tree // with single element, huffSym must error on these two edge cases. In both @@ -884,38 +1160,35 @@ readLiteral: // Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers, // but is smart enough to keep local variables in registers, so use nb and b, // inline call to moreBits and reassign b,nb back to f on return. 
- nb, b := f.nb, f.b for { - for nb < n { + for fnb < n { c, err := fr.ReadByte() if err != nil { - f.b = b - f.nb = nb + f.b, f.nb = fb, fnb f.err = noEOF(err) return } f.roffset++ - b |= uint32(c) << (nb & regSizeMaskUint32) - nb += 8 + fb |= uint32(c) << (fnb & regSizeMaskUint32) + fnb += 8 } - chunk := f.hd.chunks[b&(huffmanNumChunks-1)] + chunk := f.hd.chunks[fb&(huffmanNumChunks-1)] n = uint(chunk & huffmanCountMask) if n > huffmanChunkBits { - chunk = f.hd.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hd.linkMask] + chunk = f.hd.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hd.linkMask] n = uint(chunk & huffmanCountMask) } - if n <= nb { + if n <= fnb { if n == 0 { - f.b = b - f.nb = nb + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("huffsym: n==0") } f.err = CorruptInputError(f.roffset) return } - f.b = b >> (n & regSizeMaskUint32) - f.nb = nb - n + fb = fb >> (n & regSizeMaskUint32) + fnb = fnb - n dist = uint32(chunk >> huffmanValueShift) break } @@ -929,9 +1202,10 @@ readLiteral: nb := uint(dist-2) >> 1 // have 1 bit in bottom of dist, need nb more. extra := (dist & 1) << (nb & regSizeMaskUint32) - for f.nb < nb { + for fnb < nb { c, err := fr.ReadByte() if err != nil { + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("morebits f.nb>= nb & regSizeMaskUint32 - f.nb -= nb + extra |= fb & bitMask32[nb] + fb >>= nb & regSizeMaskUint32 + fnb -= nb dist = 1<<((nb+1)®SizeMaskUint32) + 1 + extra + // slower: dist = bitMask32[nb+1] + 2 + extra default: + f.b, f.nb = fb, fnb if debugDecode { fmt.Println("dist too big:", dist, maxNumDist) } @@ -955,9 +1231,10 @@ readLiteral: } // No check on length; encoding can be prescient. - if dist > uint32(f.dict.histSize()) { + if dist > uint32(dict.histSize()) { + f.b, f.nb = fb, fnb if debugDecode { - fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize()) + fmt.Println("dist > dict.histSize():", dist, dict.histSize()) } f.err = CorruptInputError(f.roffset) return @@ -970,20 +1247,22 @@ readLiteral: copyHistory: // Perform a backwards copy according to RFC section 3.2.3. { - cnt := f.dict.tryWriteCopy(f.copyDist, f.copyLen) + cnt := dict.tryWriteCopy(f.copyDist, f.copyLen) if cnt == 0 { - cnt = f.dict.writeCopy(f.copyDist, f.copyLen) + cnt = dict.writeCopy(f.copyDist, f.copyLen) } f.copyLen -= cnt - if f.dict.availWrite() == 0 || f.copyLen > 0 { - f.toRead = f.dict.readFlush() - f.step = (*decompressor).huffmanStringsReader // We need to continue this work + if dict.availWrite() == 0 || f.copyLen > 0 { + f.toRead = dict.readFlush() + f.step = (*decompressor).huffmanGenericReader // We need to continue this work f.stepState = stateDict + f.b, f.nb = fb, fnb return } goto readLiteral } + // Not reached } func (f *decompressor) huffmanBlockDecoder() func() { @@ -996,7 +1275,9 @@ func (f *decompressor) huffmanBlockDecoder() func() { return f.huffmanBufioReader case *strings.Reader: return f.huffmanStringsReader + case Reader: + return f.huffmanGenericReader default: - return f.huffmanBlockGeneric + return f.huffmanGenericReader } } diff --git a/vendor/github.com/klauspost/compress/flate/level1.go b/vendor/github.com/klauspost/compress/flate/level1.go index 1e5eea3968aa..703b9a89aa39 100644 --- a/vendor/github.com/klauspost/compress/flate/level1.go +++ b/vendor/github.com/klauspost/compress/flate/level1.go @@ -1,6 +1,10 @@ package flate -import "fmt" +import ( + "encoding/binary" + "fmt" + "math/bits" +) // fastGen maintains the table for matches, // and the previous byte block for level 2. 
@@ -15,6 +19,7 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) { const ( inputMargin = 12 - 1 minNonLiteralBlockSize = 1 + 1 + inputMargin + hashBytes = 5 ) if debugDeflate && e.cur < 0 { panic(fmt.Sprint("e.cur < 0: ", e.cur)) @@ -64,7 +69,7 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) { sLimit := int32(len(src) - inputMargin) // nextEmit is where in src the next emitLiteral should start from. - cv := load3232(src, s) + cv := load6432(src, s) for { const skipLog = 5 @@ -73,7 +78,7 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) { nextS := s var candidate tableEntry for { - nextHash := hash(cv) + nextHash := hashLen(cv, tableBits, hashBytes) candidate = e.table[nextHash] nextS = s + doEvery + (s-nextEmit)>>skipLog if nextS > sLimit { @@ -82,16 +87,16 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) { now := load6432(src, nextS) e.table[nextHash] = tableEntry{offset: s + e.cur} - nextHash = hash(uint32(now)) + nextHash = hashLen(now, tableBits, hashBytes) offset := s - (candidate.offset - e.cur) - if offset < maxMatchOffset && cv == load3232(src, candidate.offset-e.cur) { + if offset < maxMatchOffset && uint32(cv) == load3232(src, candidate.offset-e.cur) { e.table[nextHash] = tableEntry{offset: nextS + e.cur} break } // Do one right away... - cv = uint32(now) + cv = now s = nextS nextS++ candidate = e.table[nextHash] @@ -99,11 +104,11 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) { e.table[nextHash] = tableEntry{offset: s + e.cur} offset = s - (candidate.offset - e.cur) - if offset < maxMatchOffset && cv == load3232(src, candidate.offset-e.cur) { + if offset < maxMatchOffset && uint32(cv) == load3232(src, candidate.offset-e.cur) { e.table[nextHash] = tableEntry{offset: nextS + e.cur} break } - cv = uint32(now) + cv = now s = nextS } @@ -116,7 +121,32 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) { // Extend the 4-byte match as long as possible. t := candidate.offset - e.cur - l := e.matchlenLong(s+4, t+4, src) + 4 + var l = int32(4) + if false { + l = e.matchlenLong(s+4, t+4, src) + 4 + } else { + // inlined: + a := src[s+4:] + b := src[t+4:] + for len(a) >= 8 { + if diff := binary.LittleEndian.Uint64(a) ^ binary.LittleEndian.Uint64(b); diff != 0 { + l += int32(bits.TrailingZeros64(diff) >> 3) + break + } + l += 8 + a = a[8:] + b = b[8:] + } + if len(a) < 8 { + b = b[:len(a)] + for i := range a { + if a[i] != b[i] { + break + } + l++ + } + } + } // Extend backwards for t > 0 && s > nextEmit && src[t-1] == src[s-1] { @@ -125,11 +155,43 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) { l++ } if nextEmit < s { - emitLiteral(dst, src[nextEmit:s]) + if false { + emitLiteral(dst, src[nextEmit:s]) + } else { + for _, v := range src[nextEmit:s] { + dst.tokens[dst.n] = token(v) + dst.litHist[v]++ + dst.n++ + } + } } // Save the match found - dst.AddMatchLong(l, uint32(s-t-baseMatchOffset)) + if false { + dst.AddMatchLong(l, uint32(s-t-baseMatchOffset)) + } else { + // Inlined... + xoffset := uint32(s - t - baseMatchOffset) + xlength := l + oc := offsetCode(xoffset) + xoffset |= oc << 16 + for xlength > 0 { + xl := xlength + if xl > 258 { + if xl > 258+baseMatchLength { + xl = 258 + } else { + xl = 258 - baseMatchLength + } + } + xlength -= xl + xl -= baseMatchLength + dst.extraHist[lengthCodes1[uint8(xl)]]++ + dst.offHist[oc]++ + dst.tokens[dst.n] = token(matchType | uint32(xl)<= s { @@ -137,9 +199,9 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) { } if s >= sLimit { // Index first pair after match end. 
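The inlined replacement for `matchlenLong` in level 1 above is the standard 8-bytes-at-a-time comparison: XOR two 64-bit little-endian loads and, at the first mismatch, turn the lowest set bit into a byte count with `bits.TrailingZeros64`. A self-contained sketch; the helper name is ours, not the package's:

```go
package main

import (
	"encoding/binary"
	"fmt"
	"math/bits"
)

// matchLen returns the number of leading bytes a and b have in common,
// comparing 8 bytes per iteration as in the inlined loop above.
func matchLen(a, b []byte) int {
	if len(b) < len(a) {
		a = a[:len(b)]
	}
	var n int
	for len(a) >= 8 {
		if diff := binary.LittleEndian.Uint64(a) ^ binary.LittleEndian.Uint64(b); diff != 0 {
			// The lowest set bit of the XOR marks the first differing byte.
			return n + (bits.TrailingZeros64(diff) >> 3)
		}
		n += 8
		a, b = a[8:], b[8:]
	}
	for i := range a {
		if a[i] != b[i] {
			break
		}
		n++
	}
	return n
}

func main() {
	fmt.Println(matchLen([]byte("compressible"), []byte("compressed"))) // 8
}
```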
- if int(s+l+4) < len(src) { - cv := load3232(src, s) - e.table[hash(cv)] = tableEntry{offset: s + e.cur} + if int(s+l+8) < len(src) { + cv := load6432(src, s) + e.table[hashLen(cv, tableBits, hashBytes)] = tableEntry{offset: s + e.cur} } goto emitRemainder } @@ -152,16 +214,16 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) { // three load32 calls. x := load6432(src, s-2) o := e.cur + s - 2 - prevHash := hash(uint32(x)) + prevHash := hashLen(x, tableBits, hashBytes) e.table[prevHash] = tableEntry{offset: o} x >>= 16 - currHash := hash(uint32(x)) + currHash := hashLen(x, tableBits, hashBytes) candidate = e.table[currHash] e.table[currHash] = tableEntry{offset: o + 2} offset := s - (candidate.offset - e.cur) if offset > maxMatchOffset || uint32(x) != load3232(src, candidate.offset-e.cur) { - cv = uint32(x >> 8) + cv = x >> 8 s++ break } diff --git a/vendor/github.com/klauspost/compress/flate/level2.go b/vendor/github.com/klauspost/compress/flate/level2.go index 234c4389ab39..876dfbe30543 100644 --- a/vendor/github.com/klauspost/compress/flate/level2.go +++ b/vendor/github.com/klauspost/compress/flate/level2.go @@ -16,6 +16,7 @@ func (e *fastEncL2) Encode(dst *tokens, src []byte) { const ( inputMargin = 12 - 1 minNonLiteralBlockSize = 1 + 1 + inputMargin + hashBytes = 5 ) if debugDeflate && e.cur < 0 { @@ -66,7 +67,7 @@ func (e *fastEncL2) Encode(dst *tokens, src []byte) { sLimit := int32(len(src) - inputMargin) // nextEmit is where in src the next emitLiteral should start from. - cv := load3232(src, s) + cv := load6432(src, s) for { // When should we start skipping if we haven't found matches in a long while. const skipLog = 5 @@ -75,7 +76,7 @@ func (e *fastEncL2) Encode(dst *tokens, src []byte) { nextS := s var candidate tableEntry for { - nextHash := hash4u(cv, bTableBits) + nextHash := hashLen(cv, bTableBits, hashBytes) s = nextS nextS = s + doEvery + (s-nextEmit)>>skipLog if nextS > sLimit { @@ -84,16 +85,16 @@ func (e *fastEncL2) Encode(dst *tokens, src []byte) { candidate = e.table[nextHash] now := load6432(src, nextS) e.table[nextHash] = tableEntry{offset: s + e.cur} - nextHash = hash4u(uint32(now), bTableBits) + nextHash = hashLen(now, bTableBits, hashBytes) offset := s - (candidate.offset - e.cur) - if offset < maxMatchOffset && cv == load3232(src, candidate.offset-e.cur) { + if offset < maxMatchOffset && uint32(cv) == load3232(src, candidate.offset-e.cur) { e.table[nextHash] = tableEntry{offset: nextS + e.cur} break } // Do one right away... - cv = uint32(now) + cv = now s = nextS nextS++ candidate = e.table[nextHash] @@ -101,10 +102,10 @@ func (e *fastEncL2) Encode(dst *tokens, src []byte) { e.table[nextHash] = tableEntry{offset: s + e.cur} offset = s - (candidate.offset - e.cur) - if offset < maxMatchOffset && cv == load3232(src, candidate.offset-e.cur) { + if offset < maxMatchOffset && uint32(cv) == load3232(src, candidate.offset-e.cur) { break } - cv = uint32(now) + cv = now } // A 4-byte match has been found. We'll later see if more than 4 bytes @@ -134,7 +135,15 @@ func (e *fastEncL2) Encode(dst *tokens, src []byte) { l++ } if nextEmit < s { - emitLiteral(dst, src[nextEmit:s]) + if false { + emitLiteral(dst, src[nextEmit:s]) + } else { + for _, v := range src[nextEmit:s] { + dst.tokens[dst.n] = token(v) + dst.litHist[v]++ + dst.n++ + } + } } dst.AddMatchLong(l, uint32(s-t-baseMatchOffset)) @@ -146,9 +155,9 @@ func (e *fastEncL2) Encode(dst *tokens, src []byte) { if s >= sLimit { // Index first pair after match end. 
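Levels 1 and 2 above swap their dedicated hash helpers for a single `hashLen(cv, tableBits, hashBytes)` call over the low `hashBytes` bytes of an already-loaded 64-bit word. A common way to implement such a hash is a multiply-shift over the selected bytes; the sketch below assumes that shape, and the constant and function name are illustrative rather than the package's actual values:

```go
package main

import "fmt"

// hashBytesOf hashes the low nBytes bytes of v into a tableBits-bit index
// with a multiply-shift hash. Shifting v left first discards the unused high
// bytes so they cannot influence the result.
func hashBytesOf(v uint64, tableBits, nBytes uint8) uint32 {
	const prime = 0x9E3779B185EBCA87 // illustrative odd 64-bit constant
	return uint32(((v << (64 - 8*uint64(nBytes))) * prime) >> (64 - uint64(tableBits)))
}

func main() {
	cv := uint64(0x1122334455667788)
	same := hashBytesOf(cv, 15, 5) == hashBytesOf(cv^0xFF00000000000000, 15, 5)
	// Second value is true: the high 3 bytes take no part in a 5-byte hash.
	fmt.Println(hashBytesOf(cv, 15, 5), same)
}
```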
- if int(s+l+4) < len(src) { - cv := load3232(src, s) - e.table[hash4u(cv, bTableBits)] = tableEntry{offset: s + e.cur} + if int(s+l+8) < len(src) { + cv := load6432(src, s) + e.table[hashLen(cv, bTableBits, hashBytes)] = tableEntry{offset: s + e.cur} } goto emitRemainder } @@ -156,15 +165,15 @@ func (e *fastEncL2) Encode(dst *tokens, src []byte) { // Store every second hash in-between, but offset by 1. for i := s - l + 2; i < s-5; i += 7 { x := load6432(src, i) - nextHash := hash4u(uint32(x), bTableBits) + nextHash := hashLen(x, bTableBits, hashBytes) e.table[nextHash] = tableEntry{offset: e.cur + i} // Skip one x >>= 16 - nextHash = hash4u(uint32(x), bTableBits) + nextHash = hashLen(x, bTableBits, hashBytes) e.table[nextHash] = tableEntry{offset: e.cur + i + 2} // Skip one x >>= 16 - nextHash = hash4u(uint32(x), bTableBits) + nextHash = hashLen(x, bTableBits, hashBytes) e.table[nextHash] = tableEntry{offset: e.cur + i + 4} } @@ -176,17 +185,17 @@ func (e *fastEncL2) Encode(dst *tokens, src []byte) { // three load32 calls. x := load6432(src, s-2) o := e.cur + s - 2 - prevHash := hash4u(uint32(x), bTableBits) - prevHash2 := hash4u(uint32(x>>8), bTableBits) + prevHash := hashLen(x, bTableBits, hashBytes) + prevHash2 := hashLen(x>>8, bTableBits, hashBytes) e.table[prevHash] = tableEntry{offset: o} e.table[prevHash2] = tableEntry{offset: o + 1} - currHash := hash4u(uint32(x>>16), bTableBits) + currHash := hashLen(x>>16, bTableBits, hashBytes) candidate = e.table[currHash] e.table[currHash] = tableEntry{offset: o + 2} offset := s - (candidate.offset - e.cur) if offset > maxMatchOffset || uint32(x>>16) != load3232(src, candidate.offset-e.cur) { - cv = uint32(x >> 24) + cv = x >> 24 s++ break } diff --git a/vendor/github.com/klauspost/compress/flate/level3.go b/vendor/github.com/klauspost/compress/flate/level3.go index c22b4244a5c0..7aa2b72a129f 100644 --- a/vendor/github.com/klauspost/compress/flate/level3.go +++ b/vendor/github.com/klauspost/compress/flate/level3.go @@ -5,14 +5,17 @@ import "fmt" // fastEncL3 type fastEncL3 struct { fastGen - table [tableSize]tableEntryPrev + table [1 << 16]tableEntryPrev } // Encode uses a similar algorithm to level 2, will check up to two candidates. func (e *fastEncL3) Encode(dst *tokens, src []byte) { const ( - inputMargin = 8 - 1 + inputMargin = 12 - 1 minNonLiteralBlockSize = 1 + 1 + inputMargin + tableBits = 16 + tableSize = 1 << tableBits + hashBytes = 5 ) if debugDeflate && e.cur < 0 { @@ -67,20 +70,20 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) { sLimit := int32(len(src) - inputMargin) // nextEmit is where in src the next emitLiteral should start from. - cv := load3232(src, s) + cv := load6432(src, s) for { - const skipLog = 6 + const skipLog = 7 nextS := s var candidate tableEntry for { - nextHash := hash(cv) + nextHash := hashLen(cv, tableBits, hashBytes) s = nextS nextS = s + 1 + (s-nextEmit)>>skipLog if nextS > sLimit { goto emitRemainder } candidates := e.table[nextHash] - now := load3232(src, nextS) + now := load6432(src, nextS) // Safe offset distance until s + 4... 
minOffset := e.cur + s - (maxMatchOffset - 4) @@ -94,8 +97,8 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) { continue } - if cv == load3232(src, candidate.offset-e.cur) { - if candidates.Prev.offset < minOffset || cv != load3232(src, candidates.Prev.offset-e.cur) { + if uint32(cv) == load3232(src, candidate.offset-e.cur) { + if candidates.Prev.offset < minOffset || uint32(cv) != load3232(src, candidates.Prev.offset-e.cur) { break } // Both match and are valid, pick longest. @@ -110,7 +113,7 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) { // We only check if value mismatches. // Offset will always be invalid in other cases. candidate = candidates.Prev - if candidate.offset > minOffset && cv == load3232(src, candidate.offset-e.cur) { + if candidate.offset > minOffset && uint32(cv) == load3232(src, candidate.offset-e.cur) { break } } @@ -141,7 +144,15 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) { l++ } if nextEmit < s { - emitLiteral(dst, src[nextEmit:s]) + if false { + emitLiteral(dst, src[nextEmit:s]) + } else { + for _, v := range src[nextEmit:s] { + dst.tokens[dst.n] = token(v) + dst.litHist[v]++ + dst.n++ + } + } } dst.AddMatchLong(l, uint32(s-t-baseMatchOffset)) @@ -154,9 +165,9 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) { if s >= sLimit { t += l // Index first pair after match end. - if int(t+4) < len(src) && t > 0 { - cv := load3232(src, t) - nextHash := hash(cv) + if int(t+8) < len(src) && t > 0 { + cv = load6432(src, t) + nextHash := hashLen(cv, tableBits, hashBytes) e.table[nextHash] = tableEntryPrev{ Prev: e.table[nextHash].Cur, Cur: tableEntry{offset: e.cur + t}, @@ -165,32 +176,33 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) { goto emitRemainder } - // We could immediately start working at s now, but to improve - // compression we first update the hash table at s-3 to s. - x := load6432(src, s-3) - prevHash := hash(uint32(x)) - e.table[prevHash] = tableEntryPrev{ - Prev: e.table[prevHash].Cur, - Cur: tableEntry{offset: e.cur + s - 3}, + // Store every 5th hash in-between. + for i := s - l + 2; i < s-5; i += 6 { + nextHash := hashLen(load6432(src, i), tableBits, hashBytes) + e.table[nextHash] = tableEntryPrev{ + Prev: e.table[nextHash].Cur, + Cur: tableEntry{offset: e.cur + i}} } - x >>= 8 - prevHash = hash(uint32(x)) + // We could immediately start working at s now, but to improve + // compression we first update the hash table at s-2 to s. + x := load6432(src, s-2) + prevHash := hashLen(x, tableBits, hashBytes) e.table[prevHash] = tableEntryPrev{ Prev: e.table[prevHash].Cur, Cur: tableEntry{offset: e.cur + s - 2}, } x >>= 8 - prevHash = hash(uint32(x)) + prevHash = hashLen(x, tableBits, hashBytes) e.table[prevHash] = tableEntryPrev{ Prev: e.table[prevHash].Cur, Cur: tableEntry{offset: e.cur + s - 1}, } x >>= 8 - currHash := hash(uint32(x)) + currHash := hashLen(x, tableBits, hashBytes) candidates := e.table[currHash] - cv = uint32(x) + cv = x e.table[currHash] = tableEntryPrev{ Prev: candidates.Cur, Cur: tableEntry{offset: s + e.cur}, @@ -200,18 +212,18 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) { candidate = candidates.Cur minOffset := e.cur + s - (maxMatchOffset - 4) - if candidate.offset > minOffset && cv != load3232(src, candidate.offset-e.cur) { - // We only check if value mismatches. - // Offset will always be invalid in other cases. + if candidate.offset > minOffset { + if uint32(cv) == load3232(src, candidate.offset-e.cur) { + // Found a match... 
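Level 3 keeps two candidate positions per hash bucket (`Cur` and `Prev`), so inserting a new offset demotes the current entry instead of discarding it, and both candidates remain available to the match search. A tiny sketch of that bucket update, with illustrative types mirroring the ones above:

```go
package main

import "fmt"

type tableEntry struct{ offset int32 }

// tableEntryPrev keeps the two most recent positions that hashed to a bucket.
type tableEntryPrev struct {
	Cur, Prev tableEntry
}

// insert records a new offset in bucket h, demoting the old Cur to Prev.
func insert(table []tableEntryPrev, h uint32, offset int32) {
	table[h] = tableEntryPrev{
		Prev: table[h].Cur,
		Cur:  tableEntry{offset: offset},
	}
}

func main() {
	table := make([]tableEntryPrev, 1<<16)
	insert(table, 42, 100)
	insert(table, 42, 200)
	// Both candidates can be probed during the match search.
	fmt.Println(table[42].Cur.offset, table[42].Prev.offset) // 200 100
}
```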
+ continue + } candidate = candidates.Prev - if candidate.offset > minOffset && cv == load3232(src, candidate.offset-e.cur) { - offset := s - (candidate.offset - e.cur) - if offset <= maxMatchOffset { - continue - } + if candidate.offset > minOffset && uint32(cv) == load3232(src, candidate.offset-e.cur) { + // Match at prev... + continue } } - cv = uint32(x >> 8) + cv = x >> 8 s++ break } diff --git a/vendor/github.com/klauspost/compress/flate/level4.go b/vendor/github.com/klauspost/compress/flate/level4.go index e62f0c02b1e7..23c08b325cf7 100644 --- a/vendor/github.com/klauspost/compress/flate/level4.go +++ b/vendor/github.com/klauspost/compress/flate/level4.go @@ -12,6 +12,7 @@ func (e *fastEncL4) Encode(dst *tokens, src []byte) { const ( inputMargin = 12 - 1 minNonLiteralBlockSize = 1 + 1 + inputMargin + hashShortBytes = 4 ) if debugDeflate && e.cur < 0 { panic(fmt.Sprint("e.cur < 0: ", e.cur)) @@ -80,7 +81,7 @@ func (e *fastEncL4) Encode(dst *tokens, src []byte) { nextS := s var t int32 for { - nextHashS := hash4x64(cv, tableBits) + nextHashS := hashLen(cv, tableBits, hashShortBytes) nextHashL := hash7(cv, tableBits) s = nextS @@ -135,7 +136,15 @@ func (e *fastEncL4) Encode(dst *tokens, src []byte) { l++ } if nextEmit < s { - emitLiteral(dst, src[nextEmit:s]) + if false { + emitLiteral(dst, src[nextEmit:s]) + } else { + for _, v := range src[nextEmit:s] { + dst.tokens[dst.n] = token(v) + dst.litHist[v]++ + dst.n++ + } + } } if debugDeflate { if t >= s { @@ -160,7 +169,7 @@ func (e *fastEncL4) Encode(dst *tokens, src []byte) { // Index first pair after match end. if int(s+8) < len(src) { cv := load6432(src, s) - e.table[hash4x64(cv, tableBits)] = tableEntry{offset: s + e.cur} + e.table[hashLen(cv, tableBits, hashShortBytes)] = tableEntry{offset: s + e.cur} e.bTable[hash7(cv, tableBits)] = tableEntry{offset: s + e.cur} } goto emitRemainder @@ -175,7 +184,7 @@ func (e *fastEncL4) Encode(dst *tokens, src []byte) { t2 := tableEntry{offset: t.offset + 1} e.bTable[hash7(cv, tableBits)] = t e.bTable[hash7(cv>>8, tableBits)] = t2 - e.table[hash4u(uint32(cv>>8), tableBits)] = t2 + e.table[hashLen(cv>>8, tableBits, hashShortBytes)] = t2 i += 3 for ; i < s-1; i += 3 { @@ -184,7 +193,7 @@ func (e *fastEncL4) Encode(dst *tokens, src []byte) { t2 := tableEntry{offset: t.offset + 1} e.bTable[hash7(cv, tableBits)] = t e.bTable[hash7(cv>>8, tableBits)] = t2 - e.table[hash4u(uint32(cv>>8), tableBits)] = t2 + e.table[hashLen(cv>>8, tableBits, hashShortBytes)] = t2 } } } @@ -193,7 +202,7 @@ func (e *fastEncL4) Encode(dst *tokens, src []byte) { // compression we first update the hash table at s-1 and at s. 
x := load6432(src, s-1) o := e.cur + s - 1 - prevHashS := hash4x64(x, tableBits) + prevHashS := hashLen(x, tableBits, hashShortBytes) prevHashL := hash7(x, tableBits) e.table[prevHashS] = tableEntry{offset: o} e.bTable[prevHashL] = tableEntry{offset: o} diff --git a/vendor/github.com/klauspost/compress/flate/level5.go b/vendor/github.com/klauspost/compress/flate/level5.go index 293a3a320b7c..83ef50ba45f0 100644 --- a/vendor/github.com/klauspost/compress/flate/level5.go +++ b/vendor/github.com/klauspost/compress/flate/level5.go @@ -12,6 +12,7 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) { const ( inputMargin = 12 - 1 minNonLiteralBlockSize = 1 + 1 + inputMargin + hashShortBytes = 4 ) if debugDeflate && e.cur < 0 { panic(fmt.Sprint("e.cur < 0: ", e.cur)) @@ -88,7 +89,7 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) { var l int32 var t int32 for { - nextHashS := hash4x64(cv, tableBits) + nextHashS := hashLen(cv, tableBits, hashShortBytes) nextHashL := hash7(cv, tableBits) s = nextS @@ -105,7 +106,7 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) { eLong := &e.bTable[nextHashL] eLong.Cur, eLong.Prev = entry, eLong.Cur - nextHashS = hash4x64(next, tableBits) + nextHashS = hashLen(next, tableBits, hashShortBytes) nextHashL = hash7(next, tableBits) t = lCandidate.Cur.offset - e.cur @@ -191,14 +192,21 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) { // Try to locate a better match by checking the end of best match... if sAt := s + l; l < 30 && sAt < sLimit { + // Allow some bytes at the beginning to mismatch. + // Sweet spot is 2/3 bytes depending on input. + // 3 is only a little better when it is but sometimes a lot worse. + // The skipped bytes are tested in Extend backwards, + // and still picked up as part of the match if they do. 
+ const skipBeginning = 2 eLong := e.bTable[hash7(load6432(src, sAt), tableBits)].Cur.offset - // Test current - t2 := eLong - e.cur - l - off := s - t2 + t2 := eLong - e.cur - l + skipBeginning + s2 := s + skipBeginning + off := s2 - t2 if t2 >= 0 && off < maxMatchOffset && off > 0 { - if l2 := e.matchlenLong(s, t2, src); l2 > l { + if l2 := e.matchlenLong(s2, t2, src); l2 > l { t = t2 l = l2 + s = s2 } } } @@ -210,7 +218,15 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) { l++ } if nextEmit < s { - emitLiteral(dst, src[nextEmit:s]) + if false { + emitLiteral(dst, src[nextEmit:s]) + } else { + for _, v := range src[nextEmit:s] { + dst.tokens[dst.n] = token(v) + dst.litHist[v]++ + dst.n++ + } + } } if debugDeflate { if t >= s { @@ -242,7 +258,7 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) { if i < s-1 { cv := load6432(src, i) t := tableEntry{offset: i + e.cur} - e.table[hash4x64(cv, tableBits)] = t + e.table[hashLen(cv, tableBits, hashShortBytes)] = t eLong := &e.bTable[hash7(cv, tableBits)] eLong.Cur, eLong.Prev = t, eLong.Cur @@ -255,7 +271,7 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) { // We only have enough bits for a short entry at i+2 cv >>= 8 t = tableEntry{offset: t.offset + 1} - e.table[hash4x64(cv, tableBits)] = t + e.table[hashLen(cv, tableBits, hashShortBytes)] = t // Skip one - otherwise we risk hitting 's' i += 4 @@ -265,7 +281,7 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) { t2 := tableEntry{offset: t.offset + 1} eLong := &e.bTable[hash7(cv, tableBits)] eLong.Cur, eLong.Prev = t, eLong.Cur - e.table[hash4u(uint32(cv>>8), tableBits)] = t2 + e.table[hashLen(cv>>8, tableBits, hashShortBytes)] = t2 } } } @@ -274,7 +290,7 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) { // compression we first update the hash table at s-1 and at s. x := load6432(src, s-1) o := e.cur + s - 1 - prevHashS := hash4x64(x, tableBits) + prevHashS := hashLen(x, tableBits, hashShortBytes) prevHashL := hash7(x, tableBits) e.table[prevHashS] = tableEntry{offset: o} eLong := &e.bTable[prevHashL] diff --git a/vendor/github.com/klauspost/compress/flate/level6.go b/vendor/github.com/klauspost/compress/flate/level6.go index a709977ec49a..f1e9d98fa505 100644 --- a/vendor/github.com/klauspost/compress/flate/level6.go +++ b/vendor/github.com/klauspost/compress/flate/level6.go @@ -12,6 +12,7 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) { const ( inputMargin = 12 - 1 minNonLiteralBlockSize = 1 + 1 + inputMargin + hashShortBytes = 4 ) if debugDeflate && e.cur < 0 { panic(fmt.Sprint("e.cur < 0: ", e.cur)) @@ -90,7 +91,7 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) { var l int32 var t int32 for { - nextHashS := hash4x64(cv, tableBits) + nextHashS := hashLen(cv, tableBits, hashShortBytes) nextHashL := hash7(cv, tableBits) s = nextS nextS = s + doEvery + (s-nextEmit)>>skipLog @@ -107,7 +108,7 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) { eLong.Cur, eLong.Prev = entry, eLong.Cur // Calculate hashes of 'next' - nextHashS = hash4x64(next, tableBits) + nextHashS = hashLen(next, tableBits, hashShortBytes) nextHashL = hash7(next, tableBits) t = lCandidate.Cur.offset - e.cur @@ -213,24 +214,33 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) { // Try to locate a better match by checking the end-of-match... if sAt := s + l; sAt < sLimit { + // Allow some bytes at the beginning to mismatch. + // Sweet spot is 2/3 bytes depending on input. + // 3 is only a little better when it is but sometimes a lot worse. 
+ // The skipped bytes are tested in Extend backwards, + // and still picked up as part of the match if they do. + const skipBeginning = 2 eLong := &e.bTable[hash7(load6432(src, sAt), tableBits)] // Test current - t2 := eLong.Cur.offset - e.cur - l - off := s - t2 + t2 := eLong.Cur.offset - e.cur - l + skipBeginning + s2 := s + skipBeginning + off := s2 - t2 if off < maxMatchOffset { if off > 0 && t2 >= 0 { - if l2 := e.matchlenLong(s, t2, src); l2 > l { + if l2 := e.matchlenLong(s2, t2, src); l2 > l { t = t2 l = l2 + s = s2 } } // Test next: - t2 = eLong.Prev.offset - e.cur - l - off := s - t2 + t2 = eLong.Prev.offset - e.cur - l + skipBeginning + off := s2 - t2 if off > 0 && off < maxMatchOffset && t2 >= 0 { - if l2 := e.matchlenLong(s, t2, src); l2 > l { + if l2 := e.matchlenLong(s2, t2, src); l2 > l { t = t2 l = l2 + s = s2 } } } @@ -243,7 +253,15 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) { l++ } if nextEmit < s { - emitLiteral(dst, src[nextEmit:s]) + if false { + emitLiteral(dst, src[nextEmit:s]) + } else { + for _, v := range src[nextEmit:s] { + dst.tokens[dst.n] = token(v) + dst.litHist[v]++ + dst.n++ + } + } } if false { if t >= s { @@ -269,7 +287,7 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) { // Index after match end. for i := nextS + 1; i < int32(len(src))-8; i += 2 { cv := load6432(src, i) - e.table[hash4x64(cv, tableBits)] = tableEntry{offset: i + e.cur} + e.table[hashLen(cv, tableBits, hashShortBytes)] = tableEntry{offset: i + e.cur} eLong := &e.bTable[hash7(cv, tableBits)] eLong.Cur, eLong.Prev = tableEntry{offset: i + e.cur}, eLong.Cur } @@ -284,7 +302,7 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) { t2 := tableEntry{offset: t.offset + 1} eLong := &e.bTable[hash7(cv, tableBits)] eLong2 := &e.bTable[hash7(cv>>8, tableBits)] - e.table[hash4x64(cv, tableBits)] = t + e.table[hashLen(cv, tableBits, hashShortBytes)] = t eLong.Cur, eLong.Prev = t, eLong.Cur eLong2.Cur, eLong2.Prev = t2, eLong2.Cur } diff --git a/vendor/github.com/klauspost/compress/flate/stateless.go b/vendor/github.com/klauspost/compress/flate/stateless.go index 53e899124639..93a1d150312e 100644 --- a/vendor/github.com/klauspost/compress/flate/stateless.go +++ b/vendor/github.com/klauspost/compress/flate/stateless.go @@ -59,9 +59,9 @@ var bitWriterPool = sync.Pool{ }, } -// StatelessDeflate allows to compress directly to a Writer without retaining state. +// StatelessDeflate allows compressing directly to a Writer without retaining state. // When returning everything will be flushed. -// Up to 8KB of an optional dictionary can be given which is presumed to presumed to precede the block. +// Up to 8KB of an optional dictionary can be given which is presumed to precede the block. // Longer dictionaries will be truncated and will still produce valid output. // Sending nil dictionary is perfectly fine. 
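As a quick illustration of the `StatelessDeflate` doc comment above, this is roughly how a one-shot stateless compression call could look; the wrapper name and error handling are ours, only `StatelessDeflate` itself comes from the package:

```go
import (
	"bytes"

	"github.com/klauspost/compress/flate"
)

// compressStateless deflates data in one shot without keeping encoder state.
// A nil dictionary is fine, and eof=true terminates the stream.
func compressStateless(data []byte) ([]byte, error) {
	var out bytes.Buffer
	if err := flate.StatelessDeflate(&out, data, true, nil); err != nil {
		return nil, err
	}
	return out.Bytes(), nil
}
```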
func StatelessDeflate(out io.Writer, in []byte, eof bool, dict []byte) error { @@ -249,7 +249,15 @@ func statelessEnc(dst *tokens, src []byte, startAt int16) { l++ } if nextEmit < s { - emitLiteral(dst, src[nextEmit:s]) + if false { + emitLiteral(dst, src[nextEmit:s]) + } else { + for _, v := range src[nextEmit:s] { + dst.tokens[dst.n] = token(v) + dst.litHist[v]++ + dst.n++ + } + } } // Save the match found diff --git a/vendor/github.com/klauspost/compress/flate/token.go b/vendor/github.com/klauspost/compress/flate/token.go index eb862d7a920d..d818790c1323 100644 --- a/vendor/github.com/klauspost/compress/flate/token.go +++ b/vendor/github.com/klauspost/compress/flate/token.go @@ -13,11 +13,10 @@ import ( ) const ( - // From top - // 2 bits: type 0 = literal 1=EOF 2=Match 3=Unused - // 8 bits: xlength = length - MIN_MATCH_LENGTH - // 5 bits offsetcode - // 16 bits xoffset = offset - MIN_OFFSET_SIZE, or literal + // bits 0-16 xoffset = offset - MIN_OFFSET_SIZE, or literal - 16 bits + // bits 16-22 offsetcode - 5 bits + // bits 22-30 xlength = length - MIN_MATCH_LENGTH - 8 bits + // bits 30-32 type 0 = literal 1=EOF 2=Match 3=Unused - 2 bits lengthShift = 22 offsetMask = 1<maxnumlit offHist [32]uint16 // offset codes litHist [256]uint16 // codes 0->255 - n uint16 // Must be able to contain maxStoreBlockSize + nFilled int + n uint16 // Must be able to contain maxStoreBlockSize tokens [maxStoreBlockSize + 1]token } @@ -142,7 +141,7 @@ func (t *tokens) Reset() { return } t.n = 0 - t.nLits = 0 + t.nFilled = 0 for i := range t.litHist[:] { t.litHist[i] = 0 } @@ -161,12 +160,12 @@ func (t *tokens) Fill() { for i, v := range t.litHist[:] { if v == 0 { t.litHist[i] = 1 - t.nLits++ + t.nFilled++ } } for i, v := range t.extraHist[:literalCount-256] { if v == 0 { - t.nLits++ + t.nFilled++ t.extraHist[i] = 1 } } @@ -196,20 +195,17 @@ func (t *tokens) indexTokens(in []token) { // emitLiteral writes a literal chunk and returns the number of bytes written. func emitLiteral(dst *tokens, lit []byte) { - ol := int(dst.n) - for i, v := range lit { - dst.tokens[(i+ol)&maxStoreBlockSize] = token(v) + for _, v := range lit { + dst.tokens[dst.n] = token(v) dst.litHist[v]++ + dst.n++ } - dst.n += uint16(len(lit)) - dst.nLits += len(lit) } func (t *tokens) AddLiteral(lit byte) { t.tokens[t.n] = token(lit) t.litHist[lit]++ t.n++ - t.nLits++ } // from https://stackoverflow.com/a/28730362 @@ -230,8 +226,9 @@ func (t *tokens) EstimatedBits() int { shannon := float32(0) bits := int(0) nMatches := 0 - if t.nLits > 0 { - invTotal := 1.0 / float32(t.nLits) + total := int(t.n) + t.nFilled + if total > 0 { + invTotal := 1.0 / float32(total) for _, v := range t.litHist[:] { if v > 0 { n := float32(v) @@ -275,10 +272,9 @@ func (t *tokens) AddMatch(xlength uint32, xoffset uint32) { } oCode := offsetCode(xoffset) xoffset |= oCode << 16 - t.nLits++ t.extraHist[lengthCodes1[uint8(xlength)]]++ - t.offHist[oCode]++ + t.offHist[oCode&31]++ t.tokens[t.n] = token(matchType | xlength< 258 { // We need to have at least baseMatchLength left over for next loop. - xl = 258 - baseMatchLength + if xl > 258+baseMatchLength { + xl = 258 + } else { + xl = 258 - baseMatchLength + } } xlength -= xl xl -= baseMatchLength - t.nLits++ t.extraHist[lengthCodes1[uint8(xl)]]++ - t.offHist[oc]++ + t.offHist[oc&31]++ t.tokens[t.n] = token(matchType | uint32(xl)<> lengthShift) } -// The code is never more than 8 bits, but is returned as uint32 for convenience. -func lengthCode(len uint8) uint32 { return uint32(lengthCodes[len]) } +// Convert length to code. 
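For reference, the rewritten comment above spells out where each field of a packed 32-bit token lives. A minimal sketch of reading those fields back out by the documented positions; the helper name and masks are ours, not the package's:

```go
// tokenFields splits a packed token by the layout documented above:
// bits 0-15 xoffset (or the literal byte), bits 16-20 offset code,
// bits 22-29 xlength, bits 30-31 token type.
func tokenFields(tok uint32) (typ, xlength, offCode, xoffset uint32) {
	typ = tok >> 30
	xlength = (tok >> 22) & 0xff
	offCode = (tok >> 16) & 0x1f
	xoffset = tok & 0xffff
	return
}
```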
+func lengthCode(len uint8) uint8 { return lengthCodes[len] } // Returns the offset code corresponding to a specific offset func offsetCode(off uint32) uint32 { diff --git a/vendor/github.com/klauspost/compress/gzip/gunzip.go b/vendor/github.com/klauspost/compress/gzip/gunzip.go index 4d7018913e29..66fe5ddf72ce 100644 --- a/vendor/github.com/klauspost/compress/gzip/gunzip.go +++ b/vendor/github.com/klauspost/compress/gzip/gunzip.go @@ -252,42 +252,40 @@ func (z *Reader) Read(p []byte) (n int, err error) { return 0, z.err } - n, z.err = z.decompressor.Read(p) - z.digest = crc32.Update(z.digest, crc32.IEEETable, p[:n]) - z.size += uint32(n) - if z.err != io.EOF { - // In the normal case we return here. - return n, z.err - } + for n == 0 { + n, z.err = z.decompressor.Read(p) + z.digest = crc32.Update(z.digest, crc32.IEEETable, p[:n]) + z.size += uint32(n) + if z.err != io.EOF { + // In the normal case we return here. + return n, z.err + } - // Finished file; check checksum and size. - if _, err := io.ReadFull(z.r, z.buf[:8]); err != nil { - z.err = noEOF(err) - return n, z.err - } - digest := le.Uint32(z.buf[:4]) - size := le.Uint32(z.buf[4:8]) - if digest != z.digest || size != z.size { - z.err = ErrChecksum - return n, z.err - } - z.digest, z.size = 0, 0 + // Finished file; check checksum and size. + if _, err := io.ReadFull(z.r, z.buf[:8]); err != nil { + z.err = noEOF(err) + return n, z.err + } + digest := le.Uint32(z.buf[:4]) + size := le.Uint32(z.buf[4:8]) + if digest != z.digest || size != z.size { + z.err = ErrChecksum + return n, z.err + } + z.digest, z.size = 0, 0 - // File is ok; check if there is another. - if !z.multistream { - return n, io.EOF - } - z.err = nil // Remove io.EOF + // File is ok; check if there is another. + if !z.multistream { + return n, io.EOF + } + z.err = nil // Remove io.EOF - if _, z.err = z.readHeader(); z.err != nil { - return n, z.err + if _, z.err = z.readHeader(); z.err != nil { + return n, z.err + } } - // Read from next file, if necessary. - if n > 0 { - return n, nil - } - return z.Read(p) + return n, nil } // Support the io.WriteTo interface for io.Copy and friends. diff --git a/vendor/github.com/klauspost/compress/huff0/bitreader.go b/vendor/github.com/klauspost/compress/huff0/bitreader.go index a4979e8868a5..504a7be9dae3 100644 --- a/vendor/github.com/klauspost/compress/huff0/bitreader.go +++ b/vendor/github.com/klauspost/compress/huff0/bitreader.go @@ -8,115 +8,10 @@ package huff0 import ( "encoding/binary" "errors" + "fmt" "io" ) -// bitReader reads a bitstream in reverse. -// The last set bit indicates the start of the stream and is used -// for aligning the input. -type bitReader struct { - in []byte - off uint // next byte to read is at in[off - 1] - value uint64 - bitsRead uint8 -} - -// init initializes and resets the bit reader. -func (b *bitReader) init(in []byte) error { - if len(in) < 1 { - return errors.New("corrupt stream: too short") - } - b.in = in - b.off = uint(len(in)) - // The highest bit of the last byte indicates where to start - v := in[len(in)-1] - if v == 0 { - return errors.New("corrupt stream, did not find end of stream") - } - b.bitsRead = 64 - b.value = 0 - if len(in) >= 8 { - b.fillFastStart() - } else { - b.fill() - b.fill() - } - b.bitsRead += 8 - uint8(highBit32(uint32(v))) - return nil -} - -// peekBitsFast requires that at least one bit is requested every time. -// There are no checks if the buffer is filled. 
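The gunzip change above replaces the recursive `Read` with a loop that keeps reading across gzip members. A small sketch of what that covers, assuming the package remains a drop-in for the standard library's `compress/gzip`:

```go
package main

import (
	"bytes"
	"fmt"
	"io"

	"github.com/klauspost/compress/gzip"
)

func main() {
	// Two gzip members written back to back into one buffer.
	var buf bytes.Buffer
	for _, part := range []string{"hello ", "multistream"} {
		zw := gzip.NewWriter(&buf)
		zw.Write([]byte(part))
		zw.Close()
	}

	zr, err := gzip.NewReader(&buf)
	if err != nil {
		panic(err)
	}
	// Multistream mode is the default, so one ReadAll drains both members;
	// the reader now iterates across member boundaries instead of recursing.
	out, err := io.ReadAll(zr)
	if err != nil {
		panic(err)
	}
	fmt.Println(string(out)) // hello multistream
}
```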
-func (b *bitReader) peekBitsFast(n uint8) uint16 { - const regMask = 64 - 1 - v := uint16((b.value << (b.bitsRead & regMask)) >> ((regMask + 1 - n) & regMask)) - return v -} - -// fillFast() will make sure at least 32 bits are available. -// There must be at least 4 bytes available. -func (b *bitReader) fillFast() { - if b.bitsRead < 32 { - return - } - - // 2 bounds checks. - v := b.in[b.off-4 : b.off] - v = v[:4] - low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24) - b.value = (b.value << 32) | uint64(low) - b.bitsRead -= 32 - b.off -= 4 -} - -func (b *bitReader) advance(n uint8) { - b.bitsRead += n -} - -// fillFastStart() assumes the bitreader is empty and there is at least 8 bytes to read. -func (b *bitReader) fillFastStart() { - // Do single re-slice to avoid bounds checks. - b.value = binary.LittleEndian.Uint64(b.in[b.off-8:]) - b.bitsRead = 0 - b.off -= 8 -} - -// fill() will make sure at least 32 bits are available. -func (b *bitReader) fill() { - if b.bitsRead < 32 { - return - } - if b.off > 4 { - v := b.in[b.off-4:] - v = v[:4] - low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24) - b.value = (b.value << 32) | uint64(low) - b.bitsRead -= 32 - b.off -= 4 - return - } - for b.off > 0 { - b.value = (b.value << 8) | uint64(b.in[b.off-1]) - b.bitsRead -= 8 - b.off-- - } -} - -// finished returns true if all bits have been read from the bit stream. -func (b *bitReader) finished() bool { - return b.off == 0 && b.bitsRead >= 64 -} - -// close the bitstream and returns an error if out-of-buffer reads occurred. -func (b *bitReader) close() error { - // Release reference. - b.in = nil - if b.bitsRead > 64 { - return io.ErrUnexpectedEOF - } - return nil -} - // bitReader reads a bitstream in reverse. // The last set bit indicates the start of the stream and is used // for aligning the input. @@ -213,10 +108,17 @@ func (b *bitReaderBytes) finished() bool { return b.off == 0 && b.bitsRead >= 64 } +func (b *bitReaderBytes) remaining() uint { + return b.off*8 + uint(64-b.bitsRead) +} + // close the bitstream and returns an error if out-of-buffer reads occurred. func (b *bitReaderBytes) close() error { // Release reference. b.in = nil + if b.remaining() > 0 { + return fmt.Errorf("corrupt input: %d bits remain on stream", b.remaining()) + } if b.bitsRead > 64 { return io.ErrUnexpectedEOF } @@ -313,15 +215,17 @@ func (b *bitReaderShifted) fill() { } } -// finished returns true if all bits have been read from the bit stream. -func (b *bitReaderShifted) finished() bool { - return b.off == 0 && b.bitsRead >= 64 +func (b *bitReaderShifted) remaining() uint { + return b.off*8 + uint(64-b.bitsRead) } // close the bitstream and returns an error if out-of-buffer reads occurred. func (b *bitReaderShifted) close() error { // Release reference. b.in = nil + if b.remaining() > 0 { + return fmt.Errorf("corrupt input: %d bits remain on stream", b.remaining()) + } if b.bitsRead > 64 { return io.ErrUnexpectedEOF } diff --git a/vendor/github.com/klauspost/compress/huff0/bitwriter.go b/vendor/github.com/klauspost/compress/huff0/bitwriter.go index 6bce4e87d4ff..ec71f7a349a1 100644 --- a/vendor/github.com/klauspost/compress/huff0/bitwriter.go +++ b/vendor/github.com/klauspost/compress/huff0/bitwriter.go @@ -5,8 +5,6 @@ package huff0 -import "fmt" - // bitWriter will write bits. // First bit will be LSB of the first byte of output. 
type bitWriter struct { @@ -23,14 +21,6 @@ var bitMask16 = [32]uint16{ 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF} /* up to 16 bits */ -// addBits16NC will add up to 16 bits. -// It will not check if there is space for them, -// so the caller must ensure that it has flushed recently. -func (b *bitWriter) addBits16NC(value uint16, bits uint8) { - b.bitContainer |= uint64(value&bitMask16[bits&31]) << (b.nBits & 63) - b.nBits += bits -} - // addBits16Clean will add up to 16 bits. value may not contain more set bits than indicated. // It will not check if there is space for them, so the caller must ensure that it has flushed recently. func (b *bitWriter) addBits16Clean(value uint16, bits uint8) { @@ -70,104 +60,6 @@ func (b *bitWriter) encTwoSymbols(ct cTable, av, bv byte) { b.nBits += encA.nBits + encB.nBits } -// addBits16ZeroNC will add up to 16 bits. -// It will not check if there is space for them, -// so the caller must ensure that it has flushed recently. -// This is fastest if bits can be zero. -func (b *bitWriter) addBits16ZeroNC(value uint16, bits uint8) { - if bits == 0 { - return - } - value <<= (16 - bits) & 15 - value >>= (16 - bits) & 15 - b.bitContainer |= uint64(value) << (b.nBits & 63) - b.nBits += bits -} - -// flush will flush all pending full bytes. -// There will be at least 56 bits available for writing when this has been called. -// Using flush32 is faster, but leaves less space for writing. -func (b *bitWriter) flush() { - v := b.nBits >> 3 - switch v { - case 0: - return - case 1: - b.out = append(b.out, - byte(b.bitContainer), - ) - b.bitContainer >>= 1 << 3 - case 2: - b.out = append(b.out, - byte(b.bitContainer), - byte(b.bitContainer>>8), - ) - b.bitContainer >>= 2 << 3 - case 3: - b.out = append(b.out, - byte(b.bitContainer), - byte(b.bitContainer>>8), - byte(b.bitContainer>>16), - ) - b.bitContainer >>= 3 << 3 - case 4: - b.out = append(b.out, - byte(b.bitContainer), - byte(b.bitContainer>>8), - byte(b.bitContainer>>16), - byte(b.bitContainer>>24), - ) - b.bitContainer >>= 4 << 3 - case 5: - b.out = append(b.out, - byte(b.bitContainer), - byte(b.bitContainer>>8), - byte(b.bitContainer>>16), - byte(b.bitContainer>>24), - byte(b.bitContainer>>32), - ) - b.bitContainer >>= 5 << 3 - case 6: - b.out = append(b.out, - byte(b.bitContainer), - byte(b.bitContainer>>8), - byte(b.bitContainer>>16), - byte(b.bitContainer>>24), - byte(b.bitContainer>>32), - byte(b.bitContainer>>40), - ) - b.bitContainer >>= 6 << 3 - case 7: - b.out = append(b.out, - byte(b.bitContainer), - byte(b.bitContainer>>8), - byte(b.bitContainer>>16), - byte(b.bitContainer>>24), - byte(b.bitContainer>>32), - byte(b.bitContainer>>40), - byte(b.bitContainer>>48), - ) - b.bitContainer >>= 7 << 3 - case 8: - b.out = append(b.out, - byte(b.bitContainer), - byte(b.bitContainer>>8), - byte(b.bitContainer>>16), - byte(b.bitContainer>>24), - byte(b.bitContainer>>32), - byte(b.bitContainer>>40), - byte(b.bitContainer>>48), - byte(b.bitContainer>>56), - ) - b.bitContainer = 0 - b.nBits = 0 - return - default: - panic(fmt.Errorf("bits (%d) > 64", b.nBits)) - } - b.nBits &= 7 -} - // flush32 will flush out, so there are at least 32 bits available for writing. func (b *bitWriter) flush32() { if b.nBits < 32 { @@ -201,10 +93,3 @@ func (b *bitWriter) close() error { b.flushAlign() return nil } - -// reset and continue writing by appending to out. 
-func (b *bitWriter) reset(out []byte) { - b.bitContainer = 0 - b.nBits = 0 - b.out = out -} diff --git a/vendor/github.com/klauspost/compress/huff0/bytereader.go b/vendor/github.com/klauspost/compress/huff0/bytereader.go index 50bcdf6ea99c..4dcab8d23277 100644 --- a/vendor/github.com/klauspost/compress/huff0/bytereader.go +++ b/vendor/github.com/klauspost/compress/huff0/bytereader.go @@ -20,11 +20,6 @@ func (b *byteReader) init(in []byte) { b.off = 0 } -// advance the stream b n bytes. -func (b *byteReader) advance(n uint) { - b.off += int(n) -} - // Int32 returns a little endian int32 starting at current offset. func (b byteReader) Int32() int32 { v3 := int32(b.b[b.off+3]) @@ -43,11 +38,6 @@ func (b byteReader) Uint32() uint32 { return (v3 << 24) | (v2 << 16) | (v1 << 8) | v0 } -// unread returns the unread portion of the input. -func (b byteReader) unread() []byte { - return b.b[b.off:] -} - // remain will return the number of bytes remaining. func (b byteReader) remain() int { return len(b.b) - b.off diff --git a/vendor/github.com/klauspost/compress/huff0/compress.go b/vendor/github.com/klauspost/compress/huff0/compress.go index 8323dc053890..4d14542facf3 100644 --- a/vendor/github.com/klauspost/compress/huff0/compress.go +++ b/vendor/github.com/klauspost/compress/huff0/compress.go @@ -2,6 +2,7 @@ package huff0 import ( "fmt" + "math" "runtime" "sync" ) @@ -289,6 +290,10 @@ func (s *Scratch) compress4X(src []byte) ([]byte, error) { if err != nil { return nil, err } + if len(s.Out)-idx > math.MaxUint16 { + // We cannot store the size in the jump table + return nil, ErrIncompressible + } // Write compressed length as little endian before block. if i < 3 { // Last length is not written. @@ -332,6 +337,10 @@ func (s *Scratch) compress4Xp(src []byte) ([]byte, error) { return nil, errs[i] } o := s.tmpOut[i] + if len(o) > math.MaxUint16 { + // We cannot store the size in the jump table + return nil, ErrIncompressible + } // Write compressed length as little endian before block. if i < 3 { // Last length is not written. @@ -395,6 +404,7 @@ func (s *Scratch) canUseTable(c cTable) bool { return true } +//lint:ignore U1000 used for debugging func (s *Scratch) validateTable(c cTable) bool { if len(c) < int(s.symbolLen) { return false diff --git a/vendor/github.com/klauspost/compress/huff0/decompress.go b/vendor/github.com/klauspost/compress/huff0/decompress.go index 2a06bd1a7e52..42a237eac4ab 100644 --- a/vendor/github.com/klauspost/compress/huff0/decompress.go +++ b/vendor/github.com/klauspost/compress/huff0/decompress.go @@ -4,13 +4,13 @@ import ( "errors" "fmt" "io" + "sync" "github.com/klauspost/compress/fse" ) type dTable struct { single []dEntrySingle - double []dEntryDouble } // single-symbols decoding @@ -18,13 +18,6 @@ type dEntrySingle struct { entry uint16 } -// double-symbols decoding -type dEntryDouble struct { - seq [4]byte - nBits uint8 - len uint8 -} - // Uses special code for all tables that are < 8 bits. const use8BitTables = true @@ -34,7 +27,7 @@ const use8BitTables = true // If no Scratch is provided a new one is allocated. // The returned Scratch can be used for encoding or decoding input using this table. 
func ReadTable(in []byte, s *Scratch) (s2 *Scratch, remain []byte, err error) { - s, err = s.prepare(in) + s, err = s.prepare(nil) if err != nil { return s, nil, err } @@ -216,6 +209,7 @@ func (s *Scratch) Decoder() *Decoder { return &Decoder{ dt: s.dt, actualTableLog: s.actualTableLog, + bufs: &s.decPool, } } @@ -223,103 +217,15 @@ func (s *Scratch) Decoder() *Decoder { type Decoder struct { dt dTable actualTableLog uint8 + bufs *sync.Pool } -// Decompress1X will decompress a 1X encoded stream. -// The cap of the output buffer will be the maximum decompressed size. -// The length of the supplied input must match the end of a block exactly. -func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) { - if len(d.dt.single) == 0 { - return nil, errors.New("no table loaded") - } - if use8BitTables && d.actualTableLog <= 8 { - return d.decompress1X8Bit(dst, src) - } - var br bitReaderShifted - err := br.init(src) - if err != nil { - return dst, err - } - maxDecodedSize := cap(dst) - dst = dst[:0] - - // Avoid bounds check by always having full sized table. - const tlSize = 1 << tableLogMax - const tlMask = tlSize - 1 - dt := d.dt.single[:tlSize] - - // Use temp table to avoid bound checks/append penalty. - var buf [256]byte - var off uint8 - - for br.off >= 8 { - br.fillFast() - v := dt[br.peekBitsFast(d.actualTableLog)&tlMask] - br.advance(uint8(v.entry)) - buf[off+0] = uint8(v.entry >> 8) - - v = dt[br.peekBitsFast(d.actualTableLog)&tlMask] - br.advance(uint8(v.entry)) - buf[off+1] = uint8(v.entry >> 8) - - // Refill - br.fillFast() - - v = dt[br.peekBitsFast(d.actualTableLog)&tlMask] - br.advance(uint8(v.entry)) - buf[off+2] = uint8(v.entry >> 8) - - v = dt[br.peekBitsFast(d.actualTableLog)&tlMask] - br.advance(uint8(v.entry)) - buf[off+3] = uint8(v.entry >> 8) - - off += 4 - if off == 0 { - if len(dst)+256 > maxDecodedSize { - br.close() - return nil, ErrMaxDecodedSizeExceeded - } - dst = append(dst, buf[:]...) - } - } - - if len(dst)+int(off) > maxDecodedSize { - br.close() - return nil, ErrMaxDecodedSizeExceeded - } - dst = append(dst, buf[:off]...) - - // br < 8, so uint8 is fine - bitsLeft := uint8(br.off)*8 + 64 - br.bitsRead - for bitsLeft > 0 { - br.fill() - if false && br.bitsRead >= 32 { - if br.off >= 4 { - v := br.in[br.off-4:] - v = v[:4] - low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24) - br.value = (br.value << 32) | uint64(low) - br.bitsRead -= 32 - br.off -= 4 - } else { - for br.off > 0 { - br.value = (br.value << 8) | uint64(br.in[br.off-1]) - br.bitsRead -= 8 - br.off-- - } - } - } - if len(dst) >= maxDecodedSize { - br.close() - return nil, ErrMaxDecodedSizeExceeded - } - v := d.dt.single[br.peekBitsFast(d.actualTableLog)&tlMask] - nBits := uint8(v.entry) - br.advance(nBits) - bitsLeft -= nBits - dst = append(dst, uint8(v.entry>>8)) +func (d *Decoder) buffer() *[4][256]byte { + buf, ok := d.bufs.Get().(*[4][256]byte) + if ok { + return buf } - return dst, br.close() + return &[4][256]byte{} } // decompress1X8Bit will decompress a 1X encoded stream with tablelog <= 8. @@ -341,7 +247,8 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) { dt := d.dt.single[:256] // Use temp table to avoid bound checks/append penalty. 
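The `bufs`/`buffer()` plumbing above pulls the per-decode scratch arrays from a `sync.Pool` instead of keeping them on the stack. A minimal sketch of that reuse pattern in isolation (the names are ours):

```go
import "sync"

// scratchPool hands out reusable 4x256-byte decode buffers.
var scratchPool sync.Pool

// getScratch mirrors the buffer() helper above: reuse a pooled array if one
// is available, otherwise allocate a fresh one.
func getScratch() *[4][256]byte {
	if buf, ok := scratchPool.Get().(*[4][256]byte); ok {
		return buf
	}
	return &[4][256]byte{}
}

// putScratch returns the array so later decodes can reuse the allocation.
func putScratch(buf *[4][256]byte) {
	scratchPool.Put(buf)
}
```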
- var buf [256]byte + bufs := d.buffer() + buf := &bufs[0] var off uint8 switch d.actualTableLog { @@ -369,6 +276,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) { if off == 0 { if len(dst)+256 > maxDecodedSize { br.close() + d.bufs.Put(bufs) return nil, ErrMaxDecodedSizeExceeded } dst = append(dst, buf[:]...) @@ -398,6 +306,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) { if off == 0 { if len(dst)+256 > maxDecodedSize { br.close() + d.bufs.Put(bufs) return nil, ErrMaxDecodedSizeExceeded } dst = append(dst, buf[:]...) @@ -426,6 +335,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) { off += 4 if off == 0 { if len(dst)+256 > maxDecodedSize { + d.bufs.Put(bufs) br.close() return nil, ErrMaxDecodedSizeExceeded } @@ -455,6 +365,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) { off += 4 if off == 0 { if len(dst)+256 > maxDecodedSize { + d.bufs.Put(bufs) br.close() return nil, ErrMaxDecodedSizeExceeded } @@ -484,6 +395,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) { off += 4 if off == 0 { if len(dst)+256 > maxDecodedSize { + d.bufs.Put(bufs) br.close() return nil, ErrMaxDecodedSizeExceeded } @@ -513,6 +425,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) { off += 4 if off == 0 { if len(dst)+256 > maxDecodedSize { + d.bufs.Put(bufs) br.close() return nil, ErrMaxDecodedSizeExceeded } @@ -542,6 +455,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) { off += 4 if off == 0 { if len(dst)+256 > maxDecodedSize { + d.bufs.Put(bufs) br.close() return nil, ErrMaxDecodedSizeExceeded } @@ -571,6 +485,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) { off += 4 if off == 0 { if len(dst)+256 > maxDecodedSize { + d.bufs.Put(bufs) br.close() return nil, ErrMaxDecodedSizeExceeded } @@ -578,10 +493,12 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) { } } default: + d.bufs.Put(bufs) return nil, fmt.Errorf("invalid tablelog: %d", d.actualTableLog) } if len(dst)+int(off) > maxDecodedSize { + d.bufs.Put(bufs) br.close() return nil, ErrMaxDecodedSizeExceeded } @@ -601,6 +518,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) { } if len(dst) >= maxDecodedSize { br.close() + d.bufs.Put(bufs) return nil, ErrMaxDecodedSizeExceeded } v := dt[br.peekByteFast()>>shift] @@ -609,6 +527,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) { bitsLeft -= int8(nBits) dst = append(dst, uint8(v.entry>>8)) } + d.bufs.Put(bufs) return dst, br.close() } @@ -628,7 +547,8 @@ func (d *Decoder) decompress1X8BitExactly(dst, src []byte) ([]byte, error) { dt := d.dt.single[:256] // Use temp table to avoid bound checks/append penalty. 
- var buf [256]byte + bufs := d.buffer() + buf := &bufs[0] var off uint8 const shift = 56 @@ -655,6 +575,7 @@ func (d *Decoder) decompress1X8BitExactly(dst, src []byte) ([]byte, error) { off += 4 if off == 0 { if len(dst)+256 > maxDecodedSize { + d.bufs.Put(bufs) br.close() return nil, ErrMaxDecodedSizeExceeded } @@ -663,6 +584,7 @@ func (d *Decoder) decompress1X8BitExactly(dst, src []byte) ([]byte, error) { } if len(dst)+int(off) > maxDecodedSize { + d.bufs.Put(bufs) br.close() return nil, ErrMaxDecodedSizeExceeded } @@ -679,6 +601,7 @@ func (d *Decoder) decompress1X8BitExactly(dst, src []byte) ([]byte, error) { } } if len(dst) >= maxDecodedSize { + d.bufs.Put(bufs) br.close() return nil, ErrMaxDecodedSizeExceeded } @@ -688,195 +611,10 @@ func (d *Decoder) decompress1X8BitExactly(dst, src []byte) ([]byte, error) { bitsLeft -= int8(nBits) dst = append(dst, uint8(v.entry>>8)) } + d.bufs.Put(bufs) return dst, br.close() } -// Decompress4X will decompress a 4X encoded stream. -// The length of the supplied input must match the end of a block exactly. -// The *capacity* of the dst slice must match the destination size of -// the uncompressed data exactly. -func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) { - if len(d.dt.single) == 0 { - return nil, errors.New("no table loaded") - } - if len(src) < 6+(4*1) { - return nil, errors.New("input too small") - } - if use8BitTables && d.actualTableLog <= 8 { - return d.decompress4X8bit(dst, src) - } - - var br [4]bitReaderShifted - start := 6 - for i := 0; i < 3; i++ { - length := int(src[i*2]) | (int(src[i*2+1]) << 8) - if start+length >= len(src) { - return nil, errors.New("truncated input (or invalid offset)") - } - err := br[i].init(src[start : start+length]) - if err != nil { - return nil, err - } - start += length - } - err := br[3].init(src[start:]) - if err != nil { - return nil, err - } - - // destination, offset to match first output - dstSize := cap(dst) - dst = dst[:dstSize] - out := dst - dstEvery := (dstSize + 3) / 4 - - const tlSize = 1 << tableLogMax - const tlMask = tlSize - 1 - single := d.dt.single[:tlSize] - - // Use temp table to avoid bound checks/append penalty. - var buf [256]byte - var off uint8 - var decoded int - - // Decode 2 values from each decoder/loop. 
- const bufoff = 256 / 4 - for { - if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 { - break - } - - { - const stream = 0 - const stream2 = 1 - br[stream].fillFast() - br[stream2].fillFast() - - val := br[stream].peekBitsFast(d.actualTableLog) - val2 := br[stream2].peekBitsFast(d.actualTableLog) - v := single[val&tlMask] - v2 := single[val2&tlMask] - br[stream].advance(uint8(v.entry)) - br[stream2].advance(uint8(v2.entry)) - buf[off+bufoff*stream] = uint8(v.entry >> 8) - buf[off+bufoff*stream2] = uint8(v2.entry >> 8) - - val = br[stream].peekBitsFast(d.actualTableLog) - val2 = br[stream2].peekBitsFast(d.actualTableLog) - v = single[val&tlMask] - v2 = single[val2&tlMask] - br[stream].advance(uint8(v.entry)) - br[stream2].advance(uint8(v2.entry)) - buf[off+bufoff*stream+1] = uint8(v.entry >> 8) - buf[off+bufoff*stream2+1] = uint8(v2.entry >> 8) - } - - { - const stream = 2 - const stream2 = 3 - br[stream].fillFast() - br[stream2].fillFast() - - val := br[stream].peekBitsFast(d.actualTableLog) - val2 := br[stream2].peekBitsFast(d.actualTableLog) - v := single[val&tlMask] - v2 := single[val2&tlMask] - br[stream].advance(uint8(v.entry)) - br[stream2].advance(uint8(v2.entry)) - buf[off+bufoff*stream] = uint8(v.entry >> 8) - buf[off+bufoff*stream2] = uint8(v2.entry >> 8) - - val = br[stream].peekBitsFast(d.actualTableLog) - val2 = br[stream2].peekBitsFast(d.actualTableLog) - v = single[val&tlMask] - v2 = single[val2&tlMask] - br[stream].advance(uint8(v.entry)) - br[stream2].advance(uint8(v2.entry)) - buf[off+bufoff*stream+1] = uint8(v.entry >> 8) - buf[off+bufoff*stream2+1] = uint8(v2.entry >> 8) - } - - off += 2 - - if off == bufoff { - if bufoff > dstEvery { - return nil, errors.New("corruption detected: stream overrun 1") - } - copy(out, buf[:bufoff]) - copy(out[dstEvery:], buf[bufoff:bufoff*2]) - copy(out[dstEvery*2:], buf[bufoff*2:bufoff*3]) - copy(out[dstEvery*3:], buf[bufoff*3:bufoff*4]) - off = 0 - out = out[bufoff:] - decoded += 256 - // There must at least be 3 buffers left. - if len(out) < dstEvery*3 { - return nil, errors.New("corruption detected: stream overrun 2") - } - } - } - if off > 0 { - ioff := int(off) - if len(out) < dstEvery*3+ioff { - return nil, errors.New("corruption detected: stream overrun 3") - } - copy(out, buf[:off]) - copy(out[dstEvery:dstEvery+ioff], buf[bufoff:bufoff*2]) - copy(out[dstEvery*2:dstEvery*2+ioff], buf[bufoff*2:bufoff*3]) - copy(out[dstEvery*3:dstEvery*3+ioff], buf[bufoff*3:bufoff*4]) - decoded += int(off) * 4 - out = out[off:] - } - - // Decode remaining. - for i := range br { - offset := dstEvery * i - br := &br[i] - bitsLeft := br.off*8 + uint(64-br.bitsRead) - for bitsLeft > 0 { - br.fill() - if false && br.bitsRead >= 32 { - if br.off >= 4 { - v := br.in[br.off-4:] - v = v[:4] - low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24) - br.value = (br.value << 32) | uint64(low) - br.bitsRead -= 32 - br.off -= 4 - } else { - for br.off > 0 { - br.value = (br.value << 8) | uint64(br.in[br.off-1]) - br.bitsRead -= 8 - br.off-- - } - } - } - // end inline... - if offset >= len(out) { - return nil, errors.New("corruption detected: stream overrun 4") - } - - // Read value and increment offset. 
- val := br.peekBitsFast(d.actualTableLog) - v := single[val&tlMask].entry - nBits := uint8(v) - br.advance(nBits) - bitsLeft -= uint(nBits) - out[offset] = uint8(v >> 8) - offset++ - } - decoded += offset - dstEvery*i - err = br.close() - if err != nil { - return nil, err - } - } - if dstSize != decoded { - return nil, errors.New("corruption detected: short output block") - } - return dst, nil -} - // Decompress4X will decompress a 4X encoded stream. // The length of the supplied input must match the end of a block exactly. // The *capacity* of the dst slice must match the destination size of @@ -916,12 +654,12 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) { single := d.dt.single[:tlSize] // Use temp table to avoid bound checks/append penalty. - var buf [256]byte + buf := d.buffer() var off uint8 var decoded int // Decode 4 values from each decoder/loop. - const bufoff = 256 / 4 + const bufoff = 256 for { if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 { break @@ -942,8 +680,8 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) { br1.value <<= v & 63 br2.bitsRead += uint8(v2) br2.value <<= v2 & 63 - buf[off+bufoff*stream] = uint8(v >> 8) - buf[off+bufoff*stream2] = uint8(v2 >> 8) + buf[stream][off] = uint8(v >> 8) + buf[stream2][off] = uint8(v2 >> 8) v = single[uint8(br1.value>>shift)].entry v2 = single[uint8(br2.value>>shift)].entry @@ -951,8 +689,8 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) { br1.value <<= v & 63 br2.bitsRead += uint8(v2) br2.value <<= v2 & 63 - buf[off+bufoff*stream+1] = uint8(v >> 8) - buf[off+bufoff*stream2+1] = uint8(v2 >> 8) + buf[stream][off+1] = uint8(v >> 8) + buf[stream2][off+1] = uint8(v2 >> 8) v = single[uint8(br1.value>>shift)].entry v2 = single[uint8(br2.value>>shift)].entry @@ -960,8 +698,8 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) { br1.value <<= v & 63 br2.bitsRead += uint8(v2) br2.value <<= v2 & 63 - buf[off+bufoff*stream+2] = uint8(v >> 8) - buf[off+bufoff*stream2+2] = uint8(v2 >> 8) + buf[stream][off+2] = uint8(v >> 8) + buf[stream2][off+2] = uint8(v2 >> 8) v = single[uint8(br1.value>>shift)].entry v2 = single[uint8(br2.value>>shift)].entry @@ -969,8 +707,8 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) { br1.value <<= v & 63 br2.bitsRead += uint8(v2) br2.value <<= v2 & 63 - buf[off+bufoff*stream2+3] = uint8(v2 >> 8) - buf[off+bufoff*stream+3] = uint8(v >> 8) + buf[stream][off+3] = uint8(v >> 8) + buf[stream2][off+3] = uint8(v2 >> 8) } { @@ -987,8 +725,8 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) { br1.value <<= v & 63 br2.bitsRead += uint8(v2) br2.value <<= v2 & 63 - buf[off+bufoff*stream] = uint8(v >> 8) - buf[off+bufoff*stream2] = uint8(v2 >> 8) + buf[stream][off] = uint8(v >> 8) + buf[stream2][off] = uint8(v2 >> 8) v = single[uint8(br1.value>>shift)].entry v2 = single[uint8(br2.value>>shift)].entry @@ -996,8 +734,8 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) { br1.value <<= v & 63 br2.bitsRead += uint8(v2) br2.value <<= v2 & 63 - buf[off+bufoff*stream+1] = uint8(v >> 8) - buf[off+bufoff*stream2+1] = uint8(v2 >> 8) + buf[stream][off+1] = uint8(v >> 8) + buf[stream2][off+1] = uint8(v2 >> 8) v = single[uint8(br1.value>>shift)].entry v2 = single[uint8(br2.value>>shift)].entry @@ -1005,8 +743,8 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) { br1.value <<= v & 63 br2.bitsRead += uint8(v2) br2.value <<= v2 & 63 - 
buf[off+bufoff*stream+2] = uint8(v >> 8) - buf[off+bufoff*stream2+2] = uint8(v2 >> 8) + buf[stream][off+2] = uint8(v >> 8) + buf[stream2][off+2] = uint8(v2 >> 8) v = single[uint8(br1.value>>shift)].entry v2 = single[uint8(br2.value>>shift)].entry @@ -1014,49 +752,61 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) { br1.value <<= v & 63 br2.bitsRead += uint8(v2) br2.value <<= v2 & 63 - buf[off+bufoff*stream2+3] = uint8(v2 >> 8) - buf[off+bufoff*stream+3] = uint8(v >> 8) + buf[stream][off+3] = uint8(v >> 8) + buf[stream2][off+3] = uint8(v2 >> 8) } off += 4 - if off == bufoff { + if off == 0 { if bufoff > dstEvery { + d.bufs.Put(buf) return nil, errors.New("corruption detected: stream overrun 1") } - copy(out, buf[:bufoff]) - copy(out[dstEvery:], buf[bufoff:bufoff*2]) - copy(out[dstEvery*2:], buf[bufoff*2:bufoff*3]) - copy(out[dstEvery*3:], buf[bufoff*3:bufoff*4]) - off = 0 - out = out[bufoff:] - decoded += 256 // There must at least be 3 buffers left. - if len(out) < dstEvery*3 { + if len(out)-bufoff < dstEvery*3 { + d.bufs.Put(buf) return nil, errors.New("corruption detected: stream overrun 2") } + //copy(out, buf[0][:]) + //copy(out[dstEvery:], buf[1][:]) + //copy(out[dstEvery*2:], buf[2][:]) + *(*[bufoff]byte)(out) = buf[0] + *(*[bufoff]byte)(out[dstEvery:]) = buf[1] + *(*[bufoff]byte)(out[dstEvery*2:]) = buf[2] + *(*[bufoff]byte)(out[dstEvery*3:]) = buf[3] + out = out[bufoff:] + decoded += bufoff * 4 } } if off > 0 { ioff := int(off) if len(out) < dstEvery*3+ioff { + d.bufs.Put(buf) return nil, errors.New("corruption detected: stream overrun 3") } - copy(out, buf[:off]) - copy(out[dstEvery:dstEvery+ioff], buf[bufoff:bufoff*2]) - copy(out[dstEvery*2:dstEvery*2+ioff], buf[bufoff*2:bufoff*3]) - copy(out[dstEvery*3:dstEvery*3+ioff], buf[bufoff*3:bufoff*4]) + copy(out, buf[0][:off]) + copy(out[dstEvery:], buf[1][:off]) + copy(out[dstEvery*2:], buf[2][:off]) + copy(out[dstEvery*3:], buf[3][:off]) decoded += int(off) * 4 out = out[off:] } // Decode remaining. + // Decode remaining. + remainBytes := dstEvery - (decoded / 4) for i := range br { offset := dstEvery * i + endsAt := offset + remainBytes + if endsAt > len(out) { + endsAt = len(out) + } br := &br[i] - bitsLeft := int(br.off*8) + int(64-br.bitsRead) + bitsLeft := br.remaining() for bitsLeft > 0 { if br.finished() { + d.bufs.Put(buf) return nil, io.ErrUnexpectedEOF } if br.bitsRead >= 56 { @@ -1076,7 +826,8 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) { } } // end inline... - if offset >= len(out) { + if offset >= endsAt { + d.bufs.Put(buf) return nil, errors.New("corruption detected: stream overrun 4") } @@ -1084,16 +835,22 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) { v := single[uint8(br.value>>shift)].entry nBits := uint8(v) br.advance(nBits) - bitsLeft -= int(nBits) + bitsLeft -= uint(nBits) out[offset] = uint8(v >> 8) offset++ } + if offset != endsAt { + d.bufs.Put(buf) + return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt) + } decoded += offset - dstEvery*i err = br.close() if err != nil { + d.bufs.Put(buf) return nil, err } } + d.bufs.Put(buf) if dstSize != decoded { return nil, errors.New("corruption detected: short output block") } @@ -1131,16 +888,15 @@ func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) { const shift = 56 const tlSize = 1 << 8 - const tlMask = tlSize - 1 single := d.dt.single[:tlSize] // Use temp table to avoid bound checks/append penalty. 
- var buf [256]byte + buf := d.buffer() var off uint8 var decoded int // Decode 4 values from each decoder/loop. - const bufoff = 256 / 4 + const bufoff = 256 for { if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 { break @@ -1150,106 +906,116 @@ func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) { // Interleave 2 decodes. const stream = 0 const stream2 = 1 - br[stream].fillFast() - br[stream2].fillFast() - - v := single[uint8(br[stream].value>>shift)].entry - v2 := single[uint8(br[stream2].value>>shift)].entry - br[stream].bitsRead += uint8(v) - br[stream].value <<= v & 63 - br[stream2].bitsRead += uint8(v2) - br[stream2].value <<= v2 & 63 - buf[off+bufoff*stream] = uint8(v >> 8) - buf[off+bufoff*stream2] = uint8(v2 >> 8) - - v = single[uint8(br[stream].value>>shift)].entry - v2 = single[uint8(br[stream2].value>>shift)].entry - br[stream].bitsRead += uint8(v) - br[stream].value <<= v & 63 - br[stream2].bitsRead += uint8(v2) - br[stream2].value <<= v2 & 63 - buf[off+bufoff*stream+1] = uint8(v >> 8) - buf[off+bufoff*stream2+1] = uint8(v2 >> 8) - - v = single[uint8(br[stream].value>>shift)].entry - v2 = single[uint8(br[stream2].value>>shift)].entry - br[stream].bitsRead += uint8(v) - br[stream].value <<= v & 63 - br[stream2].bitsRead += uint8(v2) - br[stream2].value <<= v2 & 63 - buf[off+bufoff*stream+2] = uint8(v >> 8) - buf[off+bufoff*stream2+2] = uint8(v2 >> 8) - - v = single[uint8(br[stream].value>>shift)].entry - v2 = single[uint8(br[stream2].value>>shift)].entry - br[stream].bitsRead += uint8(v) - br[stream].value <<= v & 63 - br[stream2].bitsRead += uint8(v2) - br[stream2].value <<= v2 & 63 - buf[off+bufoff*stream+3] = uint8(v >> 8) - buf[off+bufoff*stream2+3] = uint8(v2 >> 8) + br1 := &br[stream] + br2 := &br[stream2] + br1.fillFast() + br2.fillFast() + + v := single[uint8(br1.value>>shift)].entry + v2 := single[uint8(br2.value>>shift)].entry + br1.bitsRead += uint8(v) + br1.value <<= v & 63 + br2.bitsRead += uint8(v2) + br2.value <<= v2 & 63 + buf[stream][off] = uint8(v >> 8) + buf[stream2][off] = uint8(v2 >> 8) + + v = single[uint8(br1.value>>shift)].entry + v2 = single[uint8(br2.value>>shift)].entry + br1.bitsRead += uint8(v) + br1.value <<= v & 63 + br2.bitsRead += uint8(v2) + br2.value <<= v2 & 63 + buf[stream][off+1] = uint8(v >> 8) + buf[stream2][off+1] = uint8(v2 >> 8) + + v = single[uint8(br1.value>>shift)].entry + v2 = single[uint8(br2.value>>shift)].entry + br1.bitsRead += uint8(v) + br1.value <<= v & 63 + br2.bitsRead += uint8(v2) + br2.value <<= v2 & 63 + buf[stream][off+2] = uint8(v >> 8) + buf[stream2][off+2] = uint8(v2 >> 8) + + v = single[uint8(br1.value>>shift)].entry + v2 = single[uint8(br2.value>>shift)].entry + br1.bitsRead += uint8(v) + br1.value <<= v & 63 + br2.bitsRead += uint8(v2) + br2.value <<= v2 & 63 + buf[stream][off+3] = uint8(v >> 8) + buf[stream2][off+3] = uint8(v2 >> 8) } { const stream = 2 const stream2 = 3 - br[stream].fillFast() - br[stream2].fillFast() - - v := single[uint8(br[stream].value>>shift)].entry - v2 := single[uint8(br[stream2].value>>shift)].entry - br[stream].bitsRead += uint8(v) - br[stream].value <<= v & 63 - br[stream2].bitsRead += uint8(v2) - br[stream2].value <<= v2 & 63 - buf[off+bufoff*stream] = uint8(v >> 8) - buf[off+bufoff*stream2] = uint8(v2 >> 8) - - v = single[uint8(br[stream].value>>shift)].entry - v2 = single[uint8(br[stream2].value>>shift)].entry - br[stream].bitsRead += uint8(v) - br[stream].value <<= v & 63 - br[stream2].bitsRead += uint8(v2) - br[stream2].value <<= v2 & 63 
- buf[off+bufoff*stream+1] = uint8(v >> 8) - buf[off+bufoff*stream2+1] = uint8(v2 >> 8) - - v = single[uint8(br[stream].value>>shift)].entry - v2 = single[uint8(br[stream2].value>>shift)].entry - br[stream].bitsRead += uint8(v) - br[stream].value <<= v & 63 - br[stream2].bitsRead += uint8(v2) - br[stream2].value <<= v2 & 63 - buf[off+bufoff*stream+2] = uint8(v >> 8) - buf[off+bufoff*stream2+2] = uint8(v2 >> 8) - - v = single[uint8(br[stream].value>>shift)].entry - v2 = single[uint8(br[stream2].value>>shift)].entry - br[stream].bitsRead += uint8(v) - br[stream].value <<= v & 63 - br[stream2].bitsRead += uint8(v2) - br[stream2].value <<= v2 & 63 - buf[off+bufoff*stream+3] = uint8(v >> 8) - buf[off+bufoff*stream2+3] = uint8(v2 >> 8) + br1 := &br[stream] + br2 := &br[stream2] + br1.fillFast() + br2.fillFast() + + v := single[uint8(br1.value>>shift)].entry + v2 := single[uint8(br2.value>>shift)].entry + br1.bitsRead += uint8(v) + br1.value <<= v & 63 + br2.bitsRead += uint8(v2) + br2.value <<= v2 & 63 + buf[stream][off] = uint8(v >> 8) + buf[stream2][off] = uint8(v2 >> 8) + + v = single[uint8(br1.value>>shift)].entry + v2 = single[uint8(br2.value>>shift)].entry + br1.bitsRead += uint8(v) + br1.value <<= v & 63 + br2.bitsRead += uint8(v2) + br2.value <<= v2 & 63 + buf[stream][off+1] = uint8(v >> 8) + buf[stream2][off+1] = uint8(v2 >> 8) + + v = single[uint8(br1.value>>shift)].entry + v2 = single[uint8(br2.value>>shift)].entry + br1.bitsRead += uint8(v) + br1.value <<= v & 63 + br2.bitsRead += uint8(v2) + br2.value <<= v2 & 63 + buf[stream][off+2] = uint8(v >> 8) + buf[stream2][off+2] = uint8(v2 >> 8) + + v = single[uint8(br1.value>>shift)].entry + v2 = single[uint8(br2.value>>shift)].entry + br1.bitsRead += uint8(v) + br1.value <<= v & 63 + br2.bitsRead += uint8(v2) + br2.value <<= v2 & 63 + buf[stream][off+3] = uint8(v >> 8) + buf[stream2][off+3] = uint8(v2 >> 8) } off += 4 - if off == bufoff { + if off == 0 { if bufoff > dstEvery { + d.bufs.Put(buf) return nil, errors.New("corruption detected: stream overrun 1") } - copy(out, buf[:bufoff]) - copy(out[dstEvery:], buf[bufoff:bufoff*2]) - copy(out[dstEvery*2:], buf[bufoff*2:bufoff*3]) - copy(out[dstEvery*3:], buf[bufoff*3:bufoff*4]) - off = 0 - out = out[bufoff:] - decoded += 256 // There must at least be 3 buffers left. - if len(out) < dstEvery*3 { + if len(out)-bufoff < dstEvery*3 { + d.bufs.Put(buf) return nil, errors.New("corruption detected: stream overrun 2") } + + //copy(out, buf[0][:]) + //copy(out[dstEvery:], buf[1][:]) + //copy(out[dstEvery*2:], buf[2][:]) + // copy(out[dstEvery*3:], buf[3][:]) + *(*[bufoff]byte)(out) = buf[0] + *(*[bufoff]byte)(out[dstEvery:]) = buf[1] + *(*[bufoff]byte)(out[dstEvery*2:]) = buf[2] + *(*[bufoff]byte)(out[dstEvery*3:]) = buf[3] + out = out[bufoff:] + decoded += bufoff * 4 } } if off > 0 { @@ -1257,21 +1023,27 @@ func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) { if len(out) < dstEvery*3+ioff { return nil, errors.New("corruption detected: stream overrun 3") } - copy(out, buf[:off]) - copy(out[dstEvery:dstEvery+ioff], buf[bufoff:bufoff*2]) - copy(out[dstEvery*2:dstEvery*2+ioff], buf[bufoff*2:bufoff*3]) - copy(out[dstEvery*3:dstEvery*3+ioff], buf[bufoff*3:bufoff*4]) + copy(out, buf[0][:off]) + copy(out[dstEvery:], buf[1][:off]) + copy(out[dstEvery*2:], buf[2][:off]) + copy(out[dstEvery*3:], buf[3][:off]) decoded += int(off) * 4 out = out[off:] } // Decode remaining. 
+ remainBytes := dstEvery - (decoded / 4) for i := range br { offset := dstEvery * i + endsAt := offset + remainBytes + if endsAt > len(out) { + endsAt = len(out) + } br := &br[i] - bitsLeft := int(br.off*8) + int(64-br.bitsRead) + bitsLeft := br.remaining() for bitsLeft > 0 { if br.finished() { + d.bufs.Put(buf) return nil, io.ErrUnexpectedEOF } if br.bitsRead >= 56 { @@ -1291,7 +1063,8 @@ func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) { } } // end inline... - if offset >= len(out) { + if offset >= endsAt { + d.bufs.Put(buf) return nil, errors.New("corruption detected: stream overrun 4") } @@ -1299,16 +1072,23 @@ func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) { v := single[br.peekByteFast()].entry nBits := uint8(v) br.advance(nBits) - bitsLeft -= int(nBits) + bitsLeft -= uint(nBits) out[offset] = uint8(v >> 8) offset++ } + if offset != endsAt { + d.bufs.Put(buf) + return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt) + } + decoded += offset - dstEvery*i err = br.close() if err != nil { + d.bufs.Put(buf) return nil, err } } + d.bufs.Put(buf) if dstSize != decoded { return nil, errors.New("corruption detected: short output block") } diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go new file mode 100644 index 000000000000..ba7e8e6b0276 --- /dev/null +++ b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go @@ -0,0 +1,226 @@ +//go:build amd64 && !appengine && !noasm && gc +// +build amd64,!appengine,!noasm,gc + +// This file contains the specialisation of Decoder.Decompress4X +// and Decoder.Decompress1X that use an asm implementation of thir main loops. +package huff0 + +import ( + "errors" + "fmt" + + "github.com/klauspost/compress/internal/cpuinfo" +) + +// decompress4x_main_loop_x86 is an x86 assembler implementation +// of Decompress4X when tablelog > 8. +// +//go:noescape +func decompress4x_main_loop_amd64(ctx *decompress4xContext) + +// decompress4x_8b_loop_x86 is an x86 assembler implementation +// of Decompress4X when tablelog <= 8 which decodes 4 entries +// per loop. +// +//go:noescape +func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext) + +// fallback8BitSize is the size where using Go version is faster. +const fallback8BitSize = 800 + +type decompress4xContext struct { + pbr *[4]bitReaderShifted + peekBits uint8 + out *byte + dstEvery int + tbl *dEntrySingle + decoded int + limit *byte +} + +// Decompress4X will decompress a 4X encoded stream. +// The length of the supplied input must match the end of a block exactly. +// The *capacity* of the dst slice must match the destination size of +// the uncompressed data exactly. 
+func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) { + if len(d.dt.single) == 0 { + return nil, errors.New("no table loaded") + } + if len(src) < 6+(4*1) { + return nil, errors.New("input too small") + } + + use8BitTables := d.actualTableLog <= 8 + if cap(dst) < fallback8BitSize && use8BitTables { + return d.decompress4X8bit(dst, src) + } + + var br [4]bitReaderShifted + // Decode "jump table" + start := 6 + for i := 0; i < 3; i++ { + length := int(src[i*2]) | (int(src[i*2+1]) << 8) + if start+length >= len(src) { + return nil, errors.New("truncated input (or invalid offset)") + } + err := br[i].init(src[start : start+length]) + if err != nil { + return nil, err + } + start += length + } + err := br[3].init(src[start:]) + if err != nil { + return nil, err + } + + // destination, offset to match first output + dstSize := cap(dst) + dst = dst[:dstSize] + out := dst + dstEvery := (dstSize + 3) / 4 + + const tlSize = 1 << tableLogMax + const tlMask = tlSize - 1 + single := d.dt.single[:tlSize] + + var decoded int + + if len(out) > 4*4 && !(br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4) { + ctx := decompress4xContext{ + pbr: &br, + peekBits: uint8((64 - d.actualTableLog) & 63), // see: bitReaderShifted.peekBitsFast() + out: &out[0], + dstEvery: dstEvery, + tbl: &single[0], + limit: &out[dstEvery-4], // Always stop decoding when first buffer gets here to avoid writing OOB on last. + } + if use8BitTables { + decompress4x_8b_main_loop_amd64(&ctx) + } else { + decompress4x_main_loop_amd64(&ctx) + } + + decoded = ctx.decoded + out = out[decoded/4:] + } + + // Decode remaining. + remainBytes := dstEvery - (decoded / 4) + for i := range br { + offset := dstEvery * i + endsAt := offset + remainBytes + if endsAt > len(out) { + endsAt = len(out) + } + br := &br[i] + bitsLeft := br.remaining() + for bitsLeft > 0 { + br.fill() + if offset >= endsAt { + return nil, errors.New("corruption detected: stream overrun 4") + } + + // Read value and increment offset. + val := br.peekBitsFast(d.actualTableLog) + v := single[val&tlMask].entry + nBits := uint8(v) + br.advance(nBits) + bitsLeft -= uint(nBits) + out[offset] = uint8(v >> 8) + offset++ + } + if offset != endsAt { + return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt) + } + decoded += offset - dstEvery*i + err = br.close() + if err != nil { + return nil, err + } + } + if dstSize != decoded { + return nil, errors.New("corruption detected: short output block") + } + return dst, nil +} + +// decompress4x_main_loop_x86 is an x86 assembler implementation +// of Decompress1X when tablelog > 8. +// +//go:noescape +func decompress1x_main_loop_amd64(ctx *decompress1xContext) + +// decompress4x_main_loop_x86 is an x86 with BMI2 assembler implementation +// of Decompress1X when tablelog > 8. +// +//go:noescape +func decompress1x_main_loop_bmi2(ctx *decompress1xContext) + +type decompress1xContext struct { + pbr *bitReaderShifted + peekBits uint8 + out *byte + outCap int + tbl *dEntrySingle + decoded int +} + +// Error reported by asm implementations +const error_max_decoded_size_exeeded = -1 + +// Decompress1X will decompress a 1X encoded stream. +// The cap of the output buffer will be the maximum decompressed size. +// The length of the supplied input must match the end of a block exactly. 
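Per the doc comments above, the decoders size their output from the capacity of `dst`. A rough usage sketch under that assumption; `comp` and `origSize` are hypothetical inputs the caller is assumed to have (for example from `Compress4X` plus an out-of-band length):

```go
import "github.com/klauspost/compress/huff0"

// decode4X decompresses a 4X huff0 stream. comp holds the serialized table
// followed by the encoded payload; origSize is the exact uncompressed size.
func decode4X(comp []byte, origSize int) ([]byte, error) {
	s, remain, err := huff0.ReadTable(comp, nil) // nil Scratch: one is allocated
	if err != nil {
		return nil, err
	}
	// Decompress4X reads the target size from the *capacity* of dst.
	dst := make([]byte, 0, origSize)
	return s.Decoder().Decompress4X(dst, remain)
}
```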
+func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) { + if len(d.dt.single) == 0 { + return nil, errors.New("no table loaded") + } + var br bitReaderShifted + err := br.init(src) + if err != nil { + return dst, err + } + maxDecodedSize := cap(dst) + dst = dst[:maxDecodedSize] + + const tlSize = 1 << tableLogMax + const tlMask = tlSize - 1 + + if maxDecodedSize >= 4 { + ctx := decompress1xContext{ + pbr: &br, + out: &dst[0], + outCap: maxDecodedSize, + peekBits: uint8((64 - d.actualTableLog) & 63), // see: bitReaderShifted.peekBitsFast() + tbl: &d.dt.single[0], + } + + if cpuinfo.HasBMI2() { + decompress1x_main_loop_bmi2(&ctx) + } else { + decompress1x_main_loop_amd64(&ctx) + } + if ctx.decoded == error_max_decoded_size_exeeded { + return nil, ErrMaxDecodedSizeExceeded + } + + dst = dst[:ctx.decoded] + } + + // br < 8, so uint8 is fine + bitsLeft := uint8(br.off)*8 + 64 - br.bitsRead + for bitsLeft > 0 { + br.fill() + if len(dst) >= maxDecodedSize { + br.close() + return nil, ErrMaxDecodedSizeExceeded + } + v := d.dt.single[br.peekBitsFast(d.actualTableLog)&tlMask] + nBits := uint8(v.entry) + br.advance(nBits) + bitsLeft -= nBits + dst = append(dst, uint8(v.entry>>8)) + } + return dst, br.close() +} diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s new file mode 100644 index 000000000000..8d2187a2ce6a --- /dev/null +++ b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s @@ -0,0 +1,846 @@ +// Code generated by command: go run gen.go -out ../decompress_amd64.s -pkg=huff0. DO NOT EDIT. + +//go:build amd64 && !appengine && !noasm && gc + +// func decompress4x_main_loop_amd64(ctx *decompress4xContext) +TEXT ·decompress4x_main_loop_amd64(SB), $0-8 + XORQ DX, DX + + // Preload values + MOVQ ctx+0(FP), AX + MOVBQZX 8(AX), DI + MOVQ 16(AX), SI + MOVQ 48(AX), BX + MOVQ 24(AX), R9 + MOVQ 32(AX), R10 + MOVQ (AX), R11 + + // Main loop +main_loop: + MOVQ SI, R8 + CMPQ R8, BX + SETGE DL + + // br0.fillFast32() + MOVQ 32(R11), R12 + MOVBQZX 40(R11), R13 + CMPQ R13, $0x20 + JBE skip_fill0 + MOVQ 24(R11), AX + SUBQ $0x20, R13 + SUBQ $0x04, AX + MOVQ (R11), R14 + + // b.value |= uint64(low) << (b.bitsRead & 63) + MOVL (AX)(R14*1), R14 + MOVQ R13, CX + SHLQ CL, R14 + MOVQ AX, 24(R11) + ORQ R14, R12 + + // exhausted = exhausted || (br0.off < 4) + CMPQ AX, $0x04 + SETLT AL + ORB AL, DL + +skip_fill0: + // val0 := br0.peekTopBits(peekBits) + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 + + // v0 := table[val0&mask] + MOVW (R10)(R14*2), CX + + // br0.advance(uint8(v0.entry) + MOVB CH, AL + SHLQ CL, R12 + ADDB CL, R13 + + // val1 := br0.peekTopBits(peekBits) + MOVQ DI, CX + MOVQ R12, R14 + SHRQ CL, R14 + + // v1 := table[val1&mask] + MOVW (R10)(R14*2), CX + + // br0.advance(uint8(v1.entry)) + MOVB CH, AH + SHLQ CL, R12 + ADDB CL, R13 + + // these two writes get coalesced + // out[id * dstEvery + 0] = uint8(v0.entry >> 8) + // out[id * dstEvery + 1] = uint8(v1.entry >> 8) + MOVW AX, (R8) + + // update the bitreader structure + MOVQ R12, 32(R11) + MOVB R13, 40(R11) + ADDQ R9, R8 + + // br1.fillFast32() + MOVQ 80(R11), R12 + MOVBQZX 88(R11), R13 + CMPQ R13, $0x20 + JBE skip_fill1 + MOVQ 72(R11), AX + SUBQ $0x20, R13 + SUBQ $0x04, AX + MOVQ 48(R11), R14 + + // b.value |= uint64(low) << (b.bitsRead & 63) + MOVL (AX)(R14*1), R14 + MOVQ R13, CX + SHLQ CL, R14 + MOVQ AX, 72(R11) + ORQ R14, R12 + + // exhausted = exhausted || (br1.off < 4) + CMPQ AX, $0x04 + SETLT AL + ORB AL, DL + +skip_fill1: + // val0 := 
br1.peekTopBits(peekBits) + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 + + // v0 := table[val0&mask] + MOVW (R10)(R14*2), CX + + // br1.advance(uint8(v0.entry) + MOVB CH, AL + SHLQ CL, R12 + ADDB CL, R13 + + // val1 := br1.peekTopBits(peekBits) + MOVQ DI, CX + MOVQ R12, R14 + SHRQ CL, R14 + + // v1 := table[val1&mask] + MOVW (R10)(R14*2), CX + + // br1.advance(uint8(v1.entry)) + MOVB CH, AH + SHLQ CL, R12 + ADDB CL, R13 + + // these two writes get coalesced + // out[id * dstEvery + 0] = uint8(v0.entry >> 8) + // out[id * dstEvery + 1] = uint8(v1.entry >> 8) + MOVW AX, (R8) + + // update the bitreader structure + MOVQ R12, 80(R11) + MOVB R13, 88(R11) + ADDQ R9, R8 + + // br2.fillFast32() + MOVQ 128(R11), R12 + MOVBQZX 136(R11), R13 + CMPQ R13, $0x20 + JBE skip_fill2 + MOVQ 120(R11), AX + SUBQ $0x20, R13 + SUBQ $0x04, AX + MOVQ 96(R11), R14 + + // b.value |= uint64(low) << (b.bitsRead & 63) + MOVL (AX)(R14*1), R14 + MOVQ R13, CX + SHLQ CL, R14 + MOVQ AX, 120(R11) + ORQ R14, R12 + + // exhausted = exhausted || (br2.off < 4) + CMPQ AX, $0x04 + SETLT AL + ORB AL, DL + +skip_fill2: + // val0 := br2.peekTopBits(peekBits) + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 + + // v0 := table[val0&mask] + MOVW (R10)(R14*2), CX + + // br2.advance(uint8(v0.entry) + MOVB CH, AL + SHLQ CL, R12 + ADDB CL, R13 + + // val1 := br2.peekTopBits(peekBits) + MOVQ DI, CX + MOVQ R12, R14 + SHRQ CL, R14 + + // v1 := table[val1&mask] + MOVW (R10)(R14*2), CX + + // br2.advance(uint8(v1.entry)) + MOVB CH, AH + SHLQ CL, R12 + ADDB CL, R13 + + // these two writes get coalesced + // out[id * dstEvery + 0] = uint8(v0.entry >> 8) + // out[id * dstEvery + 1] = uint8(v1.entry >> 8) + MOVW AX, (R8) + + // update the bitreader structure + MOVQ R12, 128(R11) + MOVB R13, 136(R11) + ADDQ R9, R8 + + // br3.fillFast32() + MOVQ 176(R11), R12 + MOVBQZX 184(R11), R13 + CMPQ R13, $0x20 + JBE skip_fill3 + MOVQ 168(R11), AX + SUBQ $0x20, R13 + SUBQ $0x04, AX + MOVQ 144(R11), R14 + + // b.value |= uint64(low) << (b.bitsRead & 63) + MOVL (AX)(R14*1), R14 + MOVQ R13, CX + SHLQ CL, R14 + MOVQ AX, 168(R11) + ORQ R14, R12 + + // exhausted = exhausted || (br3.off < 4) + CMPQ AX, $0x04 + SETLT AL + ORB AL, DL + +skip_fill3: + // val0 := br3.peekTopBits(peekBits) + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 + + // v0 := table[val0&mask] + MOVW (R10)(R14*2), CX + + // br3.advance(uint8(v0.entry) + MOVB CH, AL + SHLQ CL, R12 + ADDB CL, R13 + + // val1 := br3.peekTopBits(peekBits) + MOVQ DI, CX + MOVQ R12, R14 + SHRQ CL, R14 + + // v1 := table[val1&mask] + MOVW (R10)(R14*2), CX + + // br3.advance(uint8(v1.entry)) + MOVB CH, AH + SHLQ CL, R12 + ADDB CL, R13 + + // these two writes get coalesced + // out[id * dstEvery + 0] = uint8(v0.entry >> 8) + // out[id * dstEvery + 1] = uint8(v1.entry >> 8) + MOVW AX, (R8) + + // update the bitreader structure + MOVQ R12, 176(R11) + MOVB R13, 184(R11) + ADDQ $0x02, SI + TESTB DL, DL + JZ main_loop + MOVQ ctx+0(FP), AX + SUBQ 16(AX), SI + SHLQ $0x02, SI + MOVQ SI, 40(AX) + RET + +// func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext) +TEXT ·decompress4x_8b_main_loop_amd64(SB), $0-8 + XORQ DX, DX + + // Preload values + MOVQ ctx+0(FP), CX + MOVBQZX 8(CX), DI + MOVQ 16(CX), BX + MOVQ 48(CX), SI + MOVQ 24(CX), R9 + MOVQ 32(CX), R10 + MOVQ (CX), R11 + + // Main loop +main_loop: + MOVQ BX, R8 + CMPQ R8, SI + SETGE DL + + // br0.fillFast32() + MOVQ 32(R11), R12 + MOVBQZX 40(R11), R13 + CMPQ R13, $0x20 + JBE skip_fill0 + MOVQ 24(R11), R14 + SUBQ $0x20, R13 + SUBQ $0x04, R14 + MOVQ (R11), R15 + + // b.value |= 
uint64(low) << (b.bitsRead & 63) + MOVL (R14)(R15*1), R15 + MOVQ R13, CX + SHLQ CL, R15 + MOVQ R14, 24(R11) + ORQ R15, R12 + + // exhausted = exhausted || (br0.off < 4) + CMPQ R14, $0x04 + SETLT AL + ORB AL, DL + +skip_fill0: + // val0 := br0.peekTopBits(peekBits) + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 + + // v0 := table[val0&mask] + MOVW (R10)(R14*2), CX + + // br0.advance(uint8(v0.entry) + MOVB CH, AL + SHLQ CL, R12 + ADDB CL, R13 + + // val1 := br0.peekTopBits(peekBits) + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 + + // v1 := table[val0&mask] + MOVW (R10)(R14*2), CX + + // br0.advance(uint8(v1.entry) + MOVB CH, AH + SHLQ CL, R12 + ADDB CL, R13 + BSWAPL AX + + // val2 := br0.peekTopBits(peekBits) + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 + + // v2 := table[val0&mask] + MOVW (R10)(R14*2), CX + + // br0.advance(uint8(v2.entry) + MOVB CH, AH + SHLQ CL, R12 + ADDB CL, R13 + + // val3 := br0.peekTopBits(peekBits) + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 + + // v3 := table[val0&mask] + MOVW (R10)(R14*2), CX + + // br0.advance(uint8(v3.entry) + MOVB CH, AL + SHLQ CL, R12 + ADDB CL, R13 + BSWAPL AX + + // these four writes get coalesced + // out[id * dstEvery + 0] = uint8(v0.entry >> 8) + // out[id * dstEvery + 1] = uint8(v1.entry >> 8) + // out[id * dstEvery + 3] = uint8(v2.entry >> 8) + // out[id * dstEvery + 4] = uint8(v3.entry >> 8) + MOVL AX, (R8) + + // update the bitreader structure + MOVQ R12, 32(R11) + MOVB R13, 40(R11) + ADDQ R9, R8 + + // br1.fillFast32() + MOVQ 80(R11), R12 + MOVBQZX 88(R11), R13 + CMPQ R13, $0x20 + JBE skip_fill1 + MOVQ 72(R11), R14 + SUBQ $0x20, R13 + SUBQ $0x04, R14 + MOVQ 48(R11), R15 + + // b.value |= uint64(low) << (b.bitsRead & 63) + MOVL (R14)(R15*1), R15 + MOVQ R13, CX + SHLQ CL, R15 + MOVQ R14, 72(R11) + ORQ R15, R12 + + // exhausted = exhausted || (br1.off < 4) + CMPQ R14, $0x04 + SETLT AL + ORB AL, DL + +skip_fill1: + // val0 := br1.peekTopBits(peekBits) + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 + + // v0 := table[val0&mask] + MOVW (R10)(R14*2), CX + + // br1.advance(uint8(v0.entry) + MOVB CH, AL + SHLQ CL, R12 + ADDB CL, R13 + + // val1 := br1.peekTopBits(peekBits) + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 + + // v1 := table[val0&mask] + MOVW (R10)(R14*2), CX + + // br1.advance(uint8(v1.entry) + MOVB CH, AH + SHLQ CL, R12 + ADDB CL, R13 + BSWAPL AX + + // val2 := br1.peekTopBits(peekBits) + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 + + // v2 := table[val0&mask] + MOVW (R10)(R14*2), CX + + // br1.advance(uint8(v2.entry) + MOVB CH, AH + SHLQ CL, R12 + ADDB CL, R13 + + // val3 := br1.peekTopBits(peekBits) + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 + + // v3 := table[val0&mask] + MOVW (R10)(R14*2), CX + + // br1.advance(uint8(v3.entry) + MOVB CH, AL + SHLQ CL, R12 + ADDB CL, R13 + BSWAPL AX + + // these four writes get coalesced + // out[id * dstEvery + 0] = uint8(v0.entry >> 8) + // out[id * dstEvery + 1] = uint8(v1.entry >> 8) + // out[id * dstEvery + 3] = uint8(v2.entry >> 8) + // out[id * dstEvery + 4] = uint8(v3.entry >> 8) + MOVL AX, (R8) + + // update the bitreader structure + MOVQ R12, 80(R11) + MOVB R13, 88(R11) + ADDQ R9, R8 + + // br2.fillFast32() + MOVQ 128(R11), R12 + MOVBQZX 136(R11), R13 + CMPQ R13, $0x20 + JBE skip_fill2 + MOVQ 120(R11), R14 + SUBQ $0x20, R13 + SUBQ $0x04, R14 + MOVQ 96(R11), R15 + + // b.value |= uint64(low) << (b.bitsRead & 63) + MOVL (R14)(R15*1), R15 + MOVQ R13, CX + SHLQ CL, R15 + MOVQ R14, 120(R11) + ORQ R15, R12 + + // exhausted = exhausted || (br2.off < 4) + CMPQ R14, $0x04 + SETLT AL + ORB AL, 
DL + +skip_fill2: + // val0 := br2.peekTopBits(peekBits) + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 + + // v0 := table[val0&mask] + MOVW (R10)(R14*2), CX + + // br2.advance(uint8(v0.entry) + MOVB CH, AL + SHLQ CL, R12 + ADDB CL, R13 + + // val1 := br2.peekTopBits(peekBits) + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 + + // v1 := table[val0&mask] + MOVW (R10)(R14*2), CX + + // br2.advance(uint8(v1.entry) + MOVB CH, AH + SHLQ CL, R12 + ADDB CL, R13 + BSWAPL AX + + // val2 := br2.peekTopBits(peekBits) + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 + + // v2 := table[val0&mask] + MOVW (R10)(R14*2), CX + + // br2.advance(uint8(v2.entry) + MOVB CH, AH + SHLQ CL, R12 + ADDB CL, R13 + + // val3 := br2.peekTopBits(peekBits) + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 + + // v3 := table[val0&mask] + MOVW (R10)(R14*2), CX + + // br2.advance(uint8(v3.entry) + MOVB CH, AL + SHLQ CL, R12 + ADDB CL, R13 + BSWAPL AX + + // these four writes get coalesced + // out[id * dstEvery + 0] = uint8(v0.entry >> 8) + // out[id * dstEvery + 1] = uint8(v1.entry >> 8) + // out[id * dstEvery + 3] = uint8(v2.entry >> 8) + // out[id * dstEvery + 4] = uint8(v3.entry >> 8) + MOVL AX, (R8) + + // update the bitreader structure + MOVQ R12, 128(R11) + MOVB R13, 136(R11) + ADDQ R9, R8 + + // br3.fillFast32() + MOVQ 176(R11), R12 + MOVBQZX 184(R11), R13 + CMPQ R13, $0x20 + JBE skip_fill3 + MOVQ 168(R11), R14 + SUBQ $0x20, R13 + SUBQ $0x04, R14 + MOVQ 144(R11), R15 + + // b.value |= uint64(low) << (b.bitsRead & 63) + MOVL (R14)(R15*1), R15 + MOVQ R13, CX + SHLQ CL, R15 + MOVQ R14, 168(R11) + ORQ R15, R12 + + // exhausted = exhausted || (br3.off < 4) + CMPQ R14, $0x04 + SETLT AL + ORB AL, DL + +skip_fill3: + // val0 := br3.peekTopBits(peekBits) + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 + + // v0 := table[val0&mask] + MOVW (R10)(R14*2), CX + + // br3.advance(uint8(v0.entry) + MOVB CH, AL + SHLQ CL, R12 + ADDB CL, R13 + + // val1 := br3.peekTopBits(peekBits) + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 + + // v1 := table[val0&mask] + MOVW (R10)(R14*2), CX + + // br3.advance(uint8(v1.entry) + MOVB CH, AH + SHLQ CL, R12 + ADDB CL, R13 + BSWAPL AX + + // val2 := br3.peekTopBits(peekBits) + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 + + // v2 := table[val0&mask] + MOVW (R10)(R14*2), CX + + // br3.advance(uint8(v2.entry) + MOVB CH, AH + SHLQ CL, R12 + ADDB CL, R13 + + // val3 := br3.peekTopBits(peekBits) + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 + + // v3 := table[val0&mask] + MOVW (R10)(R14*2), CX + + // br3.advance(uint8(v3.entry) + MOVB CH, AL + SHLQ CL, R12 + ADDB CL, R13 + BSWAPL AX + + // these four writes get coalesced + // out[id * dstEvery + 0] = uint8(v0.entry >> 8) + // out[id * dstEvery + 1] = uint8(v1.entry >> 8) + // out[id * dstEvery + 3] = uint8(v2.entry >> 8) + // out[id * dstEvery + 4] = uint8(v3.entry >> 8) + MOVL AX, (R8) + + // update the bitreader structure + MOVQ R12, 176(R11) + MOVB R13, 184(R11) + ADDQ $0x04, BX + TESTB DL, DL + JZ main_loop + MOVQ ctx+0(FP), AX + SUBQ 16(AX), BX + SHLQ $0x02, BX + MOVQ BX, 40(AX) + RET + +// func decompress1x_main_loop_amd64(ctx *decompress1xContext) +TEXT ·decompress1x_main_loop_amd64(SB), $0-8 + MOVQ ctx+0(FP), CX + MOVQ 16(CX), DX + MOVQ 24(CX), BX + CMPQ BX, $0x04 + JB error_max_decoded_size_exeeded + LEAQ (DX)(BX*1), BX + MOVQ (CX), SI + MOVQ (SI), R8 + MOVQ 24(SI), R9 + MOVQ 32(SI), R10 + MOVBQZX 40(SI), R11 + MOVQ 32(CX), SI + MOVBQZX 8(CX), DI + JMP loop_condition + +main_loop: + // Check if we have room for 4 bytes in the output buffer + LEAQ 4(DX), CX + CMPQ 
CX, BX + JGE error_max_decoded_size_exeeded + + // Decode 4 values + CMPQ R11, $0x20 + JL bitReader_fillFast_1_end + SUBQ $0x20, R11 + SUBQ $0x04, R9 + MOVL (R8)(R9*1), R12 + MOVQ R11, CX + SHLQ CL, R12 + ORQ R12, R10 + +bitReader_fillFast_1_end: + MOVQ DI, CX + MOVQ R10, R12 + SHRQ CL, R12 + MOVW (SI)(R12*2), CX + MOVB CH, AL + MOVBQZX CL, CX + ADDQ CX, R11 + SHLQ CL, R10 + MOVQ DI, CX + MOVQ R10, R12 + SHRQ CL, R12 + MOVW (SI)(R12*2), CX + MOVB CH, AH + MOVBQZX CL, CX + ADDQ CX, R11 + SHLQ CL, R10 + BSWAPL AX + CMPQ R11, $0x20 + JL bitReader_fillFast_2_end + SUBQ $0x20, R11 + SUBQ $0x04, R9 + MOVL (R8)(R9*1), R12 + MOVQ R11, CX + SHLQ CL, R12 + ORQ R12, R10 + +bitReader_fillFast_2_end: + MOVQ DI, CX + MOVQ R10, R12 + SHRQ CL, R12 + MOVW (SI)(R12*2), CX + MOVB CH, AH + MOVBQZX CL, CX + ADDQ CX, R11 + SHLQ CL, R10 + MOVQ DI, CX + MOVQ R10, R12 + SHRQ CL, R12 + MOVW (SI)(R12*2), CX + MOVB CH, AL + MOVBQZX CL, CX + ADDQ CX, R11 + SHLQ CL, R10 + BSWAPL AX + + // Store the decoded values + MOVL AX, (DX) + ADDQ $0x04, DX + +loop_condition: + CMPQ R9, $0x08 + JGE main_loop + + // Update ctx structure + MOVQ ctx+0(FP), AX + SUBQ 16(AX), DX + MOVQ DX, 40(AX) + MOVQ (AX), AX + MOVQ R9, 24(AX) + MOVQ R10, 32(AX) + MOVB R11, 40(AX) + RET + + // Report error +error_max_decoded_size_exeeded: + MOVQ ctx+0(FP), AX + MOVQ $-1, CX + MOVQ CX, 40(AX) + RET + +// func decompress1x_main_loop_bmi2(ctx *decompress1xContext) +// Requires: BMI2 +TEXT ·decompress1x_main_loop_bmi2(SB), $0-8 + MOVQ ctx+0(FP), CX + MOVQ 16(CX), DX + MOVQ 24(CX), BX + CMPQ BX, $0x04 + JB error_max_decoded_size_exeeded + LEAQ (DX)(BX*1), BX + MOVQ (CX), SI + MOVQ (SI), R8 + MOVQ 24(SI), R9 + MOVQ 32(SI), R10 + MOVBQZX 40(SI), R11 + MOVQ 32(CX), SI + MOVBQZX 8(CX), DI + JMP loop_condition + +main_loop: + // Check if we have room for 4 bytes in the output buffer + LEAQ 4(DX), CX + CMPQ CX, BX + JGE error_max_decoded_size_exeeded + + // Decode 4 values + CMPQ R11, $0x20 + JL bitReader_fillFast_1_end + SUBQ $0x20, R11 + SUBQ $0x04, R9 + MOVL (R8)(R9*1), CX + SHLXQ R11, CX, CX + ORQ CX, R10 + +bitReader_fillFast_1_end: + SHRXQ DI, R10, CX + MOVW (SI)(CX*2), CX + MOVB CH, AL + MOVBQZX CL, CX + ADDQ CX, R11 + SHLXQ CX, R10, R10 + SHRXQ DI, R10, CX + MOVW (SI)(CX*2), CX + MOVB CH, AH + MOVBQZX CL, CX + ADDQ CX, R11 + SHLXQ CX, R10, R10 + BSWAPL AX + CMPQ R11, $0x20 + JL bitReader_fillFast_2_end + SUBQ $0x20, R11 + SUBQ $0x04, R9 + MOVL (R8)(R9*1), CX + SHLXQ R11, CX, CX + ORQ CX, R10 + +bitReader_fillFast_2_end: + SHRXQ DI, R10, CX + MOVW (SI)(CX*2), CX + MOVB CH, AH + MOVBQZX CL, CX + ADDQ CX, R11 + SHLXQ CX, R10, R10 + SHRXQ DI, R10, CX + MOVW (SI)(CX*2), CX + MOVB CH, AL + MOVBQZX CL, CX + ADDQ CX, R11 + SHLXQ CX, R10, R10 + BSWAPL AX + + // Store the decoded values + MOVL AX, (DX) + ADDQ $0x04, DX + +loop_condition: + CMPQ R9, $0x08 + JGE main_loop + + // Update ctx structure + MOVQ ctx+0(FP), AX + SUBQ 16(AX), DX + MOVQ DX, 40(AX) + MOVQ (AX), AX + MOVQ R9, 24(AX) + MOVQ R10, 32(AX) + MOVB R11, 40(AX) + RET + + // Report error +error_max_decoded_size_exeeded: + MOVQ ctx+0(FP), AX + MOVQ $-1, CX + MOVQ CX, 40(AX) + RET diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_generic.go b/vendor/github.com/klauspost/compress/huff0/decompress_generic.go new file mode 100644 index 000000000000..908c17de63fc --- /dev/null +++ b/vendor/github.com/klauspost/compress/huff0/decompress_generic.go @@ -0,0 +1,299 @@ +//go:build !amd64 || appengine || !gc || noasm +// +build !amd64 appengine !gc noasm + +// This file contains a generic 
implementation of Decoder.Decompress4X. +package huff0 + +import ( + "errors" + "fmt" +) + +// Decompress4X will decompress a 4X encoded stream. +// The length of the supplied input must match the end of a block exactly. +// The *capacity* of the dst slice must match the destination size of +// the uncompressed data exactly. +func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) { + if len(d.dt.single) == 0 { + return nil, errors.New("no table loaded") + } + if len(src) < 6+(4*1) { + return nil, errors.New("input too small") + } + if use8BitTables && d.actualTableLog <= 8 { + return d.decompress4X8bit(dst, src) + } + + var br [4]bitReaderShifted + // Decode "jump table" + start := 6 + for i := 0; i < 3; i++ { + length := int(src[i*2]) | (int(src[i*2+1]) << 8) + if start+length >= len(src) { + return nil, errors.New("truncated input (or invalid offset)") + } + err := br[i].init(src[start : start+length]) + if err != nil { + return nil, err + } + start += length + } + err := br[3].init(src[start:]) + if err != nil { + return nil, err + } + + // destination, offset to match first output + dstSize := cap(dst) + dst = dst[:dstSize] + out := dst + dstEvery := (dstSize + 3) / 4 + + const tlSize = 1 << tableLogMax + const tlMask = tlSize - 1 + single := d.dt.single[:tlSize] + + // Use temp table to avoid bound checks/append penalty. + buf := d.buffer() + var off uint8 + var decoded int + + // Decode 2 values from each decoder/loop. + const bufoff = 256 + for { + if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 { + break + } + + { + const stream = 0 + const stream2 = 1 + br[stream].fillFast() + br[stream2].fillFast() + + val := br[stream].peekBitsFast(d.actualTableLog) + val2 := br[stream2].peekBitsFast(d.actualTableLog) + v := single[val&tlMask] + v2 := single[val2&tlMask] + br[stream].advance(uint8(v.entry)) + br[stream2].advance(uint8(v2.entry)) + buf[stream][off] = uint8(v.entry >> 8) + buf[stream2][off] = uint8(v2.entry >> 8) + + val = br[stream].peekBitsFast(d.actualTableLog) + val2 = br[stream2].peekBitsFast(d.actualTableLog) + v = single[val&tlMask] + v2 = single[val2&tlMask] + br[stream].advance(uint8(v.entry)) + br[stream2].advance(uint8(v2.entry)) + buf[stream][off+1] = uint8(v.entry >> 8) + buf[stream2][off+1] = uint8(v2.entry >> 8) + } + + { + const stream = 2 + const stream2 = 3 + br[stream].fillFast() + br[stream2].fillFast() + + val := br[stream].peekBitsFast(d.actualTableLog) + val2 := br[stream2].peekBitsFast(d.actualTableLog) + v := single[val&tlMask] + v2 := single[val2&tlMask] + br[stream].advance(uint8(v.entry)) + br[stream2].advance(uint8(v2.entry)) + buf[stream][off] = uint8(v.entry >> 8) + buf[stream2][off] = uint8(v2.entry >> 8) + + val = br[stream].peekBitsFast(d.actualTableLog) + val2 = br[stream2].peekBitsFast(d.actualTableLog) + v = single[val&tlMask] + v2 = single[val2&tlMask] + br[stream].advance(uint8(v.entry)) + br[stream2].advance(uint8(v2.entry)) + buf[stream][off+1] = uint8(v.entry >> 8) + buf[stream2][off+1] = uint8(v2.entry >> 8) + } + + off += 2 + + if off == 0 { + if bufoff > dstEvery { + d.bufs.Put(buf) + return nil, errors.New("corruption detected: stream overrun 1") + } + // There must at least be 3 buffers left. 
+ if len(out)-bufoff < dstEvery*3 { + d.bufs.Put(buf) + return nil, errors.New("corruption detected: stream overrun 2") + } + //copy(out, buf[0][:]) + //copy(out[dstEvery:], buf[1][:]) + //copy(out[dstEvery*2:], buf[2][:]) + //copy(out[dstEvery*3:], buf[3][:]) + *(*[bufoff]byte)(out) = buf[0] + *(*[bufoff]byte)(out[dstEvery:]) = buf[1] + *(*[bufoff]byte)(out[dstEvery*2:]) = buf[2] + *(*[bufoff]byte)(out[dstEvery*3:]) = buf[3] + out = out[bufoff:] + decoded += bufoff * 4 + } + } + if off > 0 { + ioff := int(off) + if len(out) < dstEvery*3+ioff { + d.bufs.Put(buf) + return nil, errors.New("corruption detected: stream overrun 3") + } + copy(out, buf[0][:off]) + copy(out[dstEvery:], buf[1][:off]) + copy(out[dstEvery*2:], buf[2][:off]) + copy(out[dstEvery*3:], buf[3][:off]) + decoded += int(off) * 4 + out = out[off:] + } + + // Decode remaining. + remainBytes := dstEvery - (decoded / 4) + for i := range br { + offset := dstEvery * i + endsAt := offset + remainBytes + if endsAt > len(out) { + endsAt = len(out) + } + br := &br[i] + bitsLeft := br.remaining() + for bitsLeft > 0 { + br.fill() + if offset >= endsAt { + d.bufs.Put(buf) + return nil, errors.New("corruption detected: stream overrun 4") + } + + // Read value and increment offset. + val := br.peekBitsFast(d.actualTableLog) + v := single[val&tlMask].entry + nBits := uint8(v) + br.advance(nBits) + bitsLeft -= uint(nBits) + out[offset] = uint8(v >> 8) + offset++ + } + if offset != endsAt { + d.bufs.Put(buf) + return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt) + } + decoded += offset - dstEvery*i + err = br.close() + if err != nil { + return nil, err + } + } + d.bufs.Put(buf) + if dstSize != decoded { + return nil, errors.New("corruption detected: short output block") + } + return dst, nil +} + +// Decompress1X will decompress a 1X encoded stream. +// The cap of the output buffer will be the maximum decompressed size. +// The length of the supplied input must match the end of a block exactly. +func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) { + if len(d.dt.single) == 0 { + return nil, errors.New("no table loaded") + } + if use8BitTables && d.actualTableLog <= 8 { + return d.decompress1X8Bit(dst, src) + } + var br bitReaderShifted + err := br.init(src) + if err != nil { + return dst, err + } + maxDecodedSize := cap(dst) + dst = dst[:0] + + // Avoid bounds check by always having full sized table. + const tlSize = 1 << tableLogMax + const tlMask = tlSize - 1 + dt := d.dt.single[:tlSize] + + // Use temp table to avoid bound checks/append penalty. + bufs := d.buffer() + buf := &bufs[0] + var off uint8 + + for br.off >= 8 { + br.fillFast() + v := dt[br.peekBitsFast(d.actualTableLog)&tlMask] + br.advance(uint8(v.entry)) + buf[off+0] = uint8(v.entry >> 8) + + v = dt[br.peekBitsFast(d.actualTableLog)&tlMask] + br.advance(uint8(v.entry)) + buf[off+1] = uint8(v.entry >> 8) + + // Refill + br.fillFast() + + v = dt[br.peekBitsFast(d.actualTableLog)&tlMask] + br.advance(uint8(v.entry)) + buf[off+2] = uint8(v.entry >> 8) + + v = dt[br.peekBitsFast(d.actualTableLog)&tlMask] + br.advance(uint8(v.entry)) + buf[off+3] = uint8(v.entry >> 8) + + off += 4 + if off == 0 { + if len(dst)+256 > maxDecodedSize { + br.close() + d.bufs.Put(bufs) + return nil, ErrMaxDecodedSizeExceeded + } + dst = append(dst, buf[:]...) + } + } + + if len(dst)+int(off) > maxDecodedSize { + d.bufs.Put(bufs) + br.close() + return nil, ErrMaxDecodedSizeExceeded + } + dst = append(dst, buf[:off]...) 
+ + // br < 8, so uint8 is fine + bitsLeft := uint8(br.off)*8 + 64 - br.bitsRead + for bitsLeft > 0 { + br.fill() + if false && br.bitsRead >= 32 { + if br.off >= 4 { + v := br.in[br.off-4:] + v = v[:4] + low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24) + br.value = (br.value << 32) | uint64(low) + br.bitsRead -= 32 + br.off -= 4 + } else { + for br.off > 0 { + br.value = (br.value << 8) | uint64(br.in[br.off-1]) + br.bitsRead -= 8 + br.off-- + } + } + } + if len(dst) >= maxDecodedSize { + d.bufs.Put(bufs) + br.close() + return nil, ErrMaxDecodedSizeExceeded + } + v := d.dt.single[br.peekBitsFast(d.actualTableLog)&tlMask] + nBits := uint8(v.entry) + br.advance(nBits) + bitsLeft -= nBits + dst = append(dst, uint8(v.entry>>8)) + } + d.bufs.Put(bufs) + return dst, br.close() +} diff --git a/vendor/github.com/klauspost/compress/huff0/huff0.go b/vendor/github.com/klauspost/compress/huff0/huff0.go index 3ee00ecb470a..e8ad17ad08ef 100644 --- a/vendor/github.com/klauspost/compress/huff0/huff0.go +++ b/vendor/github.com/klauspost/compress/huff0/huff0.go @@ -8,6 +8,7 @@ import ( "fmt" "math" "math/bits" + "sync" "github.com/klauspost/compress/fse" ) @@ -116,6 +117,7 @@ type Scratch struct { nodes []nodeElt tmpOut [4][]byte fse *fse.Scratch + decPool sync.Pool // *[4][256]byte buffers. huffWeight [maxSymbolValue + 1]byte } diff --git a/vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo.go b/vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo.go new file mode 100644 index 000000000000..3954c51219b2 --- /dev/null +++ b/vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo.go @@ -0,0 +1,34 @@ +// Package cpuinfo gives runtime info about the current CPU. +// +// This is a very limited module meant for use internally +// in this project. For more versatile solution check +// https://github.com/klauspost/cpuid. +package cpuinfo + +// HasBMI1 checks whether an x86 CPU supports the BMI1 extension. +func HasBMI1() bool { + return hasBMI1 +} + +// HasBMI2 checks whether an x86 CPU supports the BMI2 extension. +func HasBMI2() bool { + return hasBMI2 +} + +// DisableBMI2 will disable BMI2, for testing purposes. +// Call returned function to restore previous state. +func DisableBMI2() func() { + old := hasBMI2 + hasBMI2 = false + return func() { + hasBMI2 = old + } +} + +// HasBMI checks whether an x86 CPU supports both BMI1 and BMI2 extensions. +func HasBMI() bool { + return HasBMI1() && HasBMI2() +} + +var hasBMI1 bool +var hasBMI2 bool diff --git a/vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo_amd64.go b/vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo_amd64.go new file mode 100644 index 000000000000..e802579c4f96 --- /dev/null +++ b/vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo_amd64.go @@ -0,0 +1,11 @@ +//go:build amd64 && !appengine && !noasm && gc +// +build amd64,!appengine,!noasm,gc + +package cpuinfo + +// go:noescape +func x86extensions() (bmi1, bmi2 bool) + +func init() { + hasBMI1, hasBMI2 = x86extensions() +} diff --git a/vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo_amd64.s b/vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo_amd64.s new file mode 100644 index 000000000000..4465fbe9e905 --- /dev/null +++ b/vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo_amd64.s @@ -0,0 +1,36 @@ +// +build !appengine +// +build gc +// +build !noasm + +#include "textflag.h" +#include "funcdata.h" +#include "go_asm.h" + +TEXT ·x86extensions(SB), NOSPLIT, $0 + // 1. 
determine max EAX value + XORQ AX, AX + CPUID + + CMPQ AX, $7 + JB unsupported + + // 2. EAX = 7, ECX = 0 --- see Table 3-8 "Information Returned by CPUID Instruction" + MOVQ $7, AX + MOVQ $0, CX + CPUID + + BTQ $3, BX // bit 3 = BMI1 + SETCS AL + + BTQ $8, BX // bit 8 = BMI2 + SETCS AH + + MOVB AL, bmi1+0(FP) + MOVB AH, bmi2+1(FP) + RET + +unsupported: + XORQ AX, AX + MOVB AL, bmi1+0(FP) + MOVB AL, bmi2+1(FP) + RET diff --git a/vendor/github.com/klauspost/compress/internal/snapref/encode_other.go b/vendor/github.com/klauspost/compress/internal/snapref/encode_other.go index 511bba65db8f..298c4f8e97da 100644 --- a/vendor/github.com/klauspost/compress/internal/snapref/encode_other.go +++ b/vendor/github.com/klauspost/compress/internal/snapref/encode_other.go @@ -18,6 +18,7 @@ func load64(b []byte, i int) uint64 { // emitLiteral writes a literal chunk and returns the number of bytes written. // // It assumes that: +// // dst is long enough to hold the encoded bytes // 1 <= len(lit) && len(lit) <= 65536 func emitLiteral(dst, lit []byte) int { @@ -42,6 +43,7 @@ func emitLiteral(dst, lit []byte) int { // emitCopy writes a copy chunk and returns the number of bytes written. // // It assumes that: +// // dst is long enough to hold the encoded bytes // 1 <= offset && offset <= 65535 // 4 <= length && length <= 65535 @@ -89,6 +91,7 @@ func emitCopy(dst []byte, offset, length int) int { // src[i:i+k-j] and src[j:k] have the same contents. // // It assumes that: +// // 0 <= i && i < j && j <= len(src) func extendMatch(src []byte, i, j int) int { for ; j < len(src) && src[i] == src[j]; i, j = i+1, j+1 { @@ -105,8 +108,9 @@ func hash(u, shift uint32) uint32 { // been written. // // It also assumes that: +// // len(dst) >= MaxEncodedLen(len(src)) && -// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize +// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize func encodeBlock(dst, src []byte) (d int) { // Initialize the hash table. Its size ranges from 1<<8 to 1<<14 inclusive. 
// The table element type is uint16, as s < sLimit and sLimit < len(src) diff --git a/vendor/github.com/klauspost/compress/s2/README.md b/vendor/github.com/klauspost/compress/s2/README.md index e6716aeaee99..1d80c42a530b 100644 --- a/vendor/github.com/klauspost/compress/s2/README.md +++ b/vendor/github.com/klauspost/compress/s2/README.md @@ -19,6 +19,7 @@ This is important, so you don't have to worry about spending CPU cycles on alrea * Adjustable compression (3 levels) * Concurrent stream compression * Faster decompression, even for Snappy compatible content +* Concurrent Snappy/S2 stream decompression * Ability to quickly skip forward in compressed stream * Random seeking with indexes * Compatible with reading Snappy compressed content @@ -324,35 +325,35 @@ The content compressed in this mode is fully compatible with the standard decode Snappy vs S2 **compression** speed on 16 core (32 thread) computer, using all threads and a single thread (1 CPU): -| File | S2 speed | S2 Throughput | S2 % smaller | S2 "better" | "better" throughput | "better" % smaller | -|-----------------------------------------------------------------------------------------------------|----------|---------------|--------------|-------------|---------------------|--------------------| -| [rawstudio-mint14.tar](https://files.klauspost.com/compress/rawstudio-mint14.7z) | 12.70x | 10556 MB/s | 7.35% | 4.15x | 3455 MB/s | 12.79% | -| (1 CPU) | 1.14x | 948 MB/s | - | 0.42x | 349 MB/s | - | -| [github-june-2days-2019.json](https://files.klauspost.com/compress/github-june-2days-2019.json.zst) | 17.13x | 14484 MB/s | 31.60% | 10.09x | 8533 MB/s | 37.71% | -| (1 CPU) | 1.33x | 1127 MB/s | - | 0.70x | 589 MB/s | - | -| [github-ranks-backup.bin](https://files.klauspost.com/compress/github-ranks-backup.bin.zst) | 15.14x | 12000 MB/s | -5.79% | 6.59x | 5223 MB/s | 5.80% | -| (1 CPU) | 1.11x | 877 MB/s | - | 0.47x | 370 MB/s | - | -| [consensus.db.10gb](https://files.klauspost.com/compress/consensus.db.10gb.zst) | 14.62x | 12116 MB/s | 15.90% | 5.35x | 4430 MB/s | 16.08% | -| (1 CPU) | 1.38x | 1146 MB/s | - | 0.38x | 312 MB/s | - | -| [adresser.json](https://files.klauspost.com/compress/adresser.json.zst) | 8.83x | 17579 MB/s | 43.86% | 6.54x | 13011 MB/s | 47.23% | -| (1 CPU) | 1.14x | 2259 MB/s | - | 0.74x | 1475 MB/s | - | -| [gob-stream](https://files.klauspost.com/compress/gob-stream.7z) | 16.72x | 14019 MB/s | 24.02% | 10.11x | 8477 MB/s | 30.48% | -| (1 CPU) | 1.24x | 1043 MB/s | - | 0.70x | 586 MB/s | - | -| [10gb.tar](http://mattmahoney.net/dc/10gb.html) | 13.33x | 9254 MB/s | 1.84% | 6.75x | 4686 MB/s | 6.72% | -| (1 CPU) | 0.97x | 672 MB/s | - | 0.53x | 366 MB/s | - | -| sharnd.out.2gb | 2.11x | 12639 MB/s | 0.01% | 1.98x | 11833 MB/s | 0.01% | -| (1 CPU) | 0.93x | 5594 MB/s | - | 1.34x | 8030 MB/s | - | -| [enwik9](http://mattmahoney.net/dc/textdata.html) | 19.34x | 8220 MB/s | 3.98% | 7.87x | 3345 MB/s | 15.82% | -| (1 CPU) | 1.06x | 452 MB/s | - | 0.50x | 213 MB/s | - | -| [silesia.tar](http://sun.aei.polsl.pl/~sdeor/corpus/silesia.zip) | 10.48x | 6124 MB/s | 5.67% | 3.76x | 2197 MB/s | 12.60% | -| (1 CPU) | 0.97x | 568 MB/s | - | 0.46x | 271 MB/s | - | -| [enwik10](https://encode.su/threads/3315-enwik10-benchmark-results) | 21.07x | 9020 MB/s | 6.36% | 6.91x | 2959 MB/s | 16.95% | -| (1 CPU) | 1.07x | 460 MB/s | - | 0.51x | 220 MB/s | - | +| File | S2 Speed | S2 Throughput | S2 % smaller | S2 "better" | "better" throughput | "better" % smaller | 
+|---------------------------------------------------------------------------------------------------------|----------|---------------|--------------|-------------|---------------------|--------------------| +| [rawstudio-mint14.tar](https://files.klauspost.com/compress/rawstudio-mint14.7z) | 16.33x | 10556 MB/s | 8.0% | 6.04x | 5252 MB/s | 14.7% | +| (1 CPU) | 1.08x | 940 MB/s | - | 0.46x | 400 MB/s | - | +| [github-june-2days-2019.json](https://files.klauspost.com/compress/github-june-2days-2019.json.zst) | 16.51x | 15224 MB/s | 31.70% | 9.47x | 8734 MB/s | 37.71% | +| (1 CPU) | 1.26x | 1157 MB/s | - | 0.60x | 556 MB/s | - | +| [github-ranks-backup.bin](https://files.klauspost.com/compress/github-ranks-backup.bin.zst) | 15.14x | 12598 MB/s | -5.76% | 6.23x | 5675 MB/s | 3.62% | +| (1 CPU) | 1.02x | 932 MB/s | - | 0.47x | 432 MB/s | - | +| [consensus.db.10gb](https://files.klauspost.com/compress/consensus.db.10gb.zst) | 11.21x | 12116 MB/s | 15.95% | 3.24x | 3500 MB/s | 18.00% | +| (1 CPU) | 1.05x | 1135 MB/s | - | 0.27x | 292 MB/s | - | +| [apache.log](https://files.klauspost.com/compress/apache.log.zst) | 8.55x | 16673 MB/s | 20.54% | 5.85x | 11420 MB/s | 24.97% | +| (1 CPU) | 1.91x | 1771 MB/s | - | 0.53x | 1041 MB/s | - | +| [gob-stream](https://files.klauspost.com/compress/gob-stream.7z) | 15.76x | 14357 MB/s | 24.01% | 8.67x | 7891 MB/s | 33.68% | +| (1 CPU) | 1.17x | 1064 MB/s | - | 0.65x | 595 MB/s | - | +| [10gb.tar](http://mattmahoney.net/dc/10gb.html) | 13.33x | 9835 MB/s | 2.34% | 6.85x | 4863 MB/s | 9.96% | +| (1 CPU) | 0.97x | 689 MB/s | - | 0.55x | 387 MB/s | - | +| sharnd.out.2gb | 9.11x | 13213 MB/s | 0.01% | 1.49x | 9184 MB/s | 0.01% | +| (1 CPU) | 0.88x | 5418 MB/s | - | 0.77x | 5417 MB/s | - | +| [sofia-air-quality-dataset csv](https://files.klauspost.com/compress/sofia-air-quality-dataset.tar.zst) | 22.00x | 11477 MB/s | 18.73% | 11.15x | 5817 MB/s | 27.88% | +| (1 CPU) | 1.23x | 642 MB/s | - | 0.71x | 642 MB/s | - | +| [silesia.tar](http://sun.aei.polsl.pl/~sdeor/corpus/silesia.zip) | 11.23x | 6520 MB/s | 5.9% | 5.35x | 3109 MB/s | 15.88% | +| (1 CPU) | 1.05x | 607 MB/s | - | 0.52x | 304 MB/s | - | +| [enwik9](https://files.klauspost.com/compress/enwik9.zst) | 19.28x | 8440 MB/s | 4.04% | 9.31x | 4076 MB/s | 18.04% | +| (1 CPU) | 1.12x | 488 MB/s | - | 0.57x | 250 MB/s | - | ### Legend -* `S2 speed`: Speed of S2 compared to Snappy, using 16 cores and 1 core. -* `S2 throughput`: Throughput of S2 in MB/s. +* `S2 Speed`: Speed of S2 compared to Snappy, using 16 cores and 1 core. +* `S2 Throughput`: Throughput of S2 in MB/s. * `S2 % smaller`: How many percent of the Snappy output size is S2 better. * `S2 "better"`: Speed when enabling "better" compression mode in S2 compared to Snappy. * `"better" throughput`: Speed when enabling "better" compression mode in S2 compared to Snappy. @@ -360,7 +361,7 @@ Snappy vs S2 **compression** speed on 16 core (32 thread) computer, using all th There is a good speedup across the board when using a single thread and a significant speedup when using multiple threads. -Machine generated data gets by far the biggest compression boost, with size being being reduced by up to 45% of Snappy size. +Machine generated data gets by far the biggest compression boost, with size being reduced by up to 35% of Snappy size. The "better" compression mode sees a good improvement in all cases, but usually at a performance cost. 
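As a minimal sketch of how the default and "better" stream modes benchmarked above are typically selected: this is illustrative only and not part of the upstream README; it assumes the s2 writer options `WriterBetterCompression`, `WriterBestCompression` and `WriterConcurrency` provided by the package (not shown directly in this diff) and hypothetical input/output file names.

```
package main

import (
	"io"
	"log"
	"os"

	"github.com/klauspost/compress/s2"
)

// compressStream encodes src to dst as an S2 stream.
// Extra options such as s2.WriterBetterCompression() or
// s2.WriterBestCompression() trade speed for smaller output,
// roughly matching the "better" column in the tables above.
func compressStream(dst io.Writer, src io.Reader, opts ...s2.WriterOption) error {
	enc := s2.NewWriter(dst, opts...)
	if _, err := io.Copy(enc, src); err != nil {
		enc.Close()
		return err
	}
	// Close flushes the last block and writes the stream trailer.
	return enc.Close()
}

func main() {
	in, err := os.Open("input.bin") // hypothetical input
	if err != nil {
		log.Fatal(err)
	}
	defer in.Close()

	out, err := os.Create("input.bin.s2") // hypothetical output
	if err != nil {
		log.Fatal(err)
	}
	defer out.Close()

	// "better" mode; adding s2.WriterConcurrency(1) would approximate the (1 CPU) rows.
	if err := compressStream(out, in, s2.WriterBetterCompression()); err != nil {
		log.Fatal(err)
	}
}
```

Decoding such a stream is the usual `io.Copy(w, s2.NewReader(r))`; the concurrent whole-stream decoder added later in this diff is only needed when full-stream throughput matters.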
@@ -403,18 +404,37 @@ The "better" compression mode will actively look for shorter matches, which is w Without assembly decompression is also very fast; single goroutine decompression speed. No assembly: | File | S2 Throughput | S2 throughput | -|--------------------------------|--------------|---------------| -| consensus.db.10gb.s2 | 1.84x | 2289.8 MB/s | -| 10gb.tar.s2 | 1.30x | 867.07 MB/s | -| rawstudio-mint14.tar.s2 | 1.66x | 1329.65 MB/s | -| github-june-2days-2019.json.s2 | 2.36x | 1831.59 MB/s | -| github-ranks-backup.bin.s2 | 1.73x | 1390.7 MB/s | -| enwik9.s2 | 1.67x | 681.53 MB/s | -| adresser.json.s2 | 3.41x | 4230.53 MB/s | -| silesia.tar.s2 | 1.52x | 811.58 | +|--------------------------------|---------------|---------------| +| consensus.db.10gb.s2 | 1.84x | 2289.8 MB/s | +| 10gb.tar.s2 | 1.30x | 867.07 MB/s | +| rawstudio-mint14.tar.s2 | 1.66x | 1329.65 MB/s | +| github-june-2days-2019.json.s2 | 2.36x | 1831.59 MB/s | +| github-ranks-backup.bin.s2 | 1.73x | 1390.7 MB/s | +| enwik9.s2 | 1.67x | 681.53 MB/s | +| adresser.json.s2 | 3.41x | 4230.53 MB/s | +| silesia.tar.s2 | 1.52x | 811.58 | Even though S2 typically compresses better than Snappy, decompression speed is always better. +### Concurrent Stream Decompression + +For full stream decompression S2 offers a [DecodeConcurrent](https://pkg.go.dev/github.com/klauspost/compress/s2#Reader.DecodeConcurrent) +that will decode a full stream using multiple goroutines. + +Example scaling, AMD Ryzen 3950X, 16 cores, decompression using `s2d -bench=3 `, best of 3: + +| Input | `-cpu=1` | `-cpu=2` | `-cpu=4` | `-cpu=8` | `-cpu=16` | +|-------------------------------------------|------------|------------|------------|------------|-------------| +| enwik10.snappy | 1098.6MB/s | 1819.8MB/s | 3625.6MB/s | 6910.6MB/s | 10818.2MB/s | +| enwik10.s2 | 1303.5MB/s | 2606.1MB/s | 4847.9MB/s | 8878.4MB/s | 9592.1MB/s | +| sofia-air-quality-dataset.tar.snappy | 1302.0MB/s | 2165.0MB/s | 4244.5MB/s | 8241.0MB/s | 12920.5MB/s | +| sofia-air-quality-dataset.tar.s2 | 1399.2MB/s | 2463.2MB/s | 5196.5MB/s | 9639.8MB/s | 11439.5MB/s | +| sofia-air-quality-dataset.tar.s2 (no asm) | 837.5MB/s | 1652.6MB/s | 3183.6MB/s | 5945.0MB/s | 9620.7MB/s | + +Scaling can be expected to be pretty linear until memory bandwidth is saturated. + +For now the DecodeConcurrent can only be used for full streams without seeking or combining with regular reads. + ## Block compression @@ -430,14 +450,14 @@ The most reliable is a wide dataset. For this we use [`webdevdata.org-2015-01-07-subset`](https://files.klauspost.com/compress/webdevdata.org-2015-01-07-4GB-subset.7z), 53927 files, total input size: 4,014,735,833 bytes. Single goroutine used. 
-| * | Input | Output | Reduction | MB/s | -|-------------------|------------|------------|-----------|--------| -| S2 | 4014735833 | 1059723369 | 73.60% | **934.34** | -| S2 Better | 4014735833 | 969670507 | 75.85% | 532.70 | -| S2 Best | 4014735833 | 906625668 | **77.85%** | 46.84 | -| Snappy | 4014735833 | 1128706759 | 71.89% | 762.59 | -| S2, Snappy Output | 4014735833 | 1093821420 | 72.75% | 908.60 | -| LZ4 | 4014735833 | 1079259294 | 73.12% | 526.94 | +| * | Input | Output | Reduction | MB/s | +|-------------------|------------|------------|------------|------------| +| S2 | 4014735833 | 1059723369 | 73.60% | **936.73** | +| S2 Better | 4014735833 | 961580539 | 76.05% | 451.10 | +| S2 Best | 4014735833 | 899182886 | **77.60%** | 46.84 | +| Snappy | 4014735833 | 1128706759 | 71.89% | 790.15 | +| S2, Snappy Output | 4014735833 | 1093823291 | 72.75% | 936.60 | +| LZ4 | 4014735833 | 1063768713 | 73.50% | 452.02 | S2 delivers both the best single threaded throughput with regular mode and the best compression rate with "best". "Better" mode provides the same compression speed as LZ4 with better compression ratio. @@ -469,42 +489,23 @@ AMD64 assembly is use for both S2 and Snappy. | Absolute Perf | Snappy size | S2 Size | Snappy Speed | S2 Speed | Snappy dec | S2 dec | |-----------------------|-------------|---------|--------------|-------------|-------------|-------------| -| html | 22843 | 21111 | 16246 MB/s | 17438 MB/s | 40972 MB/s | 49263 MB/s | -| urls.10K | 335492 | 287326 | 7943 MB/s | 9693 MB/s | 22523 MB/s | 26484 MB/s | -| fireworks.jpeg | 123034 | 123100 | 349544 MB/s | 273889 MB/s | 718321 MB/s | 827552 MB/s | -| fireworks.jpeg (200B) | 146 | 155 | 8869 MB/s | 17773 MB/s | 33691 MB/s | 52421 MB/s | -| paper-100k.pdf | 85304 | 84459 | 167546 MB/s | 101263 MB/s | 326905 MB/s | 291944 MB/s | -| html_x_4 | 92234 | 21113 | 15194 MB/s | 50670 MB/s | 30843 MB/s | 32217 MB/s | -| alice29.txt | 88034 | 85975 | 5936 MB/s | 6139 MB/s | 12882 MB/s | 20044 MB/s | -| asyoulik.txt | 77503 | 79650 | 5517 MB/s | 6366 MB/s | 12735 MB/s | 22806 MB/s | -| lcet10.txt | 234661 | 220670 | 6235 MB/s | 6067 MB/s | 14519 MB/s | 18697 MB/s | -| plrabn12.txt | 319267 | 317985 | 5159 MB/s | 5726 MB/s | 11923 MB/s | 19901 MB/s | -| geo.protodata | 23335 | 18690 | 21220 MB/s | 26529 MB/s | 56271 MB/s | 62540 MB/s | -| kppkn.gtb | 69526 | 65312 | 9732 MB/s | 8559 MB/s | 18491 MB/s | 18969 MB/s | -| alice29.txt (128B) | 80 | 82 | 6691 MB/s | 15489 MB/s | 31883 MB/s | 38874 MB/s | -| alice29.txt (1000B) | 774 | 774 | 12204 MB/s | 13000 MB/s | 48056 MB/s | 52341 MB/s | -| alice29.txt (10000B) | 6648 | 6933 | 10044 MB/s | 12806 MB/s | 32378 MB/s | 46322 MB/s | -| alice29.txt (20000B) | 12686 | 13574 | 7733 MB/s | 11210 MB/s | 30566 MB/s | 58969 MB/s | - - -| Relative Perf | Snappy size | S2 size improved | S2 Speed | S2 Dec Speed | -|-----------------------|-------------|------------------|----------|--------------| -| html | 22.31% | 7.58% | 1.07x | 1.20x | -| urls.10K | 47.78% | 14.36% | 1.22x | 1.18x | -| fireworks.jpeg | 99.95% | -0.05% | 0.78x | 1.15x | -| fireworks.jpeg (200B) | 73.00% | -6.16% | 2.00x | 1.56x | -| paper-100k.pdf | 83.30% | 0.99% | 0.60x | 0.89x | -| html_x_4 | 22.52% | 77.11% | 3.33x | 1.04x | -| alice29.txt | 57.88% | 2.34% | 1.03x | 1.56x | -| asyoulik.txt | 61.91% | -2.77% | 1.15x | 1.79x | -| lcet10.txt | 54.99% | 5.96% | 0.97x | 1.29x | -| plrabn12.txt | 66.26% | 0.40% | 1.11x | 1.67x | -| geo.protodata | 19.68% | 19.91% | 1.25x | 1.11x | -| kppkn.gtb | 37.72% | 6.06% | 0.88x | 1.03x 
| -| alice29.txt (128B) | 62.50% | -2.50% | 2.31x | 1.22x | -| alice29.txt (1000B) | 77.40% | 0.00% | 1.07x | 1.09x | -| alice29.txt (10000B) | 66.48% | -4.29% | 1.27x | 1.43x | -| alice29.txt (20000B) | 63.43% | -7.00% | 1.45x | 1.93x | +| html | 22843 | 20868 | 16246 MB/s | 18617 MB/s | 40972 MB/s | 49263 MB/s | +| urls.10K | 335492 | 286541 | 7943 MB/s | 10201 MB/s | 22523 MB/s | 26484 MB/s | +| fireworks.jpeg | 123034 | 123100 | 349544 MB/s | 303228 MB/s | 718321 MB/s | 827552 MB/s | +| fireworks.jpeg (200B) | 146 | 155 | 8869 MB/s | 20180 MB/s | 33691 MB/s | 52421 MB/s | +| paper-100k.pdf | 85304 | 84202 | 167546 MB/s | 112988 MB/s | 326905 MB/s | 291944 MB/s | +| html_x_4 | 92234 | 20870 | 15194 MB/s | 54457 MB/s | 30843 MB/s | 32217 MB/s | +| alice29.txt | 88034 | 85934 | 5936 MB/s | 6540 MB/s | 12882 MB/s | 20044 MB/s | +| asyoulik.txt | 77503 | 79575 | 5517 MB/s | 6657 MB/s | 12735 MB/s | 22806 MB/s | +| lcet10.txt | 234661 | 220383 | 6235 MB/s | 6303 MB/s | 14519 MB/s | 18697 MB/s | +| plrabn12.txt | 319267 | 318196 | 5159 MB/s | 6074 MB/s | 11923 MB/s | 19901 MB/s | +| geo.protodata | 23335 | 18606 | 21220 MB/s | 25432 MB/s | 56271 MB/s | 62540 MB/s | +| kppkn.gtb | 69526 | 65019 | 9732 MB/s | 8905 MB/s | 18491 MB/s | 18969 MB/s | +| alice29.txt (128B) | 80 | 82 | 6691 MB/s | 17179 MB/s | 31883 MB/s | 38874 MB/s | +| alice29.txt (1000B) | 774 | 774 | 12204 MB/s | 13273 MB/s | 48056 MB/s | 52341 MB/s | +| alice29.txt (10000B) | 6648 | 6933 | 10044 MB/s | 12824 MB/s | 32378 MB/s | 46322 MB/s | +| alice29.txt (20000B) | 12686 | 13516 | 7733 MB/s | 12160 MB/s | 30566 MB/s | 58969 MB/s | + Speed is generally at or above Snappy. Small blocks gets a significant speedup, although at the expense of size. @@ -523,42 +524,23 @@ So individual benchmarks should only be seen as a guideline and the overall pict | Absolute Perf | Snappy size | Better Size | Snappy Speed | Better Speed | Snappy dec | Better dec | |-----------------------|-------------|-------------|--------------|--------------|-------------|-------------| -| html | 22843 | 19833 | 16246 MB/s | 7731 MB/s | 40972 MB/s | 40292 MB/s | -| urls.10K | 335492 | 253529 | 7943 MB/s | 3980 MB/s | 22523 MB/s | 20981 MB/s | -| fireworks.jpeg | 123034 | 123100 | 349544 MB/s | 9760 MB/s | 718321 MB/s | 823698 MB/s | -| fireworks.jpeg (200B) | 146 | 142 | 8869 MB/s | 594 MB/s | 33691 MB/s | 30101 MB/s | -| paper-100k.pdf | 85304 | 82915 | 167546 MB/s | 7470 MB/s | 326905 MB/s | 198869 MB/s | -| html_x_4 | 92234 | 19841 | 15194 MB/s | 23403 MB/s | 30843 MB/s | 30937 MB/s | -| alice29.txt | 88034 | 73218 | 5936 MB/s | 2945 MB/s | 12882 MB/s | 16611 MB/s | -| asyoulik.txt | 77503 | 66844 | 5517 MB/s | 2739 MB/s | 12735 MB/s | 14975 MB/s | -| lcet10.txt | 234661 | 190589 | 6235 MB/s | 3099 MB/s | 14519 MB/s | 16634 MB/s | -| plrabn12.txt | 319267 | 270828 | 5159 MB/s | 2600 MB/s | 11923 MB/s | 13382 MB/s | -| geo.protodata | 23335 | 18278 | 21220 MB/s | 11208 MB/s | 56271 MB/s | 57961 MB/s | -| kppkn.gtb | 69526 | 61851 | 9732 MB/s | 4556 MB/s | 18491 MB/s | 16524 MB/s | -| alice29.txt (128B) | 80 | 81 | 6691 MB/s | 529 MB/s | 31883 MB/s | 34225 MB/s | -| alice29.txt (1000B) | 774 | 748 | 12204 MB/s | 1943 MB/s | 48056 MB/s | 42068 MB/s | -| alice29.txt (10000B) | 6648 | 6234 | 10044 MB/s | 2949 MB/s | 32378 MB/s | 28813 MB/s | -| alice29.txt (20000B) | 12686 | 11584 | 7733 MB/s | 2822 MB/s | 30566 MB/s | 27315 MB/s | - - -| Relative Perf | Snappy size | Better size | Better Speed | Better dec | 
-|-----------------------|-------------|-------------|--------------|------------| -| html | 22.31% | 13.18% | 0.48x | 0.98x | -| urls.10K | 47.78% | 24.43% | 0.50x | 0.93x | -| fireworks.jpeg | 99.95% | -0.05% | 0.03x | 1.15x | -| fireworks.jpeg (200B) | 73.00% | 2.74% | 0.07x | 0.89x | -| paper-100k.pdf | 83.30% | 2.80% | 0.07x | 0.61x | -| html_x_4 | 22.52% | 78.49% | 0.04x | 1.00x | -| alice29.txt | 57.88% | 16.83% | 1.54x | 1.29x | -| asyoulik.txt | 61.91% | 13.75% | 0.50x | 1.18x | -| lcet10.txt | 54.99% | 18.78% | 0.50x | 1.15x | -| plrabn12.txt | 66.26% | 15.17% | 0.50x | 1.12x | -| geo.protodata | 19.68% | 21.67% | 0.50x | 1.03x | -| kppkn.gtb | 37.72% | 11.04% | 0.53x | 0.89x | -| alice29.txt (128B) | 62.50% | -1.25% | 0.47x | 1.07x | -| alice29.txt (1000B) | 77.40% | 3.36% | 0.08x | 0.88x | -| alice29.txt (10000B) | 66.48% | 6.23% | 0.16x | 0.89x | -| alice29.txt (20000B) | 63.43% | 8.69% | 0.29x | 0.89x | +| html | 22843 | 18972 | 16246 MB/s | 8621 MB/s | 40972 MB/s | 40292 MB/s | +| urls.10K | 335492 | 248079 | 7943 MB/s | 5104 MB/s | 22523 MB/s | 20981 MB/s | +| fireworks.jpeg | 123034 | 123100 | 349544 MB/s | 84429 MB/s | 718321 MB/s | 823698 MB/s | +| fireworks.jpeg (200B) | 146 | 149 | 8869 MB/s | 7125 MB/s | 33691 MB/s | 30101 MB/s | +| paper-100k.pdf | 85304 | 82887 | 167546 MB/s | 11087 MB/s | 326905 MB/s | 198869 MB/s | +| html_x_4 | 92234 | 18982 | 15194 MB/s | 29316 MB/s | 30843 MB/s | 30937 MB/s | +| alice29.txt | 88034 | 71611 | 5936 MB/s | 3709 MB/s | 12882 MB/s | 16611 MB/s | +| asyoulik.txt | 77503 | 65941 | 5517 MB/s | 3380 MB/s | 12735 MB/s | 14975 MB/s | +| lcet10.txt | 234661 | 184939 | 6235 MB/s | 3537 MB/s | 14519 MB/s | 16634 MB/s | +| plrabn12.txt | 319267 | 264990 | 5159 MB/s | 2960 MB/s | 11923 MB/s | 13382 MB/s | +| geo.protodata | 23335 | 17689 | 21220 MB/s | 10859 MB/s | 56271 MB/s | 57961 MB/s | +| kppkn.gtb | 69526 | 55398 | 9732 MB/s | 5206 MB/s | 18491 MB/s | 16524 MB/s | +| alice29.txt (128B) | 80 | 78 | 6691 MB/s | 7422 MB/s | 31883 MB/s | 34225 MB/s | +| alice29.txt (1000B) | 774 | 746 | 12204 MB/s | 5734 MB/s | 48056 MB/s | 42068 MB/s | +| alice29.txt (10000B) | 6648 | 6218 | 10044 MB/s | 6055 MB/s | 32378 MB/s | 28813 MB/s | +| alice29.txt (20000B) | 12686 | 11492 | 7733 MB/s | 3143 MB/s | 30566 MB/s | 27315 MB/s | + Except for the mostly incompressible JPEG image compression is better and usually in the double digits in terms of percentage reduction over Snappy. @@ -585,29 +567,29 @@ Some examples compared on 16 core CPU, amd64 assembly used: ``` * enwik10 -Default... 10000000000 -> 4761467548 [47.61%]; 1.098s, 8685.6MB/s -Better... 10000000000 -> 4219438251 [42.19%]; 1.925s, 4954.2MB/s -Best... 10000000000 -> 3627364337 [36.27%]; 43.051s, 221.5MB/s +Default... 10000000000 -> 4759950115 [47.60%]; 1.03s, 9263.0MB/s +Better... 10000000000 -> 4084706676 [40.85%]; 2.16s, 4415.4MB/s +Best... 10000000000 -> 3615520079 [36.16%]; 42.259s, 225.7MB/s * github-june-2days-2019.json -Default... 6273951764 -> 1043196283 [16.63%]; 431ms, 13882.3MB/s -Better... 6273951764 -> 949146808 [15.13%]; 547ms, 10938.4MB/s -Best... 6273951764 -> 832855506 [13.27%]; 9.455s, 632.8MB/s +Default... 6273951764 -> 1041700255 [16.60%]; 431ms, 13882.3MB/s +Better... 6273951764 -> 945841238 [15.08%]; 547ms, 10938.4MB/s +Best... 6273951764 -> 826392576 [13.17%]; 9.455s, 632.8MB/s * nyc-taxi-data-10M.csv -Default... 3325605752 -> 1095998837 [32.96%]; 324ms, 9788.7MB/s -Better... 3325605752 -> 954776589 [28.71%]; 491ms, 6459.4MB/s -Best... 
3325605752 -> 779098746 [23.43%]; 8.29s, 382.6MB/s +Default... 3325605752 -> 1093516949 [32.88%]; 324ms, 9788.7MB/s +Better... 3325605752 -> 885394158 [26.62%]; 491ms, 6459.4MB/s +Best... 3325605752 -> 773681257 [23.26%]; 8.29s, 412.0MB/s * 10gb.tar -Default... 10065157632 -> 5916578242 [58.78%]; 1.028s, 9337.4MB/s -Better... 10065157632 -> 5649207485 [56.13%]; 1.597s, 6010.6MB/s -Best... 10065157632 -> 5208719802 [51.75%]; 32.78s, 292.8MB/ +Default... 10065157632 -> 5915541066 [58.77%]; 1.028s, 9337.4MB/s +Better... 10065157632 -> 5453844650 [54.19%]; 1.597s, 4862.7MB/s +Best... 10065157632 -> 5192495021 [51.59%]; 32.78s, 308.2MB/ * consensus.db.10gb -Default... 10737418240 -> 4562648848 [42.49%]; 882ms, 11610.0MB/s -Better... 10737418240 -> 4542428129 [42.30%]; 1.533s, 6679.7MB/s -Best... 10737418240 -> 4244773384 [39.53%]; 42.96s, 238.4MB/s +Default... 10737418240 -> 4549762344 [42.37%]; 882ms, 12118.4MB/s +Better... 10737418240 -> 4438535064 [41.34%]; 1.533s, 3500.9MB/s +Best... 10737418240 -> 4210602774 [39.21%]; 42.96s, 254.4MB/s ``` Decompression speed should be around the same as using the 'better' compression mode. @@ -628,10 +610,10 @@ If you would like more control, you can use the s2 package as described below: Snappy compatible blocks can be generated with the S2 encoder. Compression and speed is typically a bit better `MaxEncodedLen` is also smaller for smaller memory usage. Replace -| Snappy | S2 replacement | -|----------------------------|-------------------------| -| snappy.Encode(...) | s2.EncodeSnappy(...) | -| snappy.MaxEncodedLen(...) | s2.MaxEncodedLen(...) | +| Snappy | S2 replacement | +|---------------------------|-----------------------| +| snappy.Encode(...) | s2.EncodeSnappy(...) | +| snappy.MaxEncodedLen(...) | s2.MaxEncodedLen(...) | `s2.EncodeSnappy` can be replaced with `s2.EncodeSnappyBetter` or `s2.EncodeSnappyBest` to get more efficiently compressed snappy compatible output. @@ -640,12 +622,12 @@ Compression and speed is typically a bit better `MaxEncodedLen` is also smaller Comparison of [`webdevdata.org-2015-01-07-subset`](https://files.klauspost.com/compress/webdevdata.org-2015-01-07-4GB-subset.7z), 53927 files, total input size: 4,014,735,833 bytes. amd64, single goroutine used: -| Encoder | Size | MB/s | Reduction | -|-----------------------|------------|------------|------------ -| snappy.Encode | 1128706759 | 725.59 | 71.89% | -| s2.EncodeSnappy | 1093823291 | **899.16** | 72.75% | -| s2.EncodeSnappyBetter | 1001158548 | 578.49 | 75.06% | -| s2.EncodeSnappyBest | 944507998 | 66.00 | **76.47%**| +| Encoder | Size | MB/s | Reduction | +|-----------------------|------------|------------|------------| +| snappy.Encode | 1128706759 | 725.59 | 71.89% | +| s2.EncodeSnappy | 1093823291 | **899.16** | 72.75% | +| s2.EncodeSnappyBetter | 1001158548 | 578.49 | 75.06% | +| s2.EncodeSnappyBest | 944507998 | 66.00 | **76.47%** | ## Streams @@ -704,7 +686,7 @@ To automatically add an index to a stream, add `WriterAddIndex()` option to your Then the index will be added to the stream when `Close()` is called. ``` - // Add Index to stream... + // Add Index to stream... enc := s2.NewWriter(w, s2.WriterAddIndex()) io.Copy(enc, r) enc.Close() @@ -714,7 +696,7 @@ If you want to store the index separately, you can use `CloseIndex()` instead of This will return the index. Note that `CloseIndex()` should only be called once, and you shouldn't call `Close()`. ``` - // Get index for separate storage... + // Get index for separate storage... 
enc := s2.NewWriter(w) io.Copy(enc, r) index, err := enc.CloseIndex() @@ -815,6 +797,13 @@ This is done using the regular "Skip" function: This will ensure that we are at exactly the offset we want, and reading from `dec` will start at the requested offset. +# Compact storage + +For compact storage [RemoveIndexHeaders](https://pkg.go.dev/github.com/klauspost/compress/s2#RemoveIndexHeaders) can be used to remove any redundant info from +a serialized index. If you remove the header it must be restored before [Loading](https://pkg.go.dev/github.com/klauspost/compress/s2#Index.Load). + +This is expected to save 20 bytes. These can be restored using [RestoreIndexHeaders](https://pkg.go.dev/github.com/klauspost/compress/s2#RestoreIndexHeaders). This removes a layer of security, but is the most compact representation. Returns nil if headers contains errors. + ## Index Format: Each block is structured as a snappy skippable block, with the chunk ID 0x99. @@ -824,20 +813,20 @@ The block can be read from the front, but contains information so it can be read Numbers are stored as fixed size little endian values or [zigzag encoded](https://developers.google.com/protocol-buffers/docs/encoding#signed_integers) [base 128 varints](https://developers.google.com/protocol-buffers/docs/encoding), with un-encoded value length of 64 bits, unless other limits are specified. -| Content | Format | -|---------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------| -| ID, `[1]byte` | Always 0x99. | -| Data Length, `[3]byte` | 3 byte little-endian length of the chunk in bytes, following this. | -| Header `[6]byte` | Header, must be `[115, 50, 105, 100, 120, 0]` or in text: "s2idx\x00". | -| UncompressedSize, Varint | Total Uncompressed size. | -| CompressedSize, Varint | Total Compressed size if known. Should be -1 if unknown. | -| EstBlockSize, Varint | Block Size, used for guessing uncompressed offsets. Must be >= 0. | -| Entries, Varint | Number of Entries in index, must be < 65536 and >=0. | -| HasUncompressedOffsets `byte` | 0 if no uncompressed offsets are present, 1 if present. Other values are invalid. | -| UncompressedOffsets, [Entries]VarInt | Uncompressed offsets. See below how to decode. | -| CompressedOffsets, [Entries]VarInt | Compressed offsets. See below how to decode. | -| Block Size, `[4]byte` | Little Endian total encoded size (including header and trailer). Can be used for searching backwards to start of block. | -| Trailer `[6]byte` | Trailer, must be `[0, 120, 100, 105, 50, 115]` or in text: "\x00xdi2s". Can be used for identifying block from end of stream. | +| Content | Format | +|--------------------------------------|-------------------------------------------------------------------------------------------------------------------------------| +| ID, `[1]byte` | Always 0x99. | +| Data Length, `[3]byte` | 3 byte little-endian length of the chunk in bytes, following this. | +| Header `[6]byte` | Header, must be `[115, 50, 105, 100, 120, 0]` or in text: "s2idx\x00". | +| UncompressedSize, Varint | Total Uncompressed size. | +| CompressedSize, Varint | Total Compressed size if known. Should be -1 if unknown. | +| EstBlockSize, Varint | Block Size, used for guessing uncompressed offsets. Must be >= 0. | +| Entries, Varint | Number of Entries in index, must be < 65536 and >=0. 
| +| HasUncompressedOffsets `byte` | 0 if no uncompressed offsets are present, 1 if present. Other values are invalid. | +| UncompressedOffsets, [Entries]VarInt | Uncompressed offsets. See below how to decode. | +| CompressedOffsets, [Entries]VarInt | Compressed offsets. See below how to decode. | +| Block Size, `[4]byte` | Little Endian total encoded size (including header and trailer). Can be used for searching backwards to start of block. | +| Trailer `[6]byte` | Trailer, must be `[0, 120, 100, 105, 50, 115]` or in text: "\x00xdi2s". Can be used for identifying block from end of stream. | For regular streams the uncompressed offsets are fully predictable, so `HasUncompressedOffsets` allows to specify that compressed blocks all have @@ -873,7 +862,7 @@ for each entry { } // Uncompressed uses previous offset and adds EstBlockSize - entry[entryNum].UncompressedOffset = entry[entryNum-1].UncompressedOffset + EstBlockSize + entry[entryNum].UncompressedOffset = entry[entryNum-1].UncompressedOffset + EstBlockSize + uOff } @@ -894,13 +883,22 @@ for each entry { } // Compressed uses previous and our estimate. - entry[entryNum].CompressedOffset = entry[entryNum-1].CompressedOffset + CompressGuess + entry[entryNum].CompressedOffset = entry[entryNum-1].CompressedOffset + CompressGuess + cOff // Adjust compressed offset for next loop, integer truncating division must be used. CompressGuess += cOff/2 } ``` +To decode from any given uncompressed offset `(wantOffset)`: + +* Iterate entries until `entry[n].UncompressedOffset > wantOffset`. +* Start decoding from `entry[n-1].CompressedOffset`. +* Discard `entry[n-1].UncompressedOffset - wantOffset` bytes from the decoded stream. + +See [using indexes](https://github.com/klauspost/compress/tree/master/s2#using-indexes) for functions that perform the operations with a simpler interface. + + # Format Extensions * Frame [Stream identifier](https://github.com/google/snappy/blob/master/framing_format.txt#L68) changed from `sNaPpY` to `S2sTwO`. @@ -923,10 +921,11 @@ The length is specified by reading the 3-bit length specified in the tag and dec | 7 | 65540 + read 3 bytes | This allows any repeat offset + length to be represented by 2 to 5 bytes. +It also allows to emit matches longer than 64 bytes with one copy + one repeat instead of several 64 byte copies. Lengths are stored as little endian values. -The first copy of a block cannot be a repeat offset and the offset is not carried across blocks in streams. +The first copy of a block cannot be a repeat offset and the offset is reset on every block in streams. Default streaming block size is 1MB. diff --git a/vendor/github.com/klauspost/compress/s2/decode.go b/vendor/github.com/klauspost/compress/s2/decode.go index 9e7fce8856e0..27c0f3c2c45f 100644 --- a/vendor/github.com/klauspost/compress/s2/decode.go +++ b/vendor/github.com/klauspost/compress/s2/decode.go @@ -11,6 +11,9 @@ import ( "fmt" "io" "io/ioutil" + "math" + "runtime" + "sync" ) var ( @@ -169,6 +172,14 @@ func ReaderSkippableCB(id uint8, fn func(r io.Reader) error) ReaderOption { } } +// ReaderIgnoreCRC will make the reader skip CRC calculation and checks. +func ReaderIgnoreCRC() ReaderOption { + return func(r *Reader) error { + r.ignoreCRC = true + return nil + } +} + // Reader is an io.Reader that can read Snappy-compressed bytes. type Reader struct { r io.Reader @@ -191,18 +202,19 @@ type Reader struct { paramsOK bool snappyFrame bool ignoreStreamID bool + ignoreCRC bool } // ensureBufferSize will ensure that the buffer can take at least n bytes. 
// If false is returned the buffer exceeds maximum allowed size. func (r *Reader) ensureBufferSize(n int) bool { - if len(r.buf) >= n { - return true - } if n > r.maxBufSize { r.err = ErrCorrupt return false } + if cap(r.buf) >= n { + return true + } // Realloc buffer. r.buf = make([]byte, n) return true @@ -220,6 +232,7 @@ func (r *Reader) Reset(reader io.Reader) { r.err = nil r.i = 0 r.j = 0 + r.blockStart = 0 r.readHeader = r.ignoreStreamID } @@ -344,7 +357,7 @@ func (r *Reader) Read(p []byte) (int, error) { r.err = err return 0, r.err } - if crc(r.decoded[:n]) != checksum { + if !r.ignoreCRC && crc(r.decoded[:n]) != checksum { r.err = ErrCRC return 0, r.err } @@ -385,7 +398,7 @@ func (r *Reader) Read(p []byte) (int, error) { if !r.readFull(r.decoded[:n], false) { return 0, r.err } - if crc(r.decoded[:n]) != checksum { + if !r.ignoreCRC && crc(r.decoded[:n]) != checksum { r.err = ErrCRC return 0, r.err } @@ -435,6 +448,259 @@ func (r *Reader) Read(p []byte) (int, error) { } } +// DecodeConcurrent will decode the full stream to w. +// This function should not be combined with reading, seeking or other operations. +// Up to 'concurrent' goroutines will be used. +// If <= 0, runtime.NumCPU will be used. +// On success the number of bytes decompressed nil and is returned. +// This is mainly intended for bigger streams. +func (r *Reader) DecodeConcurrent(w io.Writer, concurrent int) (written int64, err error) { + if r.i > 0 || r.j > 0 || r.blockStart > 0 { + return 0, errors.New("DecodeConcurrent called after ") + } + if concurrent <= 0 { + concurrent = runtime.NumCPU() + } + + // Write to output + var errMu sync.Mutex + var aErr error + setErr := func(e error) (ok bool) { + errMu.Lock() + defer errMu.Unlock() + if e == nil { + return aErr == nil + } + if aErr == nil { + aErr = e + } + return false + } + hasErr := func() (ok bool) { + errMu.Lock() + v := aErr != nil + errMu.Unlock() + return v + } + + var aWritten int64 + toRead := make(chan []byte, concurrent) + writtenBlocks := make(chan []byte, concurrent) + queue := make(chan chan []byte, concurrent) + reUse := make(chan chan []byte, concurrent) + for i := 0; i < concurrent; i++ { + toRead <- make([]byte, 0, r.maxBufSize) + writtenBlocks <- make([]byte, 0, r.maxBufSize) + reUse <- make(chan []byte, 1) + } + // Writer + var wg sync.WaitGroup + wg.Add(1) + go func() { + defer wg.Done() + for toWrite := range queue { + entry := <-toWrite + reUse <- toWrite + if hasErr() { + writtenBlocks <- entry + continue + } + n, err := w.Write(entry) + want := len(entry) + writtenBlocks <- entry + if err != nil { + setErr(err) + continue + } + if n != want { + setErr(io.ErrShortWrite) + continue + } + aWritten += int64(n) + } + }() + + // Reader + defer func() { + close(queue) + if r.err != nil { + err = r.err + setErr(r.err) + } + wg.Wait() + if err == nil { + err = aErr + } + written = aWritten + }() + + for !hasErr() { + if !r.readFull(r.buf[:4], true) { + if r.err == io.EOF { + r.err = nil + } + return 0, r.err + } + chunkType := r.buf[0] + if !r.readHeader { + if chunkType != chunkTypeStreamIdentifier { + r.err = ErrCorrupt + return 0, r.err + } + r.readHeader = true + } + chunkLen := int(r.buf[1]) | int(r.buf[2])<<8 | int(r.buf[3])<<16 + + // The chunk types are specified at + // https://github.com/google/snappy/blob/master/framing_format.txt + switch chunkType { + case chunkTypeCompressedData: + r.blockStart += int64(r.j) + // Section 4.2. Compressed data (chunk type 0x00). 
+ if chunkLen < checksumSize { + r.err = ErrCorrupt + return 0, r.err + } + if chunkLen > r.maxBufSize { + r.err = ErrCorrupt + return 0, r.err + } + orgBuf := <-toRead + buf := orgBuf[:chunkLen] + + if !r.readFull(buf, false) { + return 0, r.err + } + + checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24 + buf = buf[checksumSize:] + + n, err := DecodedLen(buf) + if err != nil { + r.err = err + return 0, r.err + } + if r.snappyFrame && n > maxSnappyBlockSize { + r.err = ErrCorrupt + return 0, r.err + } + + if n > r.maxBlock { + r.err = ErrCorrupt + return 0, r.err + } + wg.Add(1) + + decoded := <-writtenBlocks + entry := <-reUse + queue <- entry + go func() { + defer wg.Done() + decoded = decoded[:n] + _, err := Decode(decoded, buf) + toRead <- orgBuf + if err != nil { + writtenBlocks <- decoded + setErr(err) + return + } + if !r.ignoreCRC && crc(decoded) != checksum { + writtenBlocks <- decoded + setErr(ErrCRC) + return + } + entry <- decoded + }() + continue + + case chunkTypeUncompressedData: + + // Section 4.3. Uncompressed data (chunk type 0x01). + if chunkLen < checksumSize { + r.err = ErrCorrupt + return 0, r.err + } + if chunkLen > r.maxBufSize { + r.err = ErrCorrupt + return 0, r.err + } + // Grab write buffer + orgBuf := <-writtenBlocks + buf := orgBuf[:checksumSize] + if !r.readFull(buf, false) { + return 0, r.err + } + checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24 + // Read content. + n := chunkLen - checksumSize + + if r.snappyFrame && n > maxSnappyBlockSize { + r.err = ErrCorrupt + return 0, r.err + } + if n > r.maxBlock { + r.err = ErrCorrupt + return 0, r.err + } + // Read uncompressed + buf = orgBuf[:n] + if !r.readFull(buf, false) { + return 0, r.err + } + + if !r.ignoreCRC && crc(buf) != checksum { + r.err = ErrCRC + return 0, r.err + } + entry := <-reUse + queue <- entry + entry <- buf + continue + + case chunkTypeStreamIdentifier: + // Section 4.1. Stream identifier (chunk type 0xff). + if chunkLen != len(magicBody) { + r.err = ErrCorrupt + return 0, r.err + } + if !r.readFull(r.buf[:len(magicBody)], false) { + return 0, r.err + } + if string(r.buf[:len(magicBody)]) != magicBody { + if string(r.buf[:len(magicBody)]) != magicBodySnappy { + r.err = ErrCorrupt + return 0, r.err + } else { + r.snappyFrame = true + } + } else { + r.snappyFrame = false + } + continue + } + + if chunkType <= 0x7f { + // Section 4.5. Reserved unskippable chunks (chunk types 0x02-0x7f). + // fmt.Printf("ERR chunktype: 0x%x\n", chunkType) + r.err = ErrUnsupported + return 0, r.err + } + // Section 4.4 Padding (chunk type 0xfe). + // Section 4.6. Reserved skippable chunks (chunk types 0x80-0xfd). + if chunkLen > maxChunkSize { + // fmt.Printf("ERR chunkLen: 0x%x\n", chunkLen) + r.err = ErrUnsupported + return 0, r.err + } + + // fmt.Printf("skippable: ID: 0x%x, len: 0x%x\n", chunkType, chunkLen) + if !r.skippable(r.buf, chunkLen, false, chunkType) { + return 0, r.err + } + } + return 0, r.err +} + // Skip will skip n bytes forward in the decompressed output. // For larger skips this consumes less CPU and is faster than reading output and discarding it. // CRC is not checked on skipped blocks. @@ -454,7 +720,11 @@ func (r *Reader) Skip(n int64) error { // decoded[i:j] contains decoded bytes that have not yet been passed on. 
left := int64(r.j - r.i) if left >= n { - r.i += int(n) + tmp := int64(r.i) + n + if tmp > math.MaxInt32 { + return errors.New("s2: internal overflow in skip") + } + r.i = int(tmp) return nil } n -= int64(r.j - r.i) @@ -526,6 +796,7 @@ func (r *Reader) Skip(n int64) error { } else { // Skip block completely n -= int64(dLen) + r.blockStart += int64(dLen) dLen = 0 } r.i, r.j = 0, dLen @@ -656,6 +927,15 @@ func (r *Reader) ReadSeeker(random bool, index []byte) (*ReadSeeker, error) { err = r.index.LoadStream(rs) if err != nil { if err == ErrUnsupported { + // If we don't require random seeking, reset input and return. + if !random { + _, err = rs.Seek(pos, io.SeekStart) + if err != nil { + return nil, ErrCantSeek{Reason: "resetting stream returned: " + err.Error()} + } + r.index = nil + return &ReadSeeker{Reader: r}, nil + } return nil, ErrCantSeek{Reason: "input stream does not contain an index"} } return nil, ErrCantSeek{Reason: "reading index returned: " + err.Error()} @@ -699,8 +979,16 @@ func (r *ReadSeeker) Seek(offset int64, whence int) (int64, error) { case io.SeekCurrent: offset += r.blockStart + int64(r.i) case io.SeekEnd: - offset = -offset + if offset > 0 { + return 0, errors.New("seek after end of file") + } + offset = r.index.TotalUncompressed + offset + } + + if offset < 0 { + return 0, errors.New("seek before start of file") } + c, u, err := r.index.Find(offset) if err != nil { return r.blockStart + int64(r.i), err @@ -712,10 +1000,6 @@ func (r *ReadSeeker) Seek(offset int64, whence int) (int64, error) { return 0, err } - if offset < 0 { - offset = r.index.TotalUncompressed + offset - } - r.i = r.j // Remove rest of current block. if u < offset { // Forward inside block diff --git a/vendor/github.com/klauspost/compress/s2/decode_other.go b/vendor/github.com/klauspost/compress/s2/decode_other.go index 1074ebd215e1..11300c3a8104 100644 --- a/vendor/github.com/klauspost/compress/s2/decode_other.go +++ b/vendor/github.com/klauspost/compress/s2/decode_other.go @@ -28,6 +28,9 @@ func s2Decode(dst, src []byte) int { // As long as we can read at least 5 bytes... for s < len(src)-5 { + // Removing bounds checks is SLOWER, when if doing + // in := src[s:s+5] + // Checked on Go 1.18 switch src[s] & 0x03 { case tagLiteral: x := uint32(src[s] >> 2) @@ -38,14 +41,19 @@ func s2Decode(dst, src []byte) int { s += 2 x = uint32(src[s-1]) case x == 61: + in := src[s : s+3] + x = uint32(in[1]) | uint32(in[2])<<8 s += 3 - x = uint32(src[s-2]) | uint32(src[s-1])<<8 case x == 62: + in := src[s : s+4] + // Load as 32 bit and shift down. 
+ x = uint32(in[0]) | uint32(in[1])<<8 | uint32(in[2])<<16 | uint32(in[3])<<24 + x >>= 8 s += 4 - x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16 case x == 63: + in := src[s : s+5] + x = uint32(in[1]) | uint32(in[2])<<8 | uint32(in[3])<<16 | uint32(in[4])<<24 s += 5 - x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24 } length = int(x) + 1 if length > len(dst)-d || length > len(src)-s || (strconv.IntSize == 32 && length <= 0) { @@ -62,8 +70,8 @@ func s2Decode(dst, src []byte) int { case tagCopy1: s += 2 - length = int(src[s-2]) >> 2 & 0x7 toffset := int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1])) + length = int(src[s-2]) >> 2 & 0x7 if toffset == 0 { if debug { fmt.Print("(repeat) ") @@ -71,14 +79,16 @@ func s2Decode(dst, src []byte) int { // keep last offset switch length { case 5: + length = int(src[s]) + 4 s += 1 - length = int(uint32(src[s-1])) + 4 case 6: + in := src[s : s+2] + length = int(uint32(in[0])|(uint32(in[1])<<8)) + (1 << 8) s += 2 - length = int(uint32(src[s-2])|(uint32(src[s-1])<<8)) + (1 << 8) case 7: + in := src[s : s+3] + length = int((uint32(in[2])<<16)|(uint32(in[1])<<8)|uint32(in[0])) + (1 << 16) s += 3 - length = int(uint32(src[s-3])|(uint32(src[s-2])<<8)|(uint32(src[s-1])<<16)) + (1 << 16) default: // 0-> 4 } } else { @@ -86,14 +96,16 @@ func s2Decode(dst, src []byte) int { } length += 4 case tagCopy2: + in := src[s : s+3] + offset = int(uint32(in[1]) | uint32(in[2])<<8) + length = 1 + int(in[0])>>2 s += 3 - length = 1 + int(src[s-3])>>2 - offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8) case tagCopy4: + in := src[s : s+5] + offset = int(uint32(in[1]) | uint32(in[2])<<8 | uint32(in[3])<<16 | uint32(in[4])<<24) + length = 1 + int(in[0])>>2 s += 5 - length = 1 + int(src[s-5])>>2 - offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24) } if offset <= 0 || d < offset || length > len(dst)-d { diff --git a/vendor/github.com/klauspost/compress/s2/encode.go b/vendor/github.com/klauspost/compress/s2/encode.go index 59f992ca6ebe..1aefabf31362 100644 --- a/vendor/github.com/klauspost/compress/s2/encode.go +++ b/vendor/github.com/klauspost/compress/s2/encode.go @@ -1119,12 +1119,6 @@ func (w *Writer) closeIndex(idx bool) ([]byte, error) { if w.appendIndex { w.written += int64(len(index)) } - if true { - _, err := w.index.Load(index) - if err != nil { - panic(err) - } - } } if w.pad > 1 { diff --git a/vendor/github.com/klauspost/compress/s2/encode_all.go b/vendor/github.com/klauspost/compress/s2/encode_all.go index 8b16c38a68f6..54c71d3b5d90 100644 --- a/vendor/github.com/klauspost/compress/s2/encode_all.go +++ b/vendor/github.com/klauspost/compress/s2/encode_all.go @@ -58,8 +58,9 @@ func encodeGo(dst, src []byte) []byte { // been written. // // It also assumes that: +// // len(dst) >= MaxEncodedLen(len(src)) && -// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize +// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize func encodeBlockGo(dst, src []byte) (d int) { // Initialize the hash table. const ( diff --git a/vendor/github.com/klauspost/compress/s2/encode_amd64.go b/vendor/github.com/klauspost/compress/s2/encode_amd64.go index e612225f4d35..6b93daa5ae68 100644 --- a/vendor/github.com/klauspost/compress/s2/encode_amd64.go +++ b/vendor/github.com/klauspost/compress/s2/encode_amd64.go @@ -8,8 +8,9 @@ package s2 // been written. 
// // It also assumes that: +// // len(dst) >= MaxEncodedLen(len(src)) && -// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize +// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize func encodeBlock(dst, src []byte) (d int) { const ( // Use 12 bit table when less than... @@ -43,8 +44,9 @@ func encodeBlock(dst, src []byte) (d int) { // been written. // // It also assumes that: +// // len(dst) >= MaxEncodedLen(len(src)) && -// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize +// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize func encodeBlockBetter(dst, src []byte) (d int) { const ( // Use 12 bit table when less than... @@ -78,8 +80,9 @@ func encodeBlockBetter(dst, src []byte) (d int) { // been written. // // It also assumes that: +// // len(dst) >= MaxEncodedLen(len(src)) && -// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize +// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize func encodeBlockSnappy(dst, src []byte) (d int) { const ( // Use 12 bit table when less than... @@ -112,8 +115,9 @@ func encodeBlockSnappy(dst, src []byte) (d int) { // been written. // // It also assumes that: +// // len(dst) >= MaxEncodedLen(len(src)) && -// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize +// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize func encodeBlockBetterSnappy(dst, src []byte) (d int) { const ( // Use 12 bit table when less than... diff --git a/vendor/github.com/klauspost/compress/s2/encode_best.go b/vendor/github.com/klauspost/compress/s2/encode_best.go index 4480347769a0..1b7ea394fae9 100644 --- a/vendor/github.com/klauspost/compress/s2/encode_best.go +++ b/vendor/github.com/klauspost/compress/s2/encode_best.go @@ -15,8 +15,9 @@ import ( // been written. // // It also assumes that: +// // len(dst) >= MaxEncodedLen(len(src)) && -// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize +// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize func encodeBlockBest(dst, src []byte) (d int) { // Initialize the hash tables. const ( @@ -176,14 +177,21 @@ func encodeBlockBest(dst, src []byte) (d int) { best = bestOf(best, matchAt(getPrev(nextLong), s, uint32(cv), false)) } // Search for a match at best match end, see if that is better. - if sAt := best.s + best.length; sAt < sLimit { - sBack := best.s - backL := best.length + // Allow some bytes at the beginning to mismatch. + // Sweet spot is around 1-2 bytes, but depends on input. + // The skipped bytes are tested in Extend backwards, + // and still picked up as part of the match if they do. + const skipBeginning = 2 + const skipEnd = 1 + if sAt := best.s + best.length - skipEnd; sAt < sLimit { + + sBack := best.s + skipBeginning - skipEnd + backL := best.length - skipBeginning // Load initial values cv = load64(src, sBack) - // Search for mismatch + + // Grab candidates... 
next := lTable[hash8(load64(src, sAt), lTableBits)] - //next := sTable[hash4(load64(src, sAt), sTableBits)] if checkAt := getCur(next) - backL; checkAt > 0 { best = bestOf(best, matchAt(checkAt, sBack, uint32(cv), false)) @@ -191,6 +199,16 @@ func encodeBlockBest(dst, src []byte) (d int) { if checkAt := getPrev(next) - backL; checkAt > 0 { best = bestOf(best, matchAt(checkAt, sBack, uint32(cv), false)) } + // Disabled: Extremely small gain + if false { + next = sTable[hash4(load64(src, sAt), sTableBits)] + if checkAt := getCur(next) - backL; checkAt > 0 { + best = bestOf(best, matchAt(checkAt, sBack, uint32(cv), false)) + } + if checkAt := getPrev(next) - backL; checkAt > 0 { + best = bestOf(best, matchAt(checkAt, sBack, uint32(cv), false)) + } + } } } } @@ -288,8 +306,9 @@ emitRemainder: // been written. // // It also assumes that: +// // len(dst) >= MaxEncodedLen(len(src)) && -// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize +// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize func encodeBlockBestSnappy(dst, src []byte) (d int) { // Initialize the hash tables. const ( @@ -370,7 +389,7 @@ func encodeBlockBestSnappy(dst, src []byte) (d int) { } offset := m.s - m.offset - return score - emitCopySize(offset, m.length) + return score - emitCopyNoRepeatSize(offset, m.length) } matchAt := func(offset, s int, first uint32) match { @@ -546,6 +565,7 @@ emitRemainder: // emitCopySize returns the size to encode the offset+length // // It assumes that: +// // 1 <= offset && offset <= math.MaxUint32 // 4 <= length && length <= 1 << 24 func emitCopySize(offset, length int) int { @@ -567,6 +587,10 @@ func emitCopySize(offset, length int) int { // Offset no more than 2 bytes. if length > 64 { + if offset < 2048 { + // Emit 8 bytes, then rest as repeats... + return 2 + emitRepeatSize(offset, length-8) + } // Emit remaining as repeats, at least 4 bytes remain. return 3 + emitRepeatSize(offset, length-60) } @@ -577,6 +601,29 @@ func emitCopySize(offset, length int) int { return 2 } +// emitCopyNoRepeatSize returns the size to encode the offset+length +// +// It assumes that: +// +// 1 <= offset && offset <= math.MaxUint32 +// 4 <= length && length <= 1 << 24 +func emitCopyNoRepeatSize(offset, length int) int { + if offset >= 65536 { + return 5 + 5*(length/64) + } + + // Offset no more than 2 bytes. + if length > 64 { + // Emit remaining as repeats, at least 4 bytes remain. + return 3 + 3*(length/60) + } + if length >= 12 || offset >= 2048 { + return 3 + } + // Emit the remaining copy, encoded as 2 bytes. + return 2 +} + // emitRepeatSize returns the number of bytes required to encode a repeat. // Length must be at least 4 and < 1<<24 func emitRepeatSize(offset, length int) int { diff --git a/vendor/github.com/klauspost/compress/s2/encode_better.go b/vendor/github.com/klauspost/compress/s2/encode_better.go index 943215b8ae86..3b66ba42bf39 100644 --- a/vendor/github.com/klauspost/compress/s2/encode_better.go +++ b/vendor/github.com/klauspost/compress/s2/encode_better.go @@ -42,8 +42,9 @@ func hash8(u uint64, h uint8) uint32 { // been written. // // It also assumes that: +// // len(dst) >= MaxEncodedLen(len(src)) && -// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize +// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize func encodeBlockBetterGo(dst, src []byte) (d int) { // sLimit is when to stop looking for offset/length copies. 
The inputMargin // lets us use a fast path for emitLiteral in the main loop, while we are @@ -56,7 +57,7 @@ func encodeBlockBetterGo(dst, src []byte) (d int) { // Initialize the hash tables. const ( // Long hash matches. - lTableBits = 16 + lTableBits = 17 maxLTableSize = 1 << lTableBits // Short hash matches. @@ -97,9 +98,26 @@ func encodeBlockBetterGo(dst, src []byte) (d int) { lTable[hashL] = uint32(s) sTable[hashS] = uint32(s) + valLong := load64(src, candidateL) + valShort := load64(src, candidateS) + + // If long matches at least 8 bytes, use that. + if cv == valLong { + break + } + if cv == valShort { + candidateL = candidateS + break + } + // Check repeat at offset checkRep. const checkRep = 1 - if false && uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) { + // Minimum length of a repeat. Tested with various values. + // While 4-5 offers improvements in some, 6 reduces + // regressions significantly. + const wantRepeatBytes = 6 + const repeatMask = ((1 << (wantRepeatBytes * 8)) - 1) << (8 * checkRep) + if false && repeat > 0 && cv&repeatMask == load64(src, s-repeat)&repeatMask { base := s + checkRep // Extend back for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; { @@ -109,8 +127,8 @@ func encodeBlockBetterGo(dst, src []byte) (d int) { d += emitLiteral(dst[d:], src[nextEmit:base]) // Extend forward - candidate := s - repeat + 4 + checkRep - s += 4 + checkRep + candidate := s - repeat + wantRepeatBytes + checkRep + s += wantRepeatBytes + checkRep for s < len(src) { if len(src)-s < 8 { if src[s] == src[candidate] { @@ -127,28 +145,40 @@ func encodeBlockBetterGo(dst, src []byte) (d int) { s += 8 candidate += 8 } - if nextEmit > 0 { - // same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset. - d += emitRepeat(dst[d:], repeat, s-base) - } else { - // First match, cannot be repeat. - d += emitCopy(dst[d:], repeat, s-base) - } + // same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset. + d += emitRepeat(dst[d:], repeat, s-base) nextEmit = s if s >= sLimit { goto emitRemainder } + // Index in-between + index0 := base + 1 + index1 := s - 2 + + cv = load64(src, s) + for index0 < index1 { + cv0 := load64(src, index0) + cv1 := load64(src, index1) + lTable[hash7(cv0, lTableBits)] = uint32(index0) + sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1) + + lTable[hash7(cv1, lTableBits)] = uint32(index1) + sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1) + index0 += 2 + index1 -= 2 + } cv = load64(src, s) continue } - if uint32(cv) == load32(src, candidateL) { + // Long likely matches 7, so take that. + if uint32(cv) == uint32(valLong) { break } // Check our short candidate - if uint32(cv) == load32(src, candidateS) { + if uint32(cv) == uint32(valShort) { // Try a long candidate at s+1 hashL = hash7(cv>>8, lTableBits) candidateL = int(lTable[hashL]) @@ -227,21 +257,29 @@ func encodeBlockBetterGo(dst, src []byte) (d int) { // Do we have space for more, if not bail. 
return 0 } - // Index match start+1 (long) and start+2 (short) + + // Index short & long index0 := base + 1 - // Index match end-2 (long) and end-1 (short) index1 := s - 2 cv0 := load64(src, index0) cv1 := load64(src, index1) - cv = load64(src, s) lTable[hash7(cv0, lTableBits)] = uint32(index0) - lTable[hash7(cv0>>8, lTableBits)] = uint32(index0 + 1) - lTable[hash7(cv1, lTableBits)] = uint32(index1) - lTable[hash7(cv1>>8, lTableBits)] = uint32(index1 + 1) sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1) - sTable[hash4(cv0>>16, sTableBits)] = uint32(index0 + 2) + + lTable[hash7(cv1, lTableBits)] = uint32(index1) sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1) + index0 += 1 + index1 -= 1 + cv = load64(src, s) + + // index every second long in between. + for index0 < index1 { + lTable[hash7(load64(src, index0), lTableBits)] = uint32(index0) + lTable[hash7(load64(src, index1), lTableBits)] = uint32(index1) + index0 += 2 + index1 -= 2 + } } emitRemainder: @@ -260,8 +298,9 @@ emitRemainder: // been written. // // It also assumes that: +// // len(dst) >= MaxEncodedLen(len(src)) && -// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize +// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize func encodeBlockBetterSnappyGo(dst, src []byte) (d int) { // sLimit is when to stop looking for offset/length copies. The inputMargin // lets us use a fast path for emitLiteral in the main loop, while we are @@ -402,21 +441,29 @@ func encodeBlockBetterSnappyGo(dst, src []byte) (d int) { // Do we have space for more, if not bail. return 0 } - // Index match start+1 (long) and start+2 (short) + + // Index short & long index0 := base + 1 - // Index match end-2 (long) and end-1 (short) index1 := s - 2 cv0 := load64(src, index0) cv1 := load64(src, index1) - cv = load64(src, s) lTable[hash7(cv0, lTableBits)] = uint32(index0) - lTable[hash7(cv0>>8, lTableBits)] = uint32(index0 + 1) - lTable[hash7(cv1, lTableBits)] = uint32(index1) - lTable[hash7(cv1>>8, lTableBits)] = uint32(index1 + 1) sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1) - sTable[hash4(cv0>>16, sTableBits)] = uint32(index0 + 2) + + lTable[hash7(cv1, lTableBits)] = uint32(index1) sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1) + index0 += 1 + index1 -= 1 + cv = load64(src, s) + + // index every second long in between. + for index0 < index1 { + lTable[hash7(load64(src, index0), lTableBits)] = uint32(index0) + lTable[hash7(load64(src, index1), lTableBits)] = uint32(index1) + index0 += 2 + index1 -= 2 + } } emitRemainder: diff --git a/vendor/github.com/klauspost/compress/s2/encode_go.go b/vendor/github.com/klauspost/compress/s2/encode_go.go index 43d43534e4f4..db08fc355e1e 100644 --- a/vendor/github.com/klauspost/compress/s2/encode_go.go +++ b/vendor/github.com/klauspost/compress/s2/encode_go.go @@ -12,6 +12,7 @@ import ( // been written. // // It also assumes that: +// // len(dst) >= MaxEncodedLen(len(src)) func encodeBlock(dst, src []byte) (d int) { if len(src) < minNonLiteralBlockSize { @@ -25,6 +26,7 @@ func encodeBlock(dst, src []byte) (d int) { // been written. // // It also assumes that: +// // len(dst) >= MaxEncodedLen(len(src)) func encodeBlockBetter(dst, src []byte) (d int) { return encodeBlockBetterGo(dst, src) @@ -35,6 +37,7 @@ func encodeBlockBetter(dst, src []byte) (d int) { // been written. 
// // It also assumes that: +// // len(dst) >= MaxEncodedLen(len(src)) func encodeBlockBetterSnappy(dst, src []byte) (d int) { return encodeBlockBetterSnappyGo(dst, src) @@ -45,6 +48,7 @@ func encodeBlockBetterSnappy(dst, src []byte) (d int) { // been written. // // It also assumes that: +// // len(dst) >= MaxEncodedLen(len(src)) func encodeBlockSnappy(dst, src []byte) (d int) { if len(src) < minNonLiteralBlockSize { @@ -56,6 +60,7 @@ func encodeBlockSnappy(dst, src []byte) (d int) { // emitLiteral writes a literal chunk and returns the number of bytes written. // // It assumes that: +// // dst is long enough to hold the encoded bytes // 0 <= len(lit) && len(lit) <= math.MaxUint32 func emitLiteral(dst, lit []byte) int { @@ -146,6 +151,7 @@ func emitRepeat(dst []byte, offset, length int) int { // emitCopy writes a copy chunk and returns the number of bytes written. // // It assumes that: +// // dst is long enough to hold the encoded bytes // 1 <= offset && offset <= math.MaxUint32 // 4 <= length && length <= 1 << 24 @@ -180,14 +186,23 @@ func emitCopy(dst []byte, offset, length int) int { // Offset no more than 2 bytes. if length > 64 { - // Emit a length 60 copy, encoded as 3 bytes. - // Emit remaining as repeat value (minimum 4 bytes). - dst[2] = uint8(offset >> 8) - dst[1] = uint8(offset) - dst[0] = 59<<2 | tagCopy2 - length -= 60 + off := 3 + if offset < 2048 { + // emit 8 bytes as tagCopy1, rest as repeats. + dst[1] = uint8(offset) + dst[0] = uint8(offset>>8)<<5 | uint8(8-4)<<2 | tagCopy1 + length -= 8 + off = 2 + } else { + // Emit a length 60 copy, encoded as 3 bytes. + // Emit remaining as repeat value (minimum 4 bytes). + dst[2] = uint8(offset >> 8) + dst[1] = uint8(offset) + dst[0] = 59<<2 | tagCopy2 + length -= 60 + } // Emit remaining as repeats, at least 4 bytes remain. - return 3 + emitRepeat(dst[3:], offset, length) + return off + emitRepeat(dst[off:], offset, length) } if length >= 12 || offset >= 2048 { // Emit the remaining copy, encoded as 3 bytes. @@ -205,6 +220,7 @@ func emitCopy(dst []byte, offset, length int) int { // emitCopyNoRepeat writes a copy chunk and returns the number of bytes written. // // It assumes that: +// // dst is long enough to hold the encoded bytes // 1 <= offset && offset <= math.MaxUint32 // 4 <= length && length <= 1 << 24 @@ -264,8 +280,8 @@ func emitCopyNoRepeat(dst []byte, offset, length int) int { // matchLen returns how many bytes match in a and b // // It assumes that: -// len(a) <= len(b) // +// len(a) <= len(b) func matchLen(a []byte, b []byte) int { b = b[:len(a)] var checked int diff --git a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go index c8cf7b69e81d..7e00bac3eae5 100644 --- a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go +++ b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go @@ -1,10 +1,11 @@ // Code generated by command: go run gen.go -out ../encodeblock_amd64.s -stubs ../encodeblock_amd64.go -pkg=s2. DO NOT EDIT. -//go:build !appengine && !noasm && gc -// +build !appengine,!noasm,gc +//go:build !appengine && !noasm && gc && !noasm package s2 +func _dummy_() + // encodeBlockAsm encodes a non-empty src to a guaranteed-large-enough dst. // Maximum input 4294967295 bytes. // It assumes that the varint-encoded length of the decompressed bytes has already been written. @@ -148,8 +149,9 @@ func encodeSnappyBetterBlockAsm8B(dst []byte, src []byte) int // emitLiteral writes a literal chunk and returns the number of bytes written. 
// // It assumes that: -// dst is long enough to hold the encoded bytes with margin of 0 bytes -// 0 <= len(lit) && len(lit) <= math.MaxUint32 +// +// dst is long enough to hold the encoded bytes with margin of 0 bytes +// 0 <= len(lit) && len(lit) <= math.MaxUint32 // //go:noescape func emitLiteral(dst []byte, lit []byte) int @@ -163,9 +165,10 @@ func emitRepeat(dst []byte, offset int, length int) int // emitCopy writes a copy chunk and returns the number of bytes written. // // It assumes that: -// dst is long enough to hold the encoded bytes -// 1 <= offset && offset <= math.MaxUint32 -// 4 <= length && length <= 1 << 24 +// +// dst is long enough to hold the encoded bytes +// 1 <= offset && offset <= math.MaxUint32 +// 4 <= length && length <= 1 << 24 // //go:noescape func emitCopy(dst []byte, offset int, length int) int @@ -173,9 +176,10 @@ func emitCopy(dst []byte, offset int, length int) int // emitCopyNoRepeat writes a copy chunk and returns the number of bytes written. // // It assumes that: -// dst is long enough to hold the encoded bytes -// 1 <= offset && offset <= math.MaxUint32 -// 4 <= length && length <= 1 << 24 +// +// dst is long enough to hold the encoded bytes +// 1 <= offset && offset <= math.MaxUint32 +// 4 <= length && length <= 1 << 24 // //go:noescape func emitCopyNoRepeat(dst []byte, offset int, length int) int @@ -183,7 +187,8 @@ func emitCopyNoRepeat(dst []byte, offset int, length int) int // matchLen returns how many bytes match in a and b // // It assumes that: -// len(a) <= len(b) +// +// len(a) <= len(b) // //go:noescape func matchLen(a []byte, b []byte) int diff --git a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s index 1ac65a0e352f..81a487d6defa 100644 --- a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s +++ b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s @@ -1,13 +1,20 @@ // Code generated by command: go run gen.go -out ../encodeblock_amd64.s -stubs ../encodeblock_amd64.go -pkg=s2. DO NOT EDIT. 
-// +build !appengine -// +build !noasm -// +build gc +//go:build !appengine && !noasm && gc && !noasm #include "textflag.h" +// func _dummy_() +TEXT ·_dummy_(SB), $0 +#ifdef GOAMD64_v4 +#ifndef GOAMD64_v3 +#define GOAMD64_v3 +#endif +#endif + RET + // func encodeBlockAsm(dst []byte, src []byte) int -// Requires: SSE2 +// Requires: BMI, SSE2 TEXT ·encodeBlockAsm(SB), $65560-56 MOVQ dst_base+0(FP), AX MOVQ $0x00000200, CX @@ -243,35 +250,57 @@ emit_literal_done_repeat_emit_encodeBlockAsm: // matchLen XORL R12, R12 CMPL R9, $0x08 - JL matchlen_single_repeat_extend_encodeBlockAsm + JL matchlen_match4_repeat_extend_encodeBlockAsm matchlen_loopback_repeat_extend_encodeBlockAsm: MOVQ (R10)(R12*1), R11 XORQ (SI)(R12*1), R11 TESTQ R11, R11 JZ matchlen_loop_repeat_extend_encodeBlockAsm - BSFQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP repeat_extend_forward_end_encodeBlockAsm + +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP repeat_extend_forward_end_encodeBlockAsm matchlen_loop_repeat_extend_encodeBlockAsm: LEAL -8(R9), R9 LEAL 8(R12), R12 CMPL R9, $0x08 JGE matchlen_loopback_repeat_extend_encodeBlockAsm - -matchlen_single_repeat_extend_encodeBlockAsm: - TESTL R9, R9 - JZ repeat_extend_forward_end_encodeBlockAsm - -matchlen_single_loopback_repeat_extend_encodeBlockAsm: + JZ repeat_extend_forward_end_encodeBlockAsm + +matchlen_match4_repeat_extend_encodeBlockAsm: + CMPL R9, $0x04 + JL matchlen_match2_repeat_extend_encodeBlockAsm + MOVL (R10)(R12*1), R11 + CMPL (SI)(R12*1), R11 + JNE matchlen_match2_repeat_extend_encodeBlockAsm + SUBL $0x04, R9 + LEAL 4(R12), R12 + +matchlen_match2_repeat_extend_encodeBlockAsm: + CMPL R9, $0x02 + JL matchlen_match1_repeat_extend_encodeBlockAsm + MOVW (R10)(R12*1), R11 + CMPW (SI)(R12*1), R11 + JNE matchlen_match1_repeat_extend_encodeBlockAsm + SUBL $0x02, R9 + LEAL 2(R12), R12 + +matchlen_match1_repeat_extend_encodeBlockAsm: + CMPL R9, $0x01 + JL repeat_extend_forward_end_encodeBlockAsm MOVB (R10)(R12*1), R11 CMPB (SI)(R12*1), R11 JNE repeat_extend_forward_end_encodeBlockAsm LEAL 1(R12), R12 - DECL R9 - JNZ matchlen_single_loopback_repeat_extend_encodeBlockAsm repeat_extend_forward_end_encodeBlockAsm: ADDL R12, CX @@ -444,6 +473,90 @@ four_bytes_remain_repeat_as_copy_encodeBlockAsm: two_byte_offset_repeat_as_copy_encodeBlockAsm: CMPL SI, $0x40 JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm + CMPL DI, $0x00000800 + JAE long_offset_short_repeat_as_copy_encodeBlockAsm + MOVL $0x00000001, R8 + LEAL 16(R8), R8 + MOVB DI, 1(AX) + MOVL DI, R9 + SHRL $0x08, R9 + SHLL $0x05, R9 + ORL R9, R8 + MOVB R8, (AX) + ADDQ $0x02, AX + SUBL $0x08, SI + + // emitRepeat + LEAL -4(SI), SI + JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b + +emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b: + MOVL SI, R8 + LEAL -4(SI), SI + CMPL R8, $0x08 + JLE repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b + CMPL R8, $0x0c + JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b + CMPL DI, $0x00000800 + JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b + +cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b: + CMPL SI, $0x00000104 + JLT repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b + CMPL SI, $0x00010100 + JLT repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b + CMPL SI, $0x0100ffff + JLT repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b + LEAL 
-16842747(SI), SI + MOVW $0x001d, (AX) + MOVW $0xfffb, 2(AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b + +repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b: + LEAL -65536(SI), SI + MOVL SI, DI + MOVW $0x001d, (AX) + MOVW SI, 2(AX) + SARL $0x10, DI + MOVB DI, 4(AX) + ADDQ $0x05, AX + JMP repeat_end_emit_encodeBlockAsm + +repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b: + LEAL -256(SI), SI + MOVW $0x0019, (AX) + MOVW SI, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBlockAsm + +repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b: + LEAL -4(SI), SI + MOVW $0x0015, (AX) + MOVB SI, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBlockAsm + +repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b: + SHLL $0x02, SI + ORL $0x01, SI + MOVW SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm + +repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b: + XORQ R8, R8 + LEAL 1(R8)(SI*4), SI + MOVB DI, 1(AX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm + +long_offset_short_repeat_as_copy_encodeBlockAsm: MOVB $0xee, (AX) MOVW DI, 1(AX) LEAL -60(SI), SI @@ -748,35 +861,57 @@ match_nolit_loop_encodeBlockAsm: // matchLen XORL R10, R10 CMPL DI, $0x08 - JL matchlen_single_match_nolit_encodeBlockAsm + JL matchlen_match4_match_nolit_encodeBlockAsm matchlen_loopback_match_nolit_encodeBlockAsm: MOVQ (R8)(R10*1), R9 XORQ (SI)(R10*1), R9 TESTQ R9, R9 JZ matchlen_loop_match_nolit_encodeBlockAsm - BSFQ R9, R9 - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP match_nolit_end_encodeBlockAsm + +#ifdef GOAMD64_v3 + TZCNTQ R9, R9 + +#else + BSFQ R9, R9 + +#endif + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP match_nolit_end_encodeBlockAsm matchlen_loop_match_nolit_encodeBlockAsm: LEAL -8(DI), DI LEAL 8(R10), R10 CMPL DI, $0x08 JGE matchlen_loopback_match_nolit_encodeBlockAsm - -matchlen_single_match_nolit_encodeBlockAsm: - TESTL DI, DI - JZ match_nolit_end_encodeBlockAsm - -matchlen_single_loopback_match_nolit_encodeBlockAsm: + JZ match_nolit_end_encodeBlockAsm + +matchlen_match4_match_nolit_encodeBlockAsm: + CMPL DI, $0x04 + JL matchlen_match2_match_nolit_encodeBlockAsm + MOVL (R8)(R10*1), R9 + CMPL (SI)(R10*1), R9 + JNE matchlen_match2_match_nolit_encodeBlockAsm + SUBL $0x04, DI + LEAL 4(R10), R10 + +matchlen_match2_match_nolit_encodeBlockAsm: + CMPL DI, $0x02 + JL matchlen_match1_match_nolit_encodeBlockAsm + MOVW (R8)(R10*1), R9 + CMPW (SI)(R10*1), R9 + JNE matchlen_match1_match_nolit_encodeBlockAsm + SUBL $0x02, DI + LEAL 2(R10), R10 + +matchlen_match1_match_nolit_encodeBlockAsm: + CMPL DI, $0x01 + JL match_nolit_end_encodeBlockAsm MOVB (R8)(R10*1), R9 CMPB (SI)(R10*1), R9 JNE match_nolit_end_encodeBlockAsm LEAL 1(R10), R10 - DECL DI - JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm match_nolit_end_encodeBlockAsm: ADDL R10, CX @@ -879,6 +1014,90 @@ four_bytes_remain_match_nolit_encodeBlockAsm: two_byte_offset_match_nolit_encodeBlockAsm: CMPL R10, $0x40 JLE two_byte_offset_short_match_nolit_encodeBlockAsm + CMPL SI, $0x00000800 + JAE long_offset_short_match_nolit_encodeBlockAsm + MOVL $0x00000001, DI + LEAL 16(DI), DI + MOVB SI, 1(AX) + MOVL SI, R8 + SHRL $0x08, R8 + SHLL $0x05, R8 + ORL R8, DI + MOVB DI, (AX) + ADDQ $0x02, AX + SUBL $0x08, R10 + + // emitRepeat + LEAL -4(R10), R10 + JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b + 
+emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short_2b: + MOVL R10, DI + LEAL -4(R10), R10 + CMPL DI, $0x08 + JLE repeat_two_match_nolit_encodeBlockAsm_emit_copy_short_2b + CMPL DI, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b + CMPL SI, $0x00000800 + JLT repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b + +cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b: + CMPL R10, $0x00000104 + JLT repeat_three_match_nolit_encodeBlockAsm_emit_copy_short_2b + CMPL R10, $0x00010100 + JLT repeat_four_match_nolit_encodeBlockAsm_emit_copy_short_2b + CMPL R10, $0x0100ffff + JLT repeat_five_match_nolit_encodeBlockAsm_emit_copy_short_2b + LEAL -16842747(R10), R10 + MOVW $0x001d, (AX) + MOVW $0xfffb, 2(AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + JMP emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short_2b + +repeat_five_match_nolit_encodeBlockAsm_emit_copy_short_2b: + LEAL -65536(R10), R10 + MOVL R10, SI + MOVW $0x001d, (AX) + MOVW R10, 2(AX) + SARL $0x10, SI + MOVB SI, 4(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +repeat_four_match_nolit_encodeBlockAsm_emit_copy_short_2b: + LEAL -256(R10), R10 + MOVW $0x0019, (AX) + MOVW R10, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +repeat_three_match_nolit_encodeBlockAsm_emit_copy_short_2b: + LEAL -4(R10), R10 + MOVW $0x0015, (AX) + MOVB R10, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +repeat_two_match_nolit_encodeBlockAsm_emit_copy_short_2b: + SHLL $0x02, R10 + ORL $0x01, R10 + MOVW R10, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b: + XORQ DI, DI + LEAL 1(DI)(R10*4), R10 + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm + +long_offset_short_match_nolit_encodeBlockAsm: MOVB $0xee, (AX) MOVW SI, 1(AX) LEAL -60(R10), R10 @@ -1069,17 +1288,36 @@ memmove_emit_remainder_encodeBlockAsm: MOVL SI, BX // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_3 CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_4through7 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) +emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm + +emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm + +emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8through16: @@ -1162,7 +1400,7 @@ emit_literal_done_emit_remainder_encodeBlockAsm: RET // func encodeBlockAsm4MB(dst []byte, src 
[]byte) int -// Requires: SSE2 +// Requires: BMI, SSE2 TEXT ·encodeBlockAsm4MB(SB), $65560-56 MOVQ dst_base+0(FP), AX MOVQ $0x00000200, CX @@ -1390,35 +1628,57 @@ emit_literal_done_repeat_emit_encodeBlockAsm4MB: // matchLen XORL R12, R12 CMPL R9, $0x08 - JL matchlen_single_repeat_extend_encodeBlockAsm4MB + JL matchlen_match4_repeat_extend_encodeBlockAsm4MB matchlen_loopback_repeat_extend_encodeBlockAsm4MB: MOVQ (R10)(R12*1), R11 XORQ (SI)(R12*1), R11 TESTQ R11, R11 JZ matchlen_loop_repeat_extend_encodeBlockAsm4MB - BSFQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP repeat_extend_forward_end_encodeBlockAsm4MB + +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP repeat_extend_forward_end_encodeBlockAsm4MB matchlen_loop_repeat_extend_encodeBlockAsm4MB: LEAL -8(R9), R9 LEAL 8(R12), R12 CMPL R9, $0x08 JGE matchlen_loopback_repeat_extend_encodeBlockAsm4MB - -matchlen_single_repeat_extend_encodeBlockAsm4MB: - TESTL R9, R9 - JZ repeat_extend_forward_end_encodeBlockAsm4MB - -matchlen_single_loopback_repeat_extend_encodeBlockAsm4MB: + JZ repeat_extend_forward_end_encodeBlockAsm4MB + +matchlen_match4_repeat_extend_encodeBlockAsm4MB: + CMPL R9, $0x04 + JL matchlen_match2_repeat_extend_encodeBlockAsm4MB + MOVL (R10)(R12*1), R11 + CMPL (SI)(R12*1), R11 + JNE matchlen_match2_repeat_extend_encodeBlockAsm4MB + SUBL $0x04, R9 + LEAL 4(R12), R12 + +matchlen_match2_repeat_extend_encodeBlockAsm4MB: + CMPL R9, $0x02 + JL matchlen_match1_repeat_extend_encodeBlockAsm4MB + MOVW (R10)(R12*1), R11 + CMPW (SI)(R12*1), R11 + JNE matchlen_match1_repeat_extend_encodeBlockAsm4MB + SUBL $0x02, R9 + LEAL 2(R12), R12 + +matchlen_match1_repeat_extend_encodeBlockAsm4MB: + CMPL R9, $0x01 + JL repeat_extend_forward_end_encodeBlockAsm4MB MOVB (R10)(R12*1), R11 CMPB (SI)(R12*1), R11 JNE repeat_extend_forward_end_encodeBlockAsm4MB LEAL 1(R12), R12 - DECL R9 - JNZ matchlen_single_loopback_repeat_extend_encodeBlockAsm4MB repeat_extend_forward_end_encodeBlockAsm4MB: ADDL R12, CX @@ -1569,6 +1829,77 @@ four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB: two_byte_offset_repeat_as_copy_encodeBlockAsm4MB: CMPL SI, $0x40 JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm4MB + CMPL DI, $0x00000800 + JAE long_offset_short_repeat_as_copy_encodeBlockAsm4MB + MOVL $0x00000001, R8 + LEAL 16(R8), R8 + MOVB DI, 1(AX) + SHRL $0x08, DI + SHLL $0x05, DI + ORL DI, R8 + MOVB R8, (AX) + ADDQ $0x02, AX + SUBL $0x08, SI + + // emitRepeat + LEAL -4(SI), SI + JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b + MOVL SI, R8 + LEAL -4(SI), SI + CMPL R8, $0x08 + JLE repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b + CMPL R8, $0x0c + JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b + CMPL DI, $0x00000800 + JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b + +cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b: + CMPL SI, $0x00000104 + JLT repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b + CMPL SI, $0x00010100 + JLT repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b + LEAL -65536(SI), SI + MOVL SI, DI + MOVW $0x001d, (AX) + MOVW SI, 2(AX) + SARL $0x10, DI + MOVB DI, 4(AX) + ADDQ $0x05, AX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b: + LEAL -256(SI), SI + MOVW $0x0019, (AX) + MOVW SI, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBlockAsm4MB + 
+repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b: + LEAL -4(SI), SI + MOVW $0x0015, (AX) + MOVB SI, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b: + SHLL $0x02, SI + ORL $0x01, SI + MOVW SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm4MB + +repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b: + XORQ R8, R8 + LEAL 1(R8)(SI*4), SI + MOVB DI, 1(AX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm4MB + +long_offset_short_repeat_as_copy_encodeBlockAsm4MB: MOVB $0xee, (AX) MOVW DI, 1(AX) LEAL -60(SI), SI @@ -1854,35 +2185,57 @@ match_nolit_loop_encodeBlockAsm4MB: // matchLen XORL R10, R10 CMPL DI, $0x08 - JL matchlen_single_match_nolit_encodeBlockAsm4MB + JL matchlen_match4_match_nolit_encodeBlockAsm4MB matchlen_loopback_match_nolit_encodeBlockAsm4MB: MOVQ (R8)(R10*1), R9 XORQ (SI)(R10*1), R9 TESTQ R9, R9 JZ matchlen_loop_match_nolit_encodeBlockAsm4MB - BSFQ R9, R9 - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP match_nolit_end_encodeBlockAsm4MB + +#ifdef GOAMD64_v3 + TZCNTQ R9, R9 + +#else + BSFQ R9, R9 + +#endif + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP match_nolit_end_encodeBlockAsm4MB matchlen_loop_match_nolit_encodeBlockAsm4MB: LEAL -8(DI), DI LEAL 8(R10), R10 CMPL DI, $0x08 JGE matchlen_loopback_match_nolit_encodeBlockAsm4MB - -matchlen_single_match_nolit_encodeBlockAsm4MB: - TESTL DI, DI - JZ match_nolit_end_encodeBlockAsm4MB - -matchlen_single_loopback_match_nolit_encodeBlockAsm4MB: + JZ match_nolit_end_encodeBlockAsm4MB + +matchlen_match4_match_nolit_encodeBlockAsm4MB: + CMPL DI, $0x04 + JL matchlen_match2_match_nolit_encodeBlockAsm4MB + MOVL (R8)(R10*1), R9 + CMPL (SI)(R10*1), R9 + JNE matchlen_match2_match_nolit_encodeBlockAsm4MB + SUBL $0x04, DI + LEAL 4(R10), R10 + +matchlen_match2_match_nolit_encodeBlockAsm4MB: + CMPL DI, $0x02 + JL matchlen_match1_match_nolit_encodeBlockAsm4MB + MOVW (R8)(R10*1), R9 + CMPW (SI)(R10*1), R9 + JNE matchlen_match1_match_nolit_encodeBlockAsm4MB + SUBL $0x02, DI + LEAL 2(R10), R10 + +matchlen_match1_match_nolit_encodeBlockAsm4MB: + CMPL DI, $0x01 + JL match_nolit_end_encodeBlockAsm4MB MOVB (R8)(R10*1), R9 CMPB (SI)(R10*1), R9 JNE match_nolit_end_encodeBlockAsm4MB LEAL 1(R10), R10 - DECL DI - JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm4MB match_nolit_end_encodeBlockAsm4MB: ADDL R10, CX @@ -1974,6 +2327,77 @@ four_bytes_remain_match_nolit_encodeBlockAsm4MB: two_byte_offset_match_nolit_encodeBlockAsm4MB: CMPL R10, $0x40 JLE two_byte_offset_short_match_nolit_encodeBlockAsm4MB + CMPL SI, $0x00000800 + JAE long_offset_short_match_nolit_encodeBlockAsm4MB + MOVL $0x00000001, DI + LEAL 16(DI), DI + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, DI + MOVB DI, (AX) + ADDQ $0x02, AX + SUBL $0x08, R10 + + // emitRepeat + LEAL -4(R10), R10 + JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b + MOVL R10, DI + LEAL -4(R10), R10 + CMPL DI, $0x08 + JLE repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b + CMPL DI, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b + CMPL SI, $0x00000800 + JLT repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b + +cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b: + CMPL R10, $0x00000104 + JLT repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b + CMPL R10, $0x00010100 + JLT 
repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b + LEAL -65536(R10), R10 + MOVL R10, SI + MOVW $0x001d, (AX) + MOVW R10, 2(AX) + SARL $0x10, SI + MOVB SI, 4(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b: + LEAL -256(R10), R10 + MOVW $0x0019, (AX) + MOVW R10, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b: + LEAL -4(R10), R10 + MOVW $0x0015, (AX) + MOVB R10, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b: + SHLL $0x02, R10 + ORL $0x01, R10 + MOVW R10, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b: + XORQ DI, DI + LEAL 1(DI)(R10*4), R10 + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm4MB + +long_offset_short_match_nolit_encodeBlockAsm4MB: MOVB $0xee, (AX) MOVW SI, 1(AX) LEAL -60(R10), R10 @@ -2145,17 +2569,36 @@ memmove_emit_remainder_encodeBlockAsm4MB: MOVL SI, BX // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_3 CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_4through7 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) +emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB + +emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB + +emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8through16: @@ -2238,7 +2681,7 @@ emit_literal_done_emit_remainder_encodeBlockAsm4MB: RET // func encodeBlockAsm12B(dst []byte, src []byte) int -// Requires: SSE2 +// Requires: BMI, SSE2 TEXT ·encodeBlockAsm12B(SB), $16408-56 MOVQ dst_base+0(FP), AX MOVQ $0x00000080, CX @@ -2455,35 +2898,57 @@ emit_literal_done_repeat_emit_encodeBlockAsm12B: // matchLen XORL R12, R12 CMPL R9, $0x08 - JL matchlen_single_repeat_extend_encodeBlockAsm12B + JL matchlen_match4_repeat_extend_encodeBlockAsm12B matchlen_loopback_repeat_extend_encodeBlockAsm12B: MOVQ (R10)(R12*1), R11 XORQ (SI)(R12*1), R11 TESTQ R11, R11 JZ matchlen_loop_repeat_extend_encodeBlockAsm12B - BSFQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP repeat_extend_forward_end_encodeBlockAsm12B + +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP repeat_extend_forward_end_encodeBlockAsm12B 
matchlen_loop_repeat_extend_encodeBlockAsm12B: LEAL -8(R9), R9 LEAL 8(R12), R12 CMPL R9, $0x08 JGE matchlen_loopback_repeat_extend_encodeBlockAsm12B - -matchlen_single_repeat_extend_encodeBlockAsm12B: - TESTL R9, R9 - JZ repeat_extend_forward_end_encodeBlockAsm12B - -matchlen_single_loopback_repeat_extend_encodeBlockAsm12B: + JZ repeat_extend_forward_end_encodeBlockAsm12B + +matchlen_match4_repeat_extend_encodeBlockAsm12B: + CMPL R9, $0x04 + JL matchlen_match2_repeat_extend_encodeBlockAsm12B + MOVL (R10)(R12*1), R11 + CMPL (SI)(R12*1), R11 + JNE matchlen_match2_repeat_extend_encodeBlockAsm12B + SUBL $0x04, R9 + LEAL 4(R12), R12 + +matchlen_match2_repeat_extend_encodeBlockAsm12B: + CMPL R9, $0x02 + JL matchlen_match1_repeat_extend_encodeBlockAsm12B + MOVW (R10)(R12*1), R11 + CMPW (SI)(R12*1), R11 + JNE matchlen_match1_repeat_extend_encodeBlockAsm12B + SUBL $0x02, R9 + LEAL 2(R12), R12 + +matchlen_match1_repeat_extend_encodeBlockAsm12B: + CMPL R9, $0x01 + JL repeat_extend_forward_end_encodeBlockAsm12B MOVB (R10)(R12*1), R11 CMPB (SI)(R12*1), R11 JNE repeat_extend_forward_end_encodeBlockAsm12B LEAL 1(R12), R12 - DECL R9 - JNZ matchlen_single_loopback_repeat_extend_encodeBlockAsm12B repeat_extend_forward_end_encodeBlockAsm12B: ADDL R12, CX @@ -2542,6 +3007,65 @@ repeat_as_copy_encodeBlockAsm12B: two_byte_offset_repeat_as_copy_encodeBlockAsm12B: CMPL SI, $0x40 JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B + CMPL DI, $0x00000800 + JAE long_offset_short_repeat_as_copy_encodeBlockAsm12B + MOVL $0x00000001, R8 + LEAL 16(R8), R8 + MOVB DI, 1(AX) + SHRL $0x08, DI + SHLL $0x05, DI + ORL DI, R8 + MOVB R8, (AX) + ADDQ $0x02, AX + SUBL $0x08, SI + + // emitRepeat + LEAL -4(SI), SI + JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b + MOVL SI, R8 + LEAL -4(SI), SI + CMPL R8, $0x08 + JLE repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b + CMPL R8, $0x0c + JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b + CMPL DI, $0x00000800 + JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b + +cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b: + CMPL SI, $0x00000104 + JLT repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b + LEAL -256(SI), SI + MOVW $0x0019, (AX) + MOVW SI, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBlockAsm12B + +repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b: + LEAL -4(SI), SI + MOVW $0x0015, (AX) + MOVB SI, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBlockAsm12B + +repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b: + SHLL $0x02, SI + ORL $0x01, SI + MOVW SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm12B + +repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b: + XORQ R8, R8 + LEAL 1(R8)(SI*4), SI + MOVB DI, 1(AX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm12B + +long_offset_short_repeat_as_copy_encodeBlockAsm12B: MOVB $0xee, (AX) MOVW DI, 1(AX) LEAL -60(SI), SI @@ -2804,35 +3328,57 @@ match_nolit_loop_encodeBlockAsm12B: // matchLen XORL R10, R10 CMPL DI, $0x08 - JL matchlen_single_match_nolit_encodeBlockAsm12B + JL matchlen_match4_match_nolit_encodeBlockAsm12B matchlen_loopback_match_nolit_encodeBlockAsm12B: MOVQ (R8)(R10*1), R9 XORQ (SI)(R10*1), R9 TESTQ R9, R9 JZ matchlen_loop_match_nolit_encodeBlockAsm12B - BSFQ R9, R9 - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP match_nolit_end_encodeBlockAsm12B + 
+#ifdef GOAMD64_v3 + TZCNTQ R9, R9 + +#else + BSFQ R9, R9 + +#endif + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP match_nolit_end_encodeBlockAsm12B matchlen_loop_match_nolit_encodeBlockAsm12B: LEAL -8(DI), DI LEAL 8(R10), R10 CMPL DI, $0x08 JGE matchlen_loopback_match_nolit_encodeBlockAsm12B - -matchlen_single_match_nolit_encodeBlockAsm12B: - TESTL DI, DI - JZ match_nolit_end_encodeBlockAsm12B - -matchlen_single_loopback_match_nolit_encodeBlockAsm12B: + JZ match_nolit_end_encodeBlockAsm12B + +matchlen_match4_match_nolit_encodeBlockAsm12B: + CMPL DI, $0x04 + JL matchlen_match2_match_nolit_encodeBlockAsm12B + MOVL (R8)(R10*1), R9 + CMPL (SI)(R10*1), R9 + JNE matchlen_match2_match_nolit_encodeBlockAsm12B + SUBL $0x04, DI + LEAL 4(R10), R10 + +matchlen_match2_match_nolit_encodeBlockAsm12B: + CMPL DI, $0x02 + JL matchlen_match1_match_nolit_encodeBlockAsm12B + MOVW (R8)(R10*1), R9 + CMPW (SI)(R10*1), R9 + JNE matchlen_match1_match_nolit_encodeBlockAsm12B + SUBL $0x02, DI + LEAL 2(R10), R10 + +matchlen_match1_match_nolit_encodeBlockAsm12B: + CMPL DI, $0x01 + JL match_nolit_end_encodeBlockAsm12B MOVB (R8)(R10*1), R9 CMPB (SI)(R10*1), R9 JNE match_nolit_end_encodeBlockAsm12B LEAL 1(R10), R10 - DECL DI - JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm12B match_nolit_end_encodeBlockAsm12B: ADDL R10, CX @@ -2844,6 +3390,65 @@ match_nolit_end_encodeBlockAsm12B: two_byte_offset_match_nolit_encodeBlockAsm12B: CMPL R10, $0x40 JLE two_byte_offset_short_match_nolit_encodeBlockAsm12B + CMPL SI, $0x00000800 + JAE long_offset_short_match_nolit_encodeBlockAsm12B + MOVL $0x00000001, DI + LEAL 16(DI), DI + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, DI + MOVB DI, (AX) + ADDQ $0x02, AX + SUBL $0x08, R10 + + // emitRepeat + LEAL -4(R10), R10 + JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b + MOVL R10, DI + LEAL -4(R10), R10 + CMPL DI, $0x08 + JLE repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short_2b + CMPL DI, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b + CMPL SI, $0x00000800 + JLT repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b + +cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b: + CMPL R10, $0x00000104 + JLT repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short_2b + LEAL -256(R10), R10 + MOVW $0x0019, (AX) + MOVW R10, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm12B + +repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short_2b: + LEAL -4(R10), R10 + MOVW $0x0015, (AX) + MOVB R10, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm12B + +repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short_2b: + SHLL $0x02, R10 + ORL $0x01, R10 + MOVW R10, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm12B + +repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b: + XORQ DI, DI + LEAL 1(DI)(R10*4), R10 + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm12B + +long_offset_short_match_nolit_encodeBlockAsm12B: MOVB $0xee, (AX) MOVW SI, 1(AX) LEAL -60(R10), R10 @@ -2992,17 +3597,36 @@ memmove_emit_remainder_encodeBlockAsm12B: MOVL SI, BX // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_3 CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8 + JB 
emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_4through7 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) +emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8through16: @@ -3085,7 +3709,7 @@ emit_literal_done_emit_remainder_encodeBlockAsm12B: RET // func encodeBlockAsm10B(dst []byte, src []byte) int -// Requires: SSE2 +// Requires: BMI, SSE2 TEXT ·encodeBlockAsm10B(SB), $4120-56 MOVQ dst_base+0(FP), AX MOVQ $0x00000020, CX @@ -3302,35 +3926,57 @@ emit_literal_done_repeat_emit_encodeBlockAsm10B: // matchLen XORL R12, R12 CMPL R9, $0x08 - JL matchlen_single_repeat_extend_encodeBlockAsm10B + JL matchlen_match4_repeat_extend_encodeBlockAsm10B matchlen_loopback_repeat_extend_encodeBlockAsm10B: MOVQ (R10)(R12*1), R11 XORQ (SI)(R12*1), R11 TESTQ R11, R11 JZ matchlen_loop_repeat_extend_encodeBlockAsm10B - BSFQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP repeat_extend_forward_end_encodeBlockAsm10B + +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP repeat_extend_forward_end_encodeBlockAsm10B matchlen_loop_repeat_extend_encodeBlockAsm10B: LEAL -8(R9), R9 LEAL 8(R12), R12 CMPL R9, $0x08 JGE matchlen_loopback_repeat_extend_encodeBlockAsm10B - -matchlen_single_repeat_extend_encodeBlockAsm10B: - TESTL R9, R9 - JZ repeat_extend_forward_end_encodeBlockAsm10B - -matchlen_single_loopback_repeat_extend_encodeBlockAsm10B: + JZ repeat_extend_forward_end_encodeBlockAsm10B + +matchlen_match4_repeat_extend_encodeBlockAsm10B: + CMPL R9, $0x04 + JL matchlen_match2_repeat_extend_encodeBlockAsm10B + MOVL (R10)(R12*1), R11 + CMPL (SI)(R12*1), R11 + JNE matchlen_match2_repeat_extend_encodeBlockAsm10B + SUBL $0x04, R9 + LEAL 4(R12), R12 + +matchlen_match2_repeat_extend_encodeBlockAsm10B: + CMPL R9, $0x02 + JL matchlen_match1_repeat_extend_encodeBlockAsm10B + MOVW (R10)(R12*1), R11 + CMPW (SI)(R12*1), R11 + JNE matchlen_match1_repeat_extend_encodeBlockAsm10B + SUBL $0x02, R9 + LEAL 2(R12), R12 + +matchlen_match1_repeat_extend_encodeBlockAsm10B: + CMPL R9, $0x01 + JL repeat_extend_forward_end_encodeBlockAsm10B MOVB (R10)(R12*1), R11 CMPB (SI)(R12*1), R11 JNE repeat_extend_forward_end_encodeBlockAsm10B LEAL 1(R12), R12 - DECL R9 - JNZ matchlen_single_loopback_repeat_extend_encodeBlockAsm10B repeat_extend_forward_end_encodeBlockAsm10B: ADDL R12, CX @@ -3389,6 +4035,65 @@ repeat_as_copy_encodeBlockAsm10B: two_byte_offset_repeat_as_copy_encodeBlockAsm10B: CMPL SI, $0x40 JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B + CMPL DI, $0x00000800 + JAE long_offset_short_repeat_as_copy_encodeBlockAsm10B + 
MOVL $0x00000001, R8 + LEAL 16(R8), R8 + MOVB DI, 1(AX) + SHRL $0x08, DI + SHLL $0x05, DI + ORL DI, R8 + MOVB R8, (AX) + ADDQ $0x02, AX + SUBL $0x08, SI + + // emitRepeat + LEAL -4(SI), SI + JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b + MOVL SI, R8 + LEAL -4(SI), SI + CMPL R8, $0x08 + JLE repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b + CMPL R8, $0x0c + JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b + CMPL DI, $0x00000800 + JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b + +cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b: + CMPL SI, $0x00000104 + JLT repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b + LEAL -256(SI), SI + MOVW $0x0019, (AX) + MOVW SI, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBlockAsm10B + +repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b: + LEAL -4(SI), SI + MOVW $0x0015, (AX) + MOVB SI, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBlockAsm10B + +repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b: + SHLL $0x02, SI + ORL $0x01, SI + MOVW SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm10B + +repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b: + XORQ R8, R8 + LEAL 1(R8)(SI*4), SI + MOVB DI, 1(AX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm10B + +long_offset_short_repeat_as_copy_encodeBlockAsm10B: MOVB $0xee, (AX) MOVW DI, 1(AX) LEAL -60(SI), SI @@ -3651,46 +4356,127 @@ match_nolit_loop_encodeBlockAsm10B: // matchLen XORL R10, R10 CMPL DI, $0x08 - JL matchlen_single_match_nolit_encodeBlockAsm10B + JL matchlen_match4_match_nolit_encodeBlockAsm10B matchlen_loopback_match_nolit_encodeBlockAsm10B: MOVQ (R8)(R10*1), R9 XORQ (SI)(R10*1), R9 TESTQ R9, R9 JZ matchlen_loop_match_nolit_encodeBlockAsm10B - BSFQ R9, R9 - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP match_nolit_end_encodeBlockAsm10B + +#ifdef GOAMD64_v3 + TZCNTQ R9, R9 + +#else + BSFQ R9, R9 + +#endif + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP match_nolit_end_encodeBlockAsm10B matchlen_loop_match_nolit_encodeBlockAsm10B: LEAL -8(DI), DI LEAL 8(R10), R10 CMPL DI, $0x08 JGE matchlen_loopback_match_nolit_encodeBlockAsm10B - -matchlen_single_match_nolit_encodeBlockAsm10B: - TESTL DI, DI - JZ match_nolit_end_encodeBlockAsm10B - -matchlen_single_loopback_match_nolit_encodeBlockAsm10B: + JZ match_nolit_end_encodeBlockAsm10B + +matchlen_match4_match_nolit_encodeBlockAsm10B: + CMPL DI, $0x04 + JL matchlen_match2_match_nolit_encodeBlockAsm10B + MOVL (R8)(R10*1), R9 + CMPL (SI)(R10*1), R9 + JNE matchlen_match2_match_nolit_encodeBlockAsm10B + SUBL $0x04, DI + LEAL 4(R10), R10 + +matchlen_match2_match_nolit_encodeBlockAsm10B: + CMPL DI, $0x02 + JL matchlen_match1_match_nolit_encodeBlockAsm10B + MOVW (R8)(R10*1), R9 + CMPW (SI)(R10*1), R9 + JNE matchlen_match1_match_nolit_encodeBlockAsm10B + SUBL $0x02, DI + LEAL 2(R10), R10 + +matchlen_match1_match_nolit_encodeBlockAsm10B: + CMPL DI, $0x01 + JL match_nolit_end_encodeBlockAsm10B MOVB (R8)(R10*1), R9 CMPB (SI)(R10*1), R9 JNE match_nolit_end_encodeBlockAsm10B LEAL 1(R10), R10 - DECL DI - JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm10B -match_nolit_end_encodeBlockAsm10B: - ADDL R10, CX - MOVL 16(SP), SI - ADDL $0x04, R10 - MOVL CX, 12(SP) +match_nolit_end_encodeBlockAsm10B: + ADDL R10, CX + MOVL 16(SP), SI + ADDL $0x04, R10 + MOVL CX, 12(SP) + + // emitCopy 
+two_byte_offset_match_nolit_encodeBlockAsm10B: + CMPL R10, $0x40 + JLE two_byte_offset_short_match_nolit_encodeBlockAsm10B + CMPL SI, $0x00000800 + JAE long_offset_short_match_nolit_encodeBlockAsm10B + MOVL $0x00000001, DI + LEAL 16(DI), DI + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, DI + MOVB DI, (AX) + ADDQ $0x02, AX + SUBL $0x08, R10 + + // emitRepeat + LEAL -4(R10), R10 + JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b + MOVL R10, DI + LEAL -4(R10), R10 + CMPL DI, $0x08 + JLE repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short_2b + CMPL DI, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b + CMPL SI, $0x00000800 + JLT repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b + +cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b: + CMPL R10, $0x00000104 + JLT repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short_2b + LEAL -256(R10), R10 + MOVW $0x0019, (AX) + MOVW R10, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm10B + +repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short_2b: + LEAL -4(R10), R10 + MOVW $0x0015, (AX) + MOVB R10, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm10B + +repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short_2b: + SHLL $0x02, R10 + ORL $0x01, R10 + MOVW R10, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm10B + +repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b: + XORQ DI, DI + LEAL 1(DI)(R10*4), R10 + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm10B - // emitCopy -two_byte_offset_match_nolit_encodeBlockAsm10B: - CMPL R10, $0x40 - JLE two_byte_offset_short_match_nolit_encodeBlockAsm10B +long_offset_short_match_nolit_encodeBlockAsm10B: MOVB $0xee, (AX) MOVW SI, 1(AX) LEAL -60(R10), R10 @@ -3839,17 +4625,36 @@ memmove_emit_remainder_encodeBlockAsm10B: MOVL SI, BX // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_3 CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_4through7 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) +emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8through16: @@ -3932,7 +4737,7 @@ emit_literal_done_emit_remainder_encodeBlockAsm10B: RET // func encodeBlockAsm8B(dst []byte, src []byte) int -// 
Requires: SSE2 +// Requires: BMI, SSE2 TEXT ·encodeBlockAsm8B(SB), $1048-56 MOVQ dst_base+0(FP), AX MOVQ $0x00000008, CX @@ -4149,35 +4954,57 @@ emit_literal_done_repeat_emit_encodeBlockAsm8B: // matchLen XORL R12, R12 CMPL R9, $0x08 - JL matchlen_single_repeat_extend_encodeBlockAsm8B + JL matchlen_match4_repeat_extend_encodeBlockAsm8B matchlen_loopback_repeat_extend_encodeBlockAsm8B: MOVQ (R10)(R12*1), R11 XORQ (SI)(R12*1), R11 TESTQ R11, R11 JZ matchlen_loop_repeat_extend_encodeBlockAsm8B - BSFQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP repeat_extend_forward_end_encodeBlockAsm8B + +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP repeat_extend_forward_end_encodeBlockAsm8B matchlen_loop_repeat_extend_encodeBlockAsm8B: LEAL -8(R9), R9 LEAL 8(R12), R12 CMPL R9, $0x08 JGE matchlen_loopback_repeat_extend_encodeBlockAsm8B - -matchlen_single_repeat_extend_encodeBlockAsm8B: - TESTL R9, R9 - JZ repeat_extend_forward_end_encodeBlockAsm8B - -matchlen_single_loopback_repeat_extend_encodeBlockAsm8B: + JZ repeat_extend_forward_end_encodeBlockAsm8B + +matchlen_match4_repeat_extend_encodeBlockAsm8B: + CMPL R9, $0x04 + JL matchlen_match2_repeat_extend_encodeBlockAsm8B + MOVL (R10)(R12*1), R11 + CMPL (SI)(R12*1), R11 + JNE matchlen_match2_repeat_extend_encodeBlockAsm8B + SUBL $0x04, R9 + LEAL 4(R12), R12 + +matchlen_match2_repeat_extend_encodeBlockAsm8B: + CMPL R9, $0x02 + JL matchlen_match1_repeat_extend_encodeBlockAsm8B + MOVW (R10)(R12*1), R11 + CMPW (SI)(R12*1), R11 + JNE matchlen_match1_repeat_extend_encodeBlockAsm8B + SUBL $0x02, R9 + LEAL 2(R12), R12 + +matchlen_match1_repeat_extend_encodeBlockAsm8B: + CMPL R9, $0x01 + JL repeat_extend_forward_end_encodeBlockAsm8B MOVB (R10)(R12*1), R11 CMPB (SI)(R12*1), R11 JNE repeat_extend_forward_end_encodeBlockAsm8B LEAL 1(R12), R12 - DECL R9 - JNZ matchlen_single_loopback_repeat_extend_encodeBlockAsm8B repeat_extend_forward_end_encodeBlockAsm8B: ADDL R12, CX @@ -4232,6 +5059,61 @@ repeat_as_copy_encodeBlockAsm8B: two_byte_offset_repeat_as_copy_encodeBlockAsm8B: CMPL SI, $0x40 JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B + CMPL DI, $0x00000800 + JAE long_offset_short_repeat_as_copy_encodeBlockAsm8B + MOVL $0x00000001, R8 + LEAL 16(R8), R8 + MOVB DI, 1(AX) + SHRL $0x08, DI + SHLL $0x05, DI + ORL DI, R8 + MOVB R8, (AX) + ADDQ $0x02, AX + SUBL $0x08, SI + + // emitRepeat + LEAL -4(SI), SI + JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b + MOVL SI, DI + LEAL -4(SI), SI + CMPL DI, $0x08 + JLE repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b + CMPL DI, $0x0c + JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b + +cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b: + CMPL SI, $0x00000104 + JLT repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b + LEAL -256(SI), SI + MOVW $0x0019, (AX) + MOVW SI, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBlockAsm8B + +repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b: + LEAL -4(SI), SI + MOVW $0x0015, (AX) + MOVB SI, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBlockAsm8B + +repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b: + SHLL $0x02, SI + ORL $0x01, SI + MOVW SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBlockAsm8B + XORQ R8, R8 + LEAL 1(R8)(SI*4), SI + MOVB DI, 1(AX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) + ADDQ $0x02, AX + JMP 
repeat_end_emit_encodeBlockAsm8B + +long_offset_short_repeat_as_copy_encodeBlockAsm8B: MOVB $0xee, (AX) MOVW DI, 1(AX) LEAL -60(SI), SI @@ -4488,35 +5370,57 @@ match_nolit_loop_encodeBlockAsm8B: // matchLen XORL R10, R10 CMPL DI, $0x08 - JL matchlen_single_match_nolit_encodeBlockAsm8B + JL matchlen_match4_match_nolit_encodeBlockAsm8B matchlen_loopback_match_nolit_encodeBlockAsm8B: MOVQ (R8)(R10*1), R9 XORQ (SI)(R10*1), R9 TESTQ R9, R9 JZ matchlen_loop_match_nolit_encodeBlockAsm8B - BSFQ R9, R9 - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP match_nolit_end_encodeBlockAsm8B + +#ifdef GOAMD64_v3 + TZCNTQ R9, R9 + +#else + BSFQ R9, R9 + +#endif + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP match_nolit_end_encodeBlockAsm8B matchlen_loop_match_nolit_encodeBlockAsm8B: LEAL -8(DI), DI LEAL 8(R10), R10 CMPL DI, $0x08 JGE matchlen_loopback_match_nolit_encodeBlockAsm8B - -matchlen_single_match_nolit_encodeBlockAsm8B: - TESTL DI, DI - JZ match_nolit_end_encodeBlockAsm8B - -matchlen_single_loopback_match_nolit_encodeBlockAsm8B: + JZ match_nolit_end_encodeBlockAsm8B + +matchlen_match4_match_nolit_encodeBlockAsm8B: + CMPL DI, $0x04 + JL matchlen_match2_match_nolit_encodeBlockAsm8B + MOVL (R8)(R10*1), R9 + CMPL (SI)(R10*1), R9 + JNE matchlen_match2_match_nolit_encodeBlockAsm8B + SUBL $0x04, DI + LEAL 4(R10), R10 + +matchlen_match2_match_nolit_encodeBlockAsm8B: + CMPL DI, $0x02 + JL matchlen_match1_match_nolit_encodeBlockAsm8B + MOVW (R8)(R10*1), R9 + CMPW (SI)(R10*1), R9 + JNE matchlen_match1_match_nolit_encodeBlockAsm8B + SUBL $0x02, DI + LEAL 2(R10), R10 + +matchlen_match1_match_nolit_encodeBlockAsm8B: + CMPL DI, $0x01 + JL match_nolit_end_encodeBlockAsm8B MOVB (R8)(R10*1), R9 CMPB (SI)(R10*1), R9 JNE match_nolit_end_encodeBlockAsm8B LEAL 1(R10), R10 - DECL DI - JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm8B match_nolit_end_encodeBlockAsm8B: ADDL R10, CX @@ -4528,6 +5432,61 @@ match_nolit_end_encodeBlockAsm8B: two_byte_offset_match_nolit_encodeBlockAsm8B: CMPL R10, $0x40 JLE two_byte_offset_short_match_nolit_encodeBlockAsm8B + CMPL SI, $0x00000800 + JAE long_offset_short_match_nolit_encodeBlockAsm8B + MOVL $0x00000001, DI + LEAL 16(DI), DI + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, DI + MOVB DI, (AX) + ADDQ $0x02, AX + SUBL $0x08, R10 + + // emitRepeat + LEAL -4(R10), R10 + JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short_2b + MOVL R10, SI + LEAL -4(R10), R10 + CMPL SI, $0x08 + JLE repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short_2b + CMPL SI, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short_2b + +cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short_2b: + CMPL R10, $0x00000104 + JLT repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short_2b + LEAL -256(R10), R10 + MOVW $0x0019, (AX) + MOVW R10, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm8B + +repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short_2b: + LEAL -4(R10), R10 + MOVW $0x0015, (AX) + MOVB R10, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm8B + +repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short_2b: + SHLL $0x02, R10 + ORL $0x01, R10 + MOVW R10, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm8B + XORQ DI, DI + LEAL 1(DI)(R10*4), R10 + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, R10 + MOVB R10, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBlockAsm8B + +long_offset_short_match_nolit_encodeBlockAsm8B: MOVB $0xee, (AX) MOVW SI, 1(AX) LEAL 
-60(R10), R10 @@ -4670,17 +5629,36 @@ memmove_emit_remainder_encodeBlockAsm8B: MOVL SI, BX // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_3 CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8 + JB emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_4through7 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) +emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8through16: @@ -4763,10 +5741,10 @@ emit_literal_done_emit_remainder_encodeBlockAsm8B: RET // func encodeBetterBlockAsm(dst []byte, src []byte) int -// Requires: SSE2 -TEXT ·encodeBetterBlockAsm(SB), $327704-56 +// Requires: BMI, SSE2 +TEXT ·encodeBetterBlockAsm(SB), $589848-56 MOVQ dst_base+0(FP), AX - MOVQ $0x00000a00, CX + MOVQ $0x00001200, CX LEAQ 24(SP), DX PXOR X0, X0 @@ -4818,27 +5796,37 @@ check_maxskip_cont_encodeBetterBlockAsm: MOVQ DI, R11 SHLQ $0x08, R10 IMULQ R9, R10 - SHRQ $0x30, R10 + SHRQ $0x2f, R10 SHLQ $0x20, R11 IMULQ SI, R11 SHRQ $0x32, R11 MOVL 24(SP)(R10*4), SI - MOVL 262168(SP)(R11*4), R8 + MOVL 524312(SP)(R11*4), R8 MOVL CX, 24(SP)(R10*4) - MOVL CX, 262168(SP)(R11*4) - CMPL (DX)(SI*1), DI + MOVL CX, 524312(SP)(R11*4) + MOVQ (DX)(SI*1), R10 + MOVQ (DX)(R8*1), R11 + CMPQ R10, DI JEQ candidate_match_encodeBetterBlockAsm - CMPL (DX)(R8*1), DI - JEQ candidateS_match_encodeBetterBlockAsm - MOVL 20(SP), CX - JMP search_loop_encodeBetterBlockAsm + CMPQ R11, DI + JNE no_short_found_encodeBetterBlockAsm + MOVL R8, SI + JMP candidate_match_encodeBetterBlockAsm + +no_short_found_encodeBetterBlockAsm: + CMPL R10, DI + JEQ candidate_match_encodeBetterBlockAsm + CMPL R11, DI + JEQ candidateS_match_encodeBetterBlockAsm + MOVL 20(SP), CX + JMP search_loop_encodeBetterBlockAsm candidateS_match_encodeBetterBlockAsm: SHRQ $0x08, DI MOVQ DI, R10 SHLQ $0x08, R10 IMULQ R9, R10 - SHRQ $0x30, R10 + SHRQ $0x2f, R10 MOVL 24(SP)(R10*4), SI INCL CX MOVL CX, 24(SP)(R10*4) @@ -4885,35 +5873,57 @@ match_dst_size_check_encodeBetterBlockAsm: // matchLen XORL R12, R12 CMPL R8, $0x08 - JL matchlen_single_match_nolit_encodeBetterBlockAsm + JL matchlen_match4_match_nolit_encodeBetterBlockAsm matchlen_loopback_match_nolit_encodeBetterBlockAsm: MOVQ (R9)(R12*1), R11 XORQ (R10)(R12*1), R11 TESTQ R11, R11 JZ matchlen_loop_match_nolit_encodeBetterBlockAsm - BSFQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP match_nolit_end_encodeBetterBlockAsm + +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP 
match_nolit_end_encodeBetterBlockAsm matchlen_loop_match_nolit_encodeBetterBlockAsm: LEAL -8(R8), R8 LEAL 8(R12), R12 CMPL R8, $0x08 JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm - -matchlen_single_match_nolit_encodeBetterBlockAsm: - TESTL R8, R8 - JZ match_nolit_end_encodeBetterBlockAsm - -matchlen_single_loopback_match_nolit_encodeBetterBlockAsm: + JZ match_nolit_end_encodeBetterBlockAsm + +matchlen_match4_match_nolit_encodeBetterBlockAsm: + CMPL R8, $0x04 + JL matchlen_match2_match_nolit_encodeBetterBlockAsm + MOVL (R9)(R12*1), R11 + CMPL (R10)(R12*1), R11 + JNE matchlen_match2_match_nolit_encodeBetterBlockAsm + SUBL $0x04, R8 + LEAL 4(R12), R12 + +matchlen_match2_match_nolit_encodeBetterBlockAsm: + CMPL R8, $0x02 + JL matchlen_match1_match_nolit_encodeBetterBlockAsm + MOVW (R9)(R12*1), R11 + CMPW (R10)(R12*1), R11 + JNE matchlen_match1_match_nolit_encodeBetterBlockAsm + SUBL $0x02, R8 + LEAL 2(R12), R12 + +matchlen_match1_match_nolit_encodeBetterBlockAsm: + CMPL R8, $0x01 + JL match_nolit_end_encodeBetterBlockAsm MOVB (R9)(R12*1), R11 CMPB (R10)(R12*1), R11 JNE match_nolit_end_encodeBetterBlockAsm LEAL 1(R12), R12 - DECL R8 - JNZ matchlen_single_loopback_match_nolit_encodeBetterBlockAsm match_nolit_end_encodeBetterBlockAsm: MOVL CX, R8 @@ -5179,6 +6189,90 @@ four_bytes_remain_match_nolit_encodeBetterBlockAsm: two_byte_offset_match_nolit_encodeBetterBlockAsm: CMPL R12, $0x40 JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm + CMPL R8, $0x00000800 + JAE long_offset_short_match_nolit_encodeBetterBlockAsm + MOVL $0x00000001, SI + LEAL 16(SI), SI + MOVB R8, 1(AX) + MOVL R8, R9 + SHRL $0x08, R9 + SHLL $0x05, R9 + ORL R9, SI + MOVB SI, (AX) + ADDQ $0x02, AX + SUBL $0x08, R12 + + // emitRepeat + LEAL -4(R12), R12 + JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b + +emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b: + MOVL R12, SI + LEAL -4(R12), R12 + CMPL SI, $0x08 + JLE repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b + CMPL SI, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b + CMPL R8, $0x00000800 + JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b + +cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b: + CMPL R12, $0x00000104 + JLT repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b + CMPL R12, $0x00010100 + JLT repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b + CMPL R12, $0x0100ffff + JLT repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b + LEAL -16842747(R12), R12 + MOVW $0x001d, (AX) + MOVW $0xfffb, 2(AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + JMP emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b + +repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b: + LEAL -65536(R12), R12 + MOVL R12, R8 + MOVW $0x001d, (AX) + MOVW R12, 2(AX) + SARL $0x10, R8 + MOVB R8, 4(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b: + LEAL -256(R12), R12 + MOVW $0x0019, (AX) + MOVW R12, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b: + LEAL -4(R12), R12 + MOVW $0x0015, (AX) + MOVB R12, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b: + SHLL $0x02, R12 + ORL $0x01, R12 + MOVW R12, (AX) + ADDQ $0x02, AX 
+ JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b: + XORQ SI, SI + LEAL 1(SI)(R12*4), R12 + MOVB R8, 1(AX) + SARL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm + +long_offset_short_match_nolit_encodeBetterBlockAsm: MOVB $0xee, (AX) MOVW R8, 1(AX) LEAL -60(R12), R12 @@ -5505,52 +6599,49 @@ match_nolit_emitcopy_end_encodeBetterBlockAsm: match_nolit_dst_ok_encodeBetterBlockAsm: MOVQ $0x00cf1bbcdcbfa563, SI MOVQ $0x9e3779b1, R8 - INCL DI - MOVQ (DX)(DI*1), R9 - MOVQ R9, R10 - MOVQ R9, R11 - MOVQ R9, R12 - SHRQ $0x08, R11 - MOVQ R11, R13 - SHRQ $0x10, R12 - LEAL 1(DI), R14 - LEAL 2(DI), R15 - MOVQ -2(DX)(CX*1), R9 + LEAQ 1(DI), DI + LEAQ -2(CX), R9 + MOVQ (DX)(DI*1), R10 + MOVQ 1(DX)(DI*1), R11 + MOVQ (DX)(R9*1), R12 + MOVQ 1(DX)(R9*1), R13 SHLQ $0x08, R10 IMULQ SI, R10 - SHRQ $0x30, R10 - SHLQ $0x08, R13 - IMULQ SI, R13 - SHRQ $0x30, R13 + SHRQ $0x2f, R10 SHLQ $0x20, R11 IMULQ R8, R11 SHRQ $0x32, R11 - SHLQ $0x20, R12 - IMULQ R8, R12 - SHRQ $0x32, R12 + SHLQ $0x08, R12 + IMULQ SI, R12 + SHRQ $0x2f, R12 + SHLQ $0x20, R13 + IMULQ R8, R13 + SHRQ $0x32, R13 + LEAQ 1(DI), R8 + LEAQ 1(R9), R14 MOVL DI, 24(SP)(R10*4) - MOVL R14, 24(SP)(R13*4) - MOVL R14, 262168(SP)(R11*4) - MOVL R15, 262168(SP)(R12*4) - MOVQ R9, R10 - MOVQ R9, R11 - SHRQ $0x08, R11 - MOVQ R11, R13 - LEAL -2(CX), R9 - LEAL -1(CX), DI + MOVL R9, 24(SP)(R12*4) + MOVL R8, 524312(SP)(R11*4) + MOVL R14, 524312(SP)(R13*4) + ADDQ $0x01, DI + SUBQ $0x01, R9 + +index_loop_encodeBetterBlockAsm: + CMPQ DI, R9 + JAE search_loop_encodeBetterBlockAsm + MOVQ (DX)(DI*1), R8 + MOVQ (DX)(R9*1), R10 + SHLQ $0x08, R8 + IMULQ SI, R8 + SHRQ $0x2f, R8 SHLQ $0x08, R10 IMULQ SI, R10 - SHRQ $0x30, R10 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x32, R11 - SHLQ $0x08, R13 - IMULQ SI, R13 - SHRQ $0x30, R13 + SHRQ $0x2f, R10 + MOVL DI, 24(SP)(R8*4) MOVL R9, 24(SP)(R10*4) - MOVL DI, 262168(SP)(R11*4) - MOVL DI, 24(SP)(R13*4) - JMP search_loop_encodeBetterBlockAsm + ADDQ $0x02, DI + SUBQ $0x02, R9 + JMP index_loop_encodeBetterBlockAsm emit_remainder_encodeBetterBlockAsm: MOVQ src_len+32(FP), CX @@ -5617,8 +6708,9 @@ memmove_emit_remainder_encodeBetterBlockAsm: MOVL SI, BX // genMemMoveShort - CMPQ BX, $0x04 - JLE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4 + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_3 CMPQ BX, $0x08 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7 CMPQ BX, $0x10 @@ -5627,9 +6719,18 @@ memmove_emit_remainder_encodeBetterBlockAsm: JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4: - MOVL (CX), SI - MOVL SI, (AX) +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7: @@ -5719,10 +6820,10 @@ 
emit_literal_done_emit_remainder_encodeBetterBlockAsm: RET // func encodeBetterBlockAsm4MB(dst []byte, src []byte) int -// Requires: SSE2 -TEXT ·encodeBetterBlockAsm4MB(SB), $327704-56 +// Requires: BMI, SSE2 +TEXT ·encodeBetterBlockAsm4MB(SB), $589848-56 MOVQ dst_base+0(FP), AX - MOVQ $0x00000a00, CX + MOVQ $0x00001200, CX LEAQ 24(SP), DX PXOR X0, X0 @@ -5774,27 +6875,37 @@ check_maxskip_cont_encodeBetterBlockAsm4MB: MOVQ DI, R11 SHLQ $0x08, R10 IMULQ R9, R10 - SHRQ $0x30, R10 + SHRQ $0x2f, R10 SHLQ $0x20, R11 IMULQ SI, R11 SHRQ $0x32, R11 MOVL 24(SP)(R10*4), SI - MOVL 262168(SP)(R11*4), R8 + MOVL 524312(SP)(R11*4), R8 MOVL CX, 24(SP)(R10*4) - MOVL CX, 262168(SP)(R11*4) - CMPL (DX)(SI*1), DI + MOVL CX, 524312(SP)(R11*4) + MOVQ (DX)(SI*1), R10 + MOVQ (DX)(R8*1), R11 + CMPQ R10, DI JEQ candidate_match_encodeBetterBlockAsm4MB - CMPL (DX)(R8*1), DI - JEQ candidateS_match_encodeBetterBlockAsm4MB - MOVL 20(SP), CX - JMP search_loop_encodeBetterBlockAsm4MB + CMPQ R11, DI + JNE no_short_found_encodeBetterBlockAsm4MB + MOVL R8, SI + JMP candidate_match_encodeBetterBlockAsm4MB + +no_short_found_encodeBetterBlockAsm4MB: + CMPL R10, DI + JEQ candidate_match_encodeBetterBlockAsm4MB + CMPL R11, DI + JEQ candidateS_match_encodeBetterBlockAsm4MB + MOVL 20(SP), CX + JMP search_loop_encodeBetterBlockAsm4MB candidateS_match_encodeBetterBlockAsm4MB: SHRQ $0x08, DI MOVQ DI, R10 SHLQ $0x08, R10 IMULQ R9, R10 - SHRQ $0x30, R10 + SHRQ $0x2f, R10 MOVL 24(SP)(R10*4), SI INCL CX MOVL CX, 24(SP)(R10*4) @@ -5841,35 +6952,57 @@ match_dst_size_check_encodeBetterBlockAsm4MB: // matchLen XORL R12, R12 CMPL R8, $0x08 - JL matchlen_single_match_nolit_encodeBetterBlockAsm4MB + JL matchlen_match4_match_nolit_encodeBetterBlockAsm4MB matchlen_loopback_match_nolit_encodeBetterBlockAsm4MB: MOVQ (R9)(R12*1), R11 XORQ (R10)(R12*1), R11 TESTQ R11, R11 JZ matchlen_loop_match_nolit_encodeBetterBlockAsm4MB - BSFQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP match_nolit_end_encodeBetterBlockAsm4MB + +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP match_nolit_end_encodeBetterBlockAsm4MB matchlen_loop_match_nolit_encodeBetterBlockAsm4MB: LEAL -8(R8), R8 LEAL 8(R12), R12 CMPL R8, $0x08 JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm4MB - -matchlen_single_match_nolit_encodeBetterBlockAsm4MB: - TESTL R8, R8 - JZ match_nolit_end_encodeBetterBlockAsm4MB - -matchlen_single_loopback_match_nolit_encodeBetterBlockAsm4MB: + JZ match_nolit_end_encodeBetterBlockAsm4MB + +matchlen_match4_match_nolit_encodeBetterBlockAsm4MB: + CMPL R8, $0x04 + JL matchlen_match2_match_nolit_encodeBetterBlockAsm4MB + MOVL (R9)(R12*1), R11 + CMPL (R10)(R12*1), R11 + JNE matchlen_match2_match_nolit_encodeBetterBlockAsm4MB + SUBL $0x04, R8 + LEAL 4(R12), R12 + +matchlen_match2_match_nolit_encodeBetterBlockAsm4MB: + CMPL R8, $0x02 + JL matchlen_match1_match_nolit_encodeBetterBlockAsm4MB + MOVW (R9)(R12*1), R11 + CMPW (R10)(R12*1), R11 + JNE matchlen_match1_match_nolit_encodeBetterBlockAsm4MB + SUBL $0x02, R8 + LEAL 2(R12), R12 + +matchlen_match1_match_nolit_encodeBetterBlockAsm4MB: + CMPL R8, $0x01 + JL match_nolit_end_encodeBetterBlockAsm4MB MOVB (R9)(R12*1), R11 CMPB (R10)(R12*1), R11 JNE match_nolit_end_encodeBetterBlockAsm4MB LEAL 1(R12), R12 - DECL R8 - JNZ matchlen_single_loopback_match_nolit_encodeBetterBlockAsm4MB match_nolit_end_encodeBetterBlockAsm4MB: MOVL CX, R8 @@ -6116,6 +7249,77 @@ four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB: 
two_byte_offset_match_nolit_encodeBetterBlockAsm4MB: CMPL R12, $0x40 JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm4MB + CMPL R8, $0x00000800 + JAE long_offset_short_match_nolit_encodeBetterBlockAsm4MB + MOVL $0x00000001, SI + LEAL 16(SI), SI + MOVB R8, 1(AX) + SHRL $0x08, R8 + SHLL $0x05, R8 + ORL R8, SI + MOVB SI, (AX) + ADDQ $0x02, AX + SUBL $0x08, R12 + + // emitRepeat + LEAL -4(R12), R12 + JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b + MOVL R12, SI + LEAL -4(R12), R12 + CMPL SI, $0x08 + JLE repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b + CMPL SI, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b + CMPL R8, $0x00000800 + JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b + +cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b: + CMPL R12, $0x00000104 + JLT repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b + CMPL R12, $0x00010100 + JLT repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b + LEAL -65536(R12), R12 + MOVL R12, R8 + MOVW $0x001d, (AX) + MOVW R12, 2(AX) + SARL $0x10, R8 + MOVB R8, 4(AX) + ADDQ $0x05, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b: + LEAL -256(R12), R12 + MOVW $0x0019, (AX) + MOVW R12, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b: + LEAL -4(R12), R12 + MOVW $0x0015, (AX) + MOVB R12, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b: + SHLL $0x02, R12 + ORL $0x01, R12 + MOVW R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b: + XORQ SI, SI + LEAL 1(SI)(R12*4), R12 + MOVB R8, 1(AX) + SARL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +long_offset_short_match_nolit_encodeBetterBlockAsm4MB: MOVB $0xee, (AX) MOVW R8, 1(AX) LEAL -60(R12), R12 @@ -6412,52 +7616,49 @@ match_nolit_emitcopy_end_encodeBetterBlockAsm4MB: match_nolit_dst_ok_encodeBetterBlockAsm4MB: MOVQ $0x00cf1bbcdcbfa563, SI MOVQ $0x9e3779b1, R8 - INCL DI - MOVQ (DX)(DI*1), R9 - MOVQ R9, R10 - MOVQ R9, R11 - MOVQ R9, R12 - SHRQ $0x08, R11 - MOVQ R11, R13 - SHRQ $0x10, R12 - LEAL 1(DI), R14 - LEAL 2(DI), R15 - MOVQ -2(DX)(CX*1), R9 + LEAQ 1(DI), DI + LEAQ -2(CX), R9 + MOVQ (DX)(DI*1), R10 + MOVQ 1(DX)(DI*1), R11 + MOVQ (DX)(R9*1), R12 + MOVQ 1(DX)(R9*1), R13 SHLQ $0x08, R10 IMULQ SI, R10 - SHRQ $0x30, R10 - SHLQ $0x08, R13 - IMULQ SI, R13 - SHRQ $0x30, R13 + SHRQ $0x2f, R10 SHLQ $0x20, R11 IMULQ R8, R11 SHRQ $0x32, R11 - SHLQ $0x20, R12 - IMULQ R8, R12 - SHRQ $0x32, R12 + SHLQ $0x08, R12 + IMULQ SI, R12 + SHRQ $0x2f, R12 + SHLQ $0x20, R13 + IMULQ R8, R13 + SHRQ $0x32, R13 + LEAQ 1(DI), R8 + LEAQ 1(R9), R14 MOVL DI, 24(SP)(R10*4) - MOVL R14, 24(SP)(R13*4) - MOVL R14, 262168(SP)(R11*4) - MOVL R15, 262168(SP)(R12*4) - MOVQ R9, R10 - MOVQ R9, R11 - SHRQ $0x08, R11 - MOVQ R11, R13 - LEAL -2(CX), R9 - LEAL -1(CX), DI + MOVL R9, 24(SP)(R12*4) + MOVL R8, 524312(SP)(R11*4) + MOVL R14, 524312(SP)(R13*4) + ADDQ $0x01, DI + SUBQ $0x01, R9 + +index_loop_encodeBetterBlockAsm4MB: + CMPQ DI, R9 + JAE search_loop_encodeBetterBlockAsm4MB + MOVQ (DX)(DI*1), R8 + MOVQ 
(DX)(R9*1), R10 + SHLQ $0x08, R8 + IMULQ SI, R8 + SHRQ $0x2f, R8 SHLQ $0x08, R10 IMULQ SI, R10 - SHRQ $0x30, R10 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x32, R11 - SHLQ $0x08, R13 - IMULQ SI, R13 - SHRQ $0x30, R13 + SHRQ $0x2f, R10 + MOVL DI, 24(SP)(R8*4) MOVL R9, 24(SP)(R10*4) - MOVL DI, 262168(SP)(R11*4) - MOVL DI, 24(SP)(R13*4) - JMP search_loop_encodeBetterBlockAsm4MB + ADDQ $0x02, DI + SUBQ $0x02, R9 + JMP index_loop_encodeBetterBlockAsm4MB emit_remainder_encodeBetterBlockAsm4MB: MOVQ src_len+32(FP), CX @@ -6516,8 +7717,9 @@ memmove_emit_remainder_encodeBetterBlockAsm4MB: MOVL SI, BX // genMemMoveShort - CMPQ BX, $0x04 - JLE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4 + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_3 CMPQ BX, $0x08 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4through7 CMPQ BX, $0x10 @@ -6526,9 +7728,18 @@ memmove_emit_remainder_encodeBetterBlockAsm4MB: JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4: - MOVL (CX), SI - MOVL SI, (AX) +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4through7: @@ -6618,7 +7829,7 @@ emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB: RET // func encodeBetterBlockAsm12B(dst []byte, src []byte) int -// Requires: SSE2 +// Requires: BMI, SSE2 TEXT ·encodeBetterBlockAsm12B(SB), $81944-56 MOVQ dst_base+0(FP), AX MOVQ $0x00000280, CX @@ -6673,12 +7884,22 @@ search_loop_encodeBetterBlockAsm12B: MOVL 65560(SP)(R11*4), R8 MOVL CX, 24(SP)(R10*4) MOVL CX, 65560(SP)(R11*4) - CMPL (DX)(SI*1), DI + MOVQ (DX)(SI*1), R10 + MOVQ (DX)(R8*1), R11 + CMPQ R10, DI JEQ candidate_match_encodeBetterBlockAsm12B - CMPL (DX)(R8*1), DI - JEQ candidateS_match_encodeBetterBlockAsm12B - MOVL 20(SP), CX - JMP search_loop_encodeBetterBlockAsm12B + CMPQ R11, DI + JNE no_short_found_encodeBetterBlockAsm12B + MOVL R8, SI + JMP candidate_match_encodeBetterBlockAsm12B + +no_short_found_encodeBetterBlockAsm12B: + CMPL R10, DI + JEQ candidate_match_encodeBetterBlockAsm12B + CMPL R11, DI + JEQ candidateS_match_encodeBetterBlockAsm12B + MOVL 20(SP), CX + JMP search_loop_encodeBetterBlockAsm12B candidateS_match_encodeBetterBlockAsm12B: SHRQ $0x08, DI @@ -6732,35 +7953,57 @@ match_dst_size_check_encodeBetterBlockAsm12B: // matchLen XORL R12, R12 CMPL R8, $0x08 - JL matchlen_single_match_nolit_encodeBetterBlockAsm12B + JL matchlen_match4_match_nolit_encodeBetterBlockAsm12B matchlen_loopback_match_nolit_encodeBetterBlockAsm12B: MOVQ (R9)(R12*1), R11 XORQ (R10)(R12*1), R11 TESTQ R11, R11 JZ matchlen_loop_match_nolit_encodeBetterBlockAsm12B - BSFQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP match_nolit_end_encodeBetterBlockAsm12B + +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP 
match_nolit_end_encodeBetterBlockAsm12B matchlen_loop_match_nolit_encodeBetterBlockAsm12B: LEAL -8(R8), R8 LEAL 8(R12), R12 CMPL R8, $0x08 JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm12B - -matchlen_single_match_nolit_encodeBetterBlockAsm12B: - TESTL R8, R8 - JZ match_nolit_end_encodeBetterBlockAsm12B - -matchlen_single_loopback_match_nolit_encodeBetterBlockAsm12B: + JZ match_nolit_end_encodeBetterBlockAsm12B + +matchlen_match4_match_nolit_encodeBetterBlockAsm12B: + CMPL R8, $0x04 + JL matchlen_match2_match_nolit_encodeBetterBlockAsm12B + MOVL (R9)(R12*1), R11 + CMPL (R10)(R12*1), R11 + JNE matchlen_match2_match_nolit_encodeBetterBlockAsm12B + SUBL $0x04, R8 + LEAL 4(R12), R12 + +matchlen_match2_match_nolit_encodeBetterBlockAsm12B: + CMPL R8, $0x02 + JL matchlen_match1_match_nolit_encodeBetterBlockAsm12B + MOVW (R9)(R12*1), R11 + CMPW (R10)(R12*1), R11 + JNE matchlen_match1_match_nolit_encodeBetterBlockAsm12B + SUBL $0x02, R8 + LEAL 2(R12), R12 + +matchlen_match1_match_nolit_encodeBetterBlockAsm12B: + CMPL R8, $0x01 + JL match_nolit_end_encodeBetterBlockAsm12B MOVB (R9)(R12*1), R11 CMPB (R10)(R12*1), R11 JNE match_nolit_end_encodeBetterBlockAsm12B LEAL 1(R12), R12 - DECL R8 - JNZ matchlen_single_loopback_match_nolit_encodeBetterBlockAsm12B match_nolit_end_encodeBetterBlockAsm12B: MOVL CX, R8 @@ -6907,6 +8150,65 @@ emit_literal_done_match_emit_encodeBetterBlockAsm12B: two_byte_offset_match_nolit_encodeBetterBlockAsm12B: CMPL R12, $0x40 JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B + CMPL R8, $0x00000800 + JAE long_offset_short_match_nolit_encodeBetterBlockAsm12B + MOVL $0x00000001, SI + LEAL 16(SI), SI + MOVB R8, 1(AX) + SHRL $0x08, R8 + SHLL $0x05, R8 + ORL R8, SI + MOVB SI, (AX) + ADDQ $0x02, AX + SUBL $0x08, R12 + + // emitRepeat + LEAL -4(R12), R12 + JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b + MOVL R12, SI + LEAL -4(R12), R12 + CMPL SI, $0x08 + JLE repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b + CMPL SI, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b + CMPL R8, $0x00000800 + JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b + +cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b: + CMPL R12, $0x00000104 + JLT repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b + LEAL -256(R12), R12 + MOVW $0x0019, (AX) + MOVW R12, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B + +repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b: + LEAL -4(R12), R12 + MOVW $0x0015, (AX) + MOVB R12, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B + +repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b: + SHLL $0x02, R12 + ORL $0x01, R12 + MOVW R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B + +repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b: + XORQ SI, SI + LEAL 1(SI)(R12*4), R12 + MOVB R8, 1(AX) + SARL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B + +long_offset_short_match_nolit_encodeBetterBlockAsm12B: MOVB $0xee, (AX) MOVW R8, 1(AX) LEAL -60(R12), R12 @@ -7168,52 +8470,49 @@ match_nolit_emitcopy_end_encodeBetterBlockAsm12B: match_nolit_dst_ok_encodeBetterBlockAsm12B: MOVQ $0x0000cf1bbcdcbf9b, SI MOVQ $0x9e3779b1, R8 - INCL DI - MOVQ (DX)(DI*1), R9 - MOVQ R9, R10 - MOVQ R9, R11 - MOVQ R9, R12 - 
SHRQ $0x08, R11 - MOVQ R11, R13 - SHRQ $0x10, R12 - LEAL 1(DI), R14 - LEAL 2(DI), R15 - MOVQ -2(DX)(CX*1), R9 + LEAQ 1(DI), DI + LEAQ -2(CX), R9 + MOVQ (DX)(DI*1), R10 + MOVQ 1(DX)(DI*1), R11 + MOVQ (DX)(R9*1), R12 + MOVQ 1(DX)(R9*1), R13 SHLQ $0x10, R10 IMULQ SI, R10 SHRQ $0x32, R10 - SHLQ $0x10, R13 - IMULQ SI, R13 - SHRQ $0x32, R13 SHLQ $0x20, R11 IMULQ R8, R11 SHRQ $0x34, R11 - SHLQ $0x20, R12 - IMULQ R8, R12 - SHRQ $0x34, R12 + SHLQ $0x10, R12 + IMULQ SI, R12 + SHRQ $0x32, R12 + SHLQ $0x20, R13 + IMULQ R8, R13 + SHRQ $0x34, R13 + LEAQ 1(DI), R8 + LEAQ 1(R9), R14 MOVL DI, 24(SP)(R10*4) - MOVL R14, 24(SP)(R13*4) - MOVL R14, 65560(SP)(R11*4) - MOVL R15, 65560(SP)(R12*4) - MOVQ R9, R10 - MOVQ R9, R11 - SHRQ $0x08, R11 - MOVQ R11, R13 - LEAL -2(CX), R9 - LEAL -1(CX), DI + MOVL R9, 24(SP)(R12*4) + MOVL R8, 65560(SP)(R11*4) + MOVL R14, 65560(SP)(R13*4) + ADDQ $0x01, DI + SUBQ $0x01, R9 + +index_loop_encodeBetterBlockAsm12B: + CMPQ DI, R9 + JAE search_loop_encodeBetterBlockAsm12B + MOVQ (DX)(DI*1), R8 + MOVQ (DX)(R9*1), R10 + SHLQ $0x10, R8 + IMULQ SI, R8 + SHRQ $0x32, R8 SHLQ $0x10, R10 IMULQ SI, R10 SHRQ $0x32, R10 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x34, R11 - SHLQ $0x10, R13 - IMULQ SI, R13 - SHRQ $0x32, R13 + MOVL DI, 24(SP)(R8*4) MOVL R9, 24(SP)(R10*4) - MOVL DI, 65560(SP)(R11*4) - MOVL DI, 24(SP)(R13*4) - JMP search_loop_encodeBetterBlockAsm12B + ADDQ $0x02, DI + SUBQ $0x02, R9 + JMP index_loop_encodeBetterBlockAsm12B emit_remainder_encodeBetterBlockAsm12B: MOVQ src_len+32(FP), CX @@ -7261,8 +8560,9 @@ memmove_emit_remainder_encodeBetterBlockAsm12B: MOVL SI, BX // genMemMoveShort - CMPQ BX, $0x04 - JLE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4 + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_3 CMPQ BX, $0x08 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4through7 CMPQ BX, $0x10 @@ -7271,9 +8571,18 @@ memmove_emit_remainder_encodeBetterBlockAsm12B: JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4: - MOVL (CX), SI - MOVL SI, (AX) +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4through7: @@ -7363,7 +8672,7 @@ emit_literal_done_emit_remainder_encodeBetterBlockAsm12B: RET // func encodeBetterBlockAsm10B(dst []byte, src []byte) int -// Requires: SSE2 +// Requires: BMI, SSE2 TEXT ·encodeBetterBlockAsm10B(SB), $20504-56 MOVQ dst_base+0(FP), AX MOVQ $0x000000a0, CX @@ -7418,12 +8727,22 @@ search_loop_encodeBetterBlockAsm10B: MOVL 16408(SP)(R11*4), R8 MOVL CX, 24(SP)(R10*4) MOVL CX, 16408(SP)(R11*4) - CMPL (DX)(SI*1), DI + MOVQ (DX)(SI*1), R10 + MOVQ (DX)(R8*1), R11 + CMPQ R10, DI JEQ candidate_match_encodeBetterBlockAsm10B - CMPL (DX)(R8*1), DI - JEQ candidateS_match_encodeBetterBlockAsm10B - MOVL 20(SP), CX - JMP search_loop_encodeBetterBlockAsm10B + CMPQ R11, DI + JNE 
no_short_found_encodeBetterBlockAsm10B + MOVL R8, SI + JMP candidate_match_encodeBetterBlockAsm10B + +no_short_found_encodeBetterBlockAsm10B: + CMPL R10, DI + JEQ candidate_match_encodeBetterBlockAsm10B + CMPL R11, DI + JEQ candidateS_match_encodeBetterBlockAsm10B + MOVL 20(SP), CX + JMP search_loop_encodeBetterBlockAsm10B candidateS_match_encodeBetterBlockAsm10B: SHRQ $0x08, DI @@ -7477,35 +8796,57 @@ match_dst_size_check_encodeBetterBlockAsm10B: // matchLen XORL R12, R12 CMPL R8, $0x08 - JL matchlen_single_match_nolit_encodeBetterBlockAsm10B + JL matchlen_match4_match_nolit_encodeBetterBlockAsm10B matchlen_loopback_match_nolit_encodeBetterBlockAsm10B: MOVQ (R9)(R12*1), R11 XORQ (R10)(R12*1), R11 TESTQ R11, R11 JZ matchlen_loop_match_nolit_encodeBetterBlockAsm10B - BSFQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP match_nolit_end_encodeBetterBlockAsm10B + +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP match_nolit_end_encodeBetterBlockAsm10B matchlen_loop_match_nolit_encodeBetterBlockAsm10B: LEAL -8(R8), R8 LEAL 8(R12), R12 CMPL R8, $0x08 JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm10B - -matchlen_single_match_nolit_encodeBetterBlockAsm10B: - TESTL R8, R8 - JZ match_nolit_end_encodeBetterBlockAsm10B - -matchlen_single_loopback_match_nolit_encodeBetterBlockAsm10B: + JZ match_nolit_end_encodeBetterBlockAsm10B + +matchlen_match4_match_nolit_encodeBetterBlockAsm10B: + CMPL R8, $0x04 + JL matchlen_match2_match_nolit_encodeBetterBlockAsm10B + MOVL (R9)(R12*1), R11 + CMPL (R10)(R12*1), R11 + JNE matchlen_match2_match_nolit_encodeBetterBlockAsm10B + SUBL $0x04, R8 + LEAL 4(R12), R12 + +matchlen_match2_match_nolit_encodeBetterBlockAsm10B: + CMPL R8, $0x02 + JL matchlen_match1_match_nolit_encodeBetterBlockAsm10B + MOVW (R9)(R12*1), R11 + CMPW (R10)(R12*1), R11 + JNE matchlen_match1_match_nolit_encodeBetterBlockAsm10B + SUBL $0x02, R8 + LEAL 2(R12), R12 + +matchlen_match1_match_nolit_encodeBetterBlockAsm10B: + CMPL R8, $0x01 + JL match_nolit_end_encodeBetterBlockAsm10B MOVB (R9)(R12*1), R11 CMPB (R10)(R12*1), R11 JNE match_nolit_end_encodeBetterBlockAsm10B LEAL 1(R12), R12 - DECL R8 - JNZ matchlen_single_loopback_match_nolit_encodeBetterBlockAsm10B match_nolit_end_encodeBetterBlockAsm10B: MOVL CX, R8 @@ -7652,6 +8993,65 @@ emit_literal_done_match_emit_encodeBetterBlockAsm10B: two_byte_offset_match_nolit_encodeBetterBlockAsm10B: CMPL R12, $0x40 JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B + CMPL R8, $0x00000800 + JAE long_offset_short_match_nolit_encodeBetterBlockAsm10B + MOVL $0x00000001, SI + LEAL 16(SI), SI + MOVB R8, 1(AX) + SHRL $0x08, R8 + SHLL $0x05, R8 + ORL R8, SI + MOVB SI, (AX) + ADDQ $0x02, AX + SUBL $0x08, R12 + + // emitRepeat + LEAL -4(R12), R12 + JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b + MOVL R12, SI + LEAL -4(R12), R12 + CMPL SI, $0x08 + JLE repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b + CMPL SI, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b + CMPL R8, $0x00000800 + JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b + +cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b: + CMPL R12, $0x00000104 + JLT repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b + LEAL -256(R12), R12 + MOVW $0x0019, (AX) + MOVW R12, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B + 
+repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b: + LEAL -4(R12), R12 + MOVW $0x0015, (AX) + MOVB R12, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B + +repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b: + SHLL $0x02, R12 + ORL $0x01, R12 + MOVW R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B + +repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b: + XORQ SI, SI + LEAL 1(SI)(R12*4), R12 + MOVB R8, 1(AX) + SARL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B + +long_offset_short_match_nolit_encodeBetterBlockAsm10B: MOVB $0xee, (AX) MOVW R8, 1(AX) LEAL -60(R12), R12 @@ -7913,52 +9313,49 @@ match_nolit_emitcopy_end_encodeBetterBlockAsm10B: match_nolit_dst_ok_encodeBetterBlockAsm10B: MOVQ $0x0000cf1bbcdcbf9b, SI MOVQ $0x9e3779b1, R8 - INCL DI - MOVQ (DX)(DI*1), R9 - MOVQ R9, R10 - MOVQ R9, R11 - MOVQ R9, R12 - SHRQ $0x08, R11 - MOVQ R11, R13 - SHRQ $0x10, R12 - LEAL 1(DI), R14 - LEAL 2(DI), R15 - MOVQ -2(DX)(CX*1), R9 + LEAQ 1(DI), DI + LEAQ -2(CX), R9 + MOVQ (DX)(DI*1), R10 + MOVQ 1(DX)(DI*1), R11 + MOVQ (DX)(R9*1), R12 + MOVQ 1(DX)(R9*1), R13 SHLQ $0x10, R10 IMULQ SI, R10 SHRQ $0x34, R10 - SHLQ $0x10, R13 - IMULQ SI, R13 - SHRQ $0x34, R13 SHLQ $0x20, R11 IMULQ R8, R11 SHRQ $0x36, R11 - SHLQ $0x20, R12 - IMULQ R8, R12 - SHRQ $0x36, R12 + SHLQ $0x10, R12 + IMULQ SI, R12 + SHRQ $0x34, R12 + SHLQ $0x20, R13 + IMULQ R8, R13 + SHRQ $0x36, R13 + LEAQ 1(DI), R8 + LEAQ 1(R9), R14 MOVL DI, 24(SP)(R10*4) - MOVL R14, 24(SP)(R13*4) - MOVL R14, 16408(SP)(R11*4) - MOVL R15, 16408(SP)(R12*4) - MOVQ R9, R10 - MOVQ R9, R11 - SHRQ $0x08, R11 - MOVQ R11, R13 - LEAL -2(CX), R9 - LEAL -1(CX), DI + MOVL R9, 24(SP)(R12*4) + MOVL R8, 16408(SP)(R11*4) + MOVL R14, 16408(SP)(R13*4) + ADDQ $0x01, DI + SUBQ $0x01, R9 + +index_loop_encodeBetterBlockAsm10B: + CMPQ DI, R9 + JAE search_loop_encodeBetterBlockAsm10B + MOVQ (DX)(DI*1), R8 + MOVQ (DX)(R9*1), R10 + SHLQ $0x10, R8 + IMULQ SI, R8 + SHRQ $0x34, R8 SHLQ $0x10, R10 IMULQ SI, R10 SHRQ $0x34, R10 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x36, R11 - SHLQ $0x10, R13 - IMULQ SI, R13 - SHRQ $0x34, R13 + MOVL DI, 24(SP)(R8*4) MOVL R9, 24(SP)(R10*4) - MOVL DI, 16408(SP)(R11*4) - MOVL DI, 24(SP)(R13*4) - JMP search_loop_encodeBetterBlockAsm10B + ADDQ $0x02, DI + SUBQ $0x02, R9 + JMP index_loop_encodeBetterBlockAsm10B emit_remainder_encodeBetterBlockAsm10B: MOVQ src_len+32(FP), CX @@ -8006,8 +9403,9 @@ memmove_emit_remainder_encodeBetterBlockAsm10B: MOVL SI, BX // genMemMoveShort - CMPQ BX, $0x04 - JLE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4 + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_3 CMPQ BX, $0x08 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4through7 CMPQ BX, $0x10 @@ -8016,9 +9414,18 @@ memmove_emit_remainder_encodeBetterBlockAsm10B: JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4: - MOVL (CX), SI - MOVL SI, (AX) +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP 
memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4through7: @@ -8108,7 +9515,7 @@ emit_literal_done_emit_remainder_encodeBetterBlockAsm10B: RET // func encodeBetterBlockAsm8B(dst []byte, src []byte) int -// Requires: SSE2 +// Requires: BMI, SSE2 TEXT ·encodeBetterBlockAsm8B(SB), $5144-56 MOVQ dst_base+0(FP), AX MOVQ $0x00000028, CX @@ -8163,12 +9570,22 @@ search_loop_encodeBetterBlockAsm8B: MOVL 4120(SP)(R11*4), R8 MOVL CX, 24(SP)(R10*4) MOVL CX, 4120(SP)(R11*4) - CMPL (DX)(SI*1), DI + MOVQ (DX)(SI*1), R10 + MOVQ (DX)(R8*1), R11 + CMPQ R10, DI JEQ candidate_match_encodeBetterBlockAsm8B - CMPL (DX)(R8*1), DI - JEQ candidateS_match_encodeBetterBlockAsm8B - MOVL 20(SP), CX - JMP search_loop_encodeBetterBlockAsm8B + CMPQ R11, DI + JNE no_short_found_encodeBetterBlockAsm8B + MOVL R8, SI + JMP candidate_match_encodeBetterBlockAsm8B + +no_short_found_encodeBetterBlockAsm8B: + CMPL R10, DI + JEQ candidate_match_encodeBetterBlockAsm8B + CMPL R11, DI + JEQ candidateS_match_encodeBetterBlockAsm8B + MOVL 20(SP), CX + JMP search_loop_encodeBetterBlockAsm8B candidateS_match_encodeBetterBlockAsm8B: SHRQ $0x08, DI @@ -8222,35 +9639,57 @@ match_dst_size_check_encodeBetterBlockAsm8B: // matchLen XORL R12, R12 CMPL R8, $0x08 - JL matchlen_single_match_nolit_encodeBetterBlockAsm8B + JL matchlen_match4_match_nolit_encodeBetterBlockAsm8B matchlen_loopback_match_nolit_encodeBetterBlockAsm8B: MOVQ (R9)(R12*1), R11 XORQ (R10)(R12*1), R11 TESTQ R11, R11 JZ matchlen_loop_match_nolit_encodeBetterBlockAsm8B - BSFQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP match_nolit_end_encodeBetterBlockAsm8B + +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP match_nolit_end_encodeBetterBlockAsm8B matchlen_loop_match_nolit_encodeBetterBlockAsm8B: LEAL -8(R8), R8 LEAL 8(R12), R12 CMPL R8, $0x08 JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm8B - -matchlen_single_match_nolit_encodeBetterBlockAsm8B: - TESTL R8, R8 - JZ match_nolit_end_encodeBetterBlockAsm8B - -matchlen_single_loopback_match_nolit_encodeBetterBlockAsm8B: + JZ match_nolit_end_encodeBetterBlockAsm8B + +matchlen_match4_match_nolit_encodeBetterBlockAsm8B: + CMPL R8, $0x04 + JL matchlen_match2_match_nolit_encodeBetterBlockAsm8B + MOVL (R9)(R12*1), R11 + CMPL (R10)(R12*1), R11 + JNE matchlen_match2_match_nolit_encodeBetterBlockAsm8B + SUBL $0x04, R8 + LEAL 4(R12), R12 + +matchlen_match2_match_nolit_encodeBetterBlockAsm8B: + CMPL R8, $0x02 + JL matchlen_match1_match_nolit_encodeBetterBlockAsm8B + MOVW (R9)(R12*1), R11 + CMPW (R10)(R12*1), R11 + JNE matchlen_match1_match_nolit_encodeBetterBlockAsm8B + SUBL $0x02, R8 + LEAL 2(R12), R12 + +matchlen_match1_match_nolit_encodeBetterBlockAsm8B: + CMPL R8, $0x01 + JL match_nolit_end_encodeBetterBlockAsm8B MOVB (R9)(R12*1), R11 CMPB (R10)(R12*1), R11 JNE match_nolit_end_encodeBetterBlockAsm8B LEAL 1(R12), R12 - DECL R8 - JNZ matchlen_single_loopback_match_nolit_encodeBetterBlockAsm8B match_nolit_end_encodeBetterBlockAsm8B: MOVL CX, R8 @@ -8397,6 +9836,61 @@ emit_literal_done_match_emit_encodeBetterBlockAsm8B: two_byte_offset_match_nolit_encodeBetterBlockAsm8B: CMPL R12, $0x40 JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B + CMPL R8, $0x00000800 + JAE 
long_offset_short_match_nolit_encodeBetterBlockAsm8B + MOVL $0x00000001, SI + LEAL 16(SI), SI + MOVB R8, 1(AX) + SHRL $0x08, R8 + SHLL $0x05, R8 + ORL R8, SI + MOVB SI, (AX) + ADDQ $0x02, AX + SUBL $0x08, R12 + + // emitRepeat + LEAL -4(R12), R12 + JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b + MOVL R12, SI + LEAL -4(R12), R12 + CMPL SI, $0x08 + JLE repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b + CMPL SI, $0x0c + JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b + +cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b: + CMPL R12, $0x00000104 + JLT repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b + LEAL -256(R12), R12 + MOVW $0x0019, (AX) + MOVW R12, 2(AX) + ADDQ $0x04, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B + +repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b: + LEAL -4(R12), R12 + MOVW $0x0015, (AX) + MOVB R12, 2(AX) + ADDQ $0x03, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B + +repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b: + SHLL $0x02, R12 + ORL $0x01, R12 + MOVW R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B + XORQ SI, SI + LEAL 1(SI)(R12*4), R12 + MOVB R8, 1(AX) + SARL $0x08, R8 + SHLL $0x05, R8 + ORL R8, R12 + MOVB R12, (AX) + ADDQ $0x02, AX + JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B + +long_offset_short_match_nolit_encodeBetterBlockAsm8B: MOVB $0xee, (AX) MOVW R8, 1(AX) LEAL -60(R12), R12 @@ -8648,52 +10142,49 @@ match_nolit_emitcopy_end_encodeBetterBlockAsm8B: match_nolit_dst_ok_encodeBetterBlockAsm8B: MOVQ $0x0000cf1bbcdcbf9b, SI MOVQ $0x9e3779b1, R8 - INCL DI - MOVQ (DX)(DI*1), R9 - MOVQ R9, R10 - MOVQ R9, R11 - MOVQ R9, R12 - SHRQ $0x08, R11 - MOVQ R11, R13 - SHRQ $0x10, R12 - LEAL 1(DI), R14 - LEAL 2(DI), R15 - MOVQ -2(DX)(CX*1), R9 + LEAQ 1(DI), DI + LEAQ -2(CX), R9 + MOVQ (DX)(DI*1), R10 + MOVQ 1(DX)(DI*1), R11 + MOVQ (DX)(R9*1), R12 + MOVQ 1(DX)(R9*1), R13 SHLQ $0x10, R10 IMULQ SI, R10 SHRQ $0x36, R10 - SHLQ $0x10, R13 - IMULQ SI, R13 - SHRQ $0x36, R13 SHLQ $0x20, R11 IMULQ R8, R11 SHRQ $0x38, R11 - SHLQ $0x20, R12 - IMULQ R8, R12 - SHRQ $0x38, R12 + SHLQ $0x10, R12 + IMULQ SI, R12 + SHRQ $0x36, R12 + SHLQ $0x20, R13 + IMULQ R8, R13 + SHRQ $0x38, R13 + LEAQ 1(DI), R8 + LEAQ 1(R9), R14 MOVL DI, 24(SP)(R10*4) - MOVL R14, 24(SP)(R13*4) - MOVL R14, 4120(SP)(R11*4) - MOVL R15, 4120(SP)(R12*4) - MOVQ R9, R10 - MOVQ R9, R11 - SHRQ $0x08, R11 - MOVQ R11, R13 - LEAL -2(CX), R9 - LEAL -1(CX), DI + MOVL R9, 24(SP)(R12*4) + MOVL R8, 4120(SP)(R11*4) + MOVL R14, 4120(SP)(R13*4) + ADDQ $0x01, DI + SUBQ $0x01, R9 + +index_loop_encodeBetterBlockAsm8B: + CMPQ DI, R9 + JAE search_loop_encodeBetterBlockAsm8B + MOVQ (DX)(DI*1), R8 + MOVQ (DX)(R9*1), R10 + SHLQ $0x10, R8 + IMULQ SI, R8 + SHRQ $0x36, R8 SHLQ $0x10, R10 IMULQ SI, R10 SHRQ $0x36, R10 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x38, R11 - SHLQ $0x10, R13 - IMULQ SI, R13 - SHRQ $0x36, R13 + MOVL DI, 24(SP)(R8*4) MOVL R9, 24(SP)(R10*4) - MOVL DI, 4120(SP)(R11*4) - MOVL DI, 24(SP)(R13*4) - JMP search_loop_encodeBetterBlockAsm8B + ADDQ $0x02, DI + SUBQ $0x02, R9 + JMP index_loop_encodeBetterBlockAsm8B emit_remainder_encodeBetterBlockAsm8B: MOVQ src_len+32(FP), CX @@ -8741,8 +10232,9 @@ memmove_emit_remainder_encodeBetterBlockAsm8B: MOVL SI, BX // genMemMoveShort - CMPQ BX, $0x04 - JLE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4 + CMPQ BX, $0x03 + JB 
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_3 CMPQ BX, $0x08 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4through7 CMPQ BX, $0x10 @@ -8751,9 +10243,18 @@ memmove_emit_remainder_encodeBetterBlockAsm8B: JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4: - MOVL (CX), SI - MOVL SI, (AX) +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4through7: @@ -8843,7 +10344,7 @@ emit_literal_done_emit_remainder_encodeBetterBlockAsm8B: RET // func encodeSnappyBlockAsm(dst []byte, src []byte) int -// Requires: SSE2 +// Requires: BMI, SSE2 TEXT ·encodeSnappyBlockAsm(SB), $65560-56 MOVQ dst_base+0(FP), AX MOVQ $0x00000200, CX @@ -9079,35 +10580,57 @@ emit_literal_done_repeat_emit_encodeSnappyBlockAsm: // matchLen XORL R11, R11 CMPL R8, $0x08 - JL matchlen_single_repeat_extend_encodeSnappyBlockAsm + JL matchlen_match4_repeat_extend_encodeSnappyBlockAsm matchlen_loopback_repeat_extend_encodeSnappyBlockAsm: MOVQ (R9)(R11*1), R10 XORQ (SI)(R11*1), R10 TESTQ R10, R10 JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm - BSFQ R10, R10 - SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 - JMP repeat_extend_forward_end_encodeSnappyBlockAsm + +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP repeat_extend_forward_end_encodeSnappyBlockAsm matchlen_loop_repeat_extend_encodeSnappyBlockAsm: LEAL -8(R8), R8 LEAL 8(R11), R11 CMPL R8, $0x08 JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm - -matchlen_single_repeat_extend_encodeSnappyBlockAsm: - TESTL R8, R8 - JZ repeat_extend_forward_end_encodeSnappyBlockAsm - -matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm: + JZ repeat_extend_forward_end_encodeSnappyBlockAsm + +matchlen_match4_repeat_extend_encodeSnappyBlockAsm: + CMPL R8, $0x04 + JL matchlen_match2_repeat_extend_encodeSnappyBlockAsm + MOVL (R9)(R11*1), R10 + CMPL (SI)(R11*1), R10 + JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm + SUBL $0x04, R8 + LEAL 4(R11), R11 + +matchlen_match2_repeat_extend_encodeSnappyBlockAsm: + CMPL R8, $0x02 + JL matchlen_match1_repeat_extend_encodeSnappyBlockAsm + MOVW (R9)(R11*1), R10 + CMPW (SI)(R11*1), R10 + JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm + SUBL $0x02, R8 + LEAL 2(R11), R11 + +matchlen_match1_repeat_extend_encodeSnappyBlockAsm: + CMPL R8, $0x01 + JL repeat_extend_forward_end_encodeSnappyBlockAsm MOVB (R9)(R11*1), R10 CMPB (SI)(R11*1), R10 JNE repeat_extend_forward_end_encodeSnappyBlockAsm LEAL 1(R11), R11 - DECL R8 - JNZ matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm repeat_extend_forward_end_encodeSnappyBlockAsm: ADDL R11, CX @@ -9380,35 +10903,57 @@ match_nolit_loop_encodeSnappyBlockAsm: // matchLen XORL R10, R10 CMPL DI, $0x08 - JL matchlen_single_match_nolit_encodeSnappyBlockAsm + JL 
matchlen_match4_match_nolit_encodeSnappyBlockAsm matchlen_loopback_match_nolit_encodeSnappyBlockAsm: MOVQ (R8)(R10*1), R9 XORQ (SI)(R10*1), R9 TESTQ R9, R9 JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm - BSFQ R9, R9 - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP match_nolit_end_encodeSnappyBlockAsm + +#ifdef GOAMD64_v3 + TZCNTQ R9, R9 + +#else + BSFQ R9, R9 + +#endif + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP match_nolit_end_encodeSnappyBlockAsm matchlen_loop_match_nolit_encodeSnappyBlockAsm: LEAL -8(DI), DI LEAL 8(R10), R10 CMPL DI, $0x08 JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm - -matchlen_single_match_nolit_encodeSnappyBlockAsm: - TESTL DI, DI - JZ match_nolit_end_encodeSnappyBlockAsm - -matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm: + JZ match_nolit_end_encodeSnappyBlockAsm + +matchlen_match4_match_nolit_encodeSnappyBlockAsm: + CMPL DI, $0x04 + JL matchlen_match2_match_nolit_encodeSnappyBlockAsm + MOVL (R8)(R10*1), R9 + CMPL (SI)(R10*1), R9 + JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm + SUBL $0x04, DI + LEAL 4(R10), R10 + +matchlen_match2_match_nolit_encodeSnappyBlockAsm: + CMPL DI, $0x02 + JL matchlen_match1_match_nolit_encodeSnappyBlockAsm + MOVW (R8)(R10*1), R9 + CMPW (SI)(R10*1), R9 + JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm + SUBL $0x02, DI + LEAL 2(R10), R10 + +matchlen_match1_match_nolit_encodeSnappyBlockAsm: + CMPL DI, $0x01 + JL match_nolit_end_encodeSnappyBlockAsm MOVB (R8)(R10*1), R9 CMPB (SI)(R10*1), R9 JNE match_nolit_end_encodeSnappyBlockAsm LEAL 1(R10), R10 - DECL DI - JNZ matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm match_nolit_end_encodeSnappyBlockAsm: ADDL R10, CX @@ -9567,17 +11112,36 @@ memmove_emit_remainder_encodeSnappyBlockAsm: MOVL SI, BX // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_3 CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8 + JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_4through7 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8through16: @@ -9660,7 +11224,7 @@ emit_literal_done_emit_remainder_encodeSnappyBlockAsm: RET // func encodeSnappyBlockAsm64K(dst []byte, src []byte) int -// Requires: SSE2 +// Requires: BMI, SSE2 TEXT ·encodeSnappyBlockAsm64K(SB), $65560-56 MOVQ dst_base+0(FP), AX MOVQ $0x00000200, CX @@ -9877,35 +11441,57 @@ emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K: 
// matchLen XORL R11, R11 CMPL R8, $0x08 - JL matchlen_single_repeat_extend_encodeSnappyBlockAsm64K + JL matchlen_match4_repeat_extend_encodeSnappyBlockAsm64K matchlen_loopback_repeat_extend_encodeSnappyBlockAsm64K: MOVQ (R9)(R11*1), R10 XORQ (SI)(R11*1), R10 TESTQ R10, R10 JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm64K - BSFQ R10, R10 - SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 - JMP repeat_extend_forward_end_encodeSnappyBlockAsm64K + +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP repeat_extend_forward_end_encodeSnappyBlockAsm64K matchlen_loop_repeat_extend_encodeSnappyBlockAsm64K: LEAL -8(R8), R8 LEAL 8(R11), R11 CMPL R8, $0x08 JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm64K - -matchlen_single_repeat_extend_encodeSnappyBlockAsm64K: - TESTL R8, R8 - JZ repeat_extend_forward_end_encodeSnappyBlockAsm64K - -matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm64K: + JZ repeat_extend_forward_end_encodeSnappyBlockAsm64K + +matchlen_match4_repeat_extend_encodeSnappyBlockAsm64K: + CMPL R8, $0x04 + JL matchlen_match2_repeat_extend_encodeSnappyBlockAsm64K + MOVL (R9)(R11*1), R10 + CMPL (SI)(R11*1), R10 + JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm64K + SUBL $0x04, R8 + LEAL 4(R11), R11 + +matchlen_match2_repeat_extend_encodeSnappyBlockAsm64K: + CMPL R8, $0x02 + JL matchlen_match1_repeat_extend_encodeSnappyBlockAsm64K + MOVW (R9)(R11*1), R10 + CMPW (SI)(R11*1), R10 + JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm64K + SUBL $0x02, R8 + LEAL 2(R11), R11 + +matchlen_match1_repeat_extend_encodeSnappyBlockAsm64K: + CMPL R8, $0x01 + JL repeat_extend_forward_end_encodeSnappyBlockAsm64K MOVB (R9)(R11*1), R10 CMPB (SI)(R11*1), R10 JNE repeat_extend_forward_end_encodeSnappyBlockAsm64K LEAL 1(R11), R11 - DECL R8 - JNZ matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm64K repeat_extend_forward_end_encodeSnappyBlockAsm64K: ADDL R11, CX @@ -10135,35 +11721,57 @@ match_nolit_loop_encodeSnappyBlockAsm64K: // matchLen XORL R10, R10 CMPL DI, $0x08 - JL matchlen_single_match_nolit_encodeSnappyBlockAsm64K + JL matchlen_match4_match_nolit_encodeSnappyBlockAsm64K matchlen_loopback_match_nolit_encodeSnappyBlockAsm64K: MOVQ (R8)(R10*1), R9 XORQ (SI)(R10*1), R9 TESTQ R9, R9 JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm64K - BSFQ R9, R9 - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP match_nolit_end_encodeSnappyBlockAsm64K + +#ifdef GOAMD64_v3 + TZCNTQ R9, R9 + +#else + BSFQ R9, R9 + +#endif + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP match_nolit_end_encodeSnappyBlockAsm64K matchlen_loop_match_nolit_encodeSnappyBlockAsm64K: LEAL -8(DI), DI LEAL 8(R10), R10 CMPL DI, $0x08 JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm64K - -matchlen_single_match_nolit_encodeSnappyBlockAsm64K: - TESTL DI, DI - JZ match_nolit_end_encodeSnappyBlockAsm64K - -matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm64K: + JZ match_nolit_end_encodeSnappyBlockAsm64K + +matchlen_match4_match_nolit_encodeSnappyBlockAsm64K: + CMPL DI, $0x04 + JL matchlen_match2_match_nolit_encodeSnappyBlockAsm64K + MOVL (R8)(R10*1), R9 + CMPL (SI)(R10*1), R9 + JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm64K + SUBL $0x04, DI + LEAL 4(R10), R10 + +matchlen_match2_match_nolit_encodeSnappyBlockAsm64K: + CMPL DI, $0x02 + JL matchlen_match1_match_nolit_encodeSnappyBlockAsm64K + MOVW (R8)(R10*1), R9 + CMPW (SI)(R10*1), R9 + JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm64K + SUBL $0x02, DI + LEAL 2(R10), R10 + 
+matchlen_match1_match_nolit_encodeSnappyBlockAsm64K: + CMPL DI, $0x01 + JL match_nolit_end_encodeSnappyBlockAsm64K MOVB (R8)(R10*1), R9 CMPB (SI)(R10*1), R9 JNE match_nolit_end_encodeSnappyBlockAsm64K LEAL 1(R10), R10 - DECL DI - JNZ matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm64K match_nolit_end_encodeSnappyBlockAsm64K: ADDL R10, CX @@ -10279,17 +11887,36 @@ memmove_emit_remainder_encodeSnappyBlockAsm64K: MOVL SI, BX // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_3 CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8 + JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_4through7 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8through16: @@ -10372,7 +11999,7 @@ emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K: RET // func encodeSnappyBlockAsm12B(dst []byte, src []byte) int -// Requires: SSE2 +// Requires: BMI, SSE2 TEXT ·encodeSnappyBlockAsm12B(SB), $16408-56 MOVQ dst_base+0(FP), AX MOVQ $0x00000080, CX @@ -10589,35 +12216,57 @@ emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B: // matchLen XORL R11, R11 CMPL R8, $0x08 - JL matchlen_single_repeat_extend_encodeSnappyBlockAsm12B + JL matchlen_match4_repeat_extend_encodeSnappyBlockAsm12B matchlen_loopback_repeat_extend_encodeSnappyBlockAsm12B: MOVQ (R9)(R11*1), R10 XORQ (SI)(R11*1), R10 TESTQ R10, R10 JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm12B - BSFQ R10, R10 - SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 - JMP repeat_extend_forward_end_encodeSnappyBlockAsm12B + +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP repeat_extend_forward_end_encodeSnappyBlockAsm12B matchlen_loop_repeat_extend_encodeSnappyBlockAsm12B: LEAL -8(R8), R8 LEAL 8(R11), R11 CMPL R8, $0x08 JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm12B - -matchlen_single_repeat_extend_encodeSnappyBlockAsm12B: - TESTL R8, R8 - JZ repeat_extend_forward_end_encodeSnappyBlockAsm12B - -matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm12B: + JZ repeat_extend_forward_end_encodeSnappyBlockAsm12B + +matchlen_match4_repeat_extend_encodeSnappyBlockAsm12B: + CMPL R8, $0x04 + JL matchlen_match2_repeat_extend_encodeSnappyBlockAsm12B + MOVL (R9)(R11*1), R10 + CMPL (SI)(R11*1), R10 + JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm12B + SUBL 
$0x04, R8 + LEAL 4(R11), R11 + +matchlen_match2_repeat_extend_encodeSnappyBlockAsm12B: + CMPL R8, $0x02 + JL matchlen_match1_repeat_extend_encodeSnappyBlockAsm12B + MOVW (R9)(R11*1), R10 + CMPW (SI)(R11*1), R10 + JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm12B + SUBL $0x02, R8 + LEAL 2(R11), R11 + +matchlen_match1_repeat_extend_encodeSnappyBlockAsm12B: + CMPL R8, $0x01 + JL repeat_extend_forward_end_encodeSnappyBlockAsm12B MOVB (R9)(R11*1), R10 CMPB (SI)(R11*1), R10 JNE repeat_extend_forward_end_encodeSnappyBlockAsm12B LEAL 1(R11), R11 - DECL R8 - JNZ matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm12B repeat_extend_forward_end_encodeSnappyBlockAsm12B: ADDL R11, CX @@ -10847,35 +12496,57 @@ match_nolit_loop_encodeSnappyBlockAsm12B: // matchLen XORL R10, R10 CMPL DI, $0x08 - JL matchlen_single_match_nolit_encodeSnappyBlockAsm12B + JL matchlen_match4_match_nolit_encodeSnappyBlockAsm12B matchlen_loopback_match_nolit_encodeSnappyBlockAsm12B: MOVQ (R8)(R10*1), R9 XORQ (SI)(R10*1), R9 TESTQ R9, R9 JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm12B - BSFQ R9, R9 - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP match_nolit_end_encodeSnappyBlockAsm12B + +#ifdef GOAMD64_v3 + TZCNTQ R9, R9 + +#else + BSFQ R9, R9 + +#endif + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP match_nolit_end_encodeSnappyBlockAsm12B matchlen_loop_match_nolit_encodeSnappyBlockAsm12B: LEAL -8(DI), DI LEAL 8(R10), R10 CMPL DI, $0x08 JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm12B - -matchlen_single_match_nolit_encodeSnappyBlockAsm12B: - TESTL DI, DI - JZ match_nolit_end_encodeSnappyBlockAsm12B - -matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm12B: + JZ match_nolit_end_encodeSnappyBlockAsm12B + +matchlen_match4_match_nolit_encodeSnappyBlockAsm12B: + CMPL DI, $0x04 + JL matchlen_match2_match_nolit_encodeSnappyBlockAsm12B + MOVL (R8)(R10*1), R9 + CMPL (SI)(R10*1), R9 + JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm12B + SUBL $0x04, DI + LEAL 4(R10), R10 + +matchlen_match2_match_nolit_encodeSnappyBlockAsm12B: + CMPL DI, $0x02 + JL matchlen_match1_match_nolit_encodeSnappyBlockAsm12B + MOVW (R8)(R10*1), R9 + CMPW (SI)(R10*1), R9 + JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm12B + SUBL $0x02, DI + LEAL 2(R10), R10 + +matchlen_match1_match_nolit_encodeSnappyBlockAsm12B: + CMPL DI, $0x01 + JL match_nolit_end_encodeSnappyBlockAsm12B MOVB (R8)(R10*1), R9 CMPB (SI)(R10*1), R9 JNE match_nolit_end_encodeSnappyBlockAsm12B LEAL 1(R10), R10 - DECL DI - JNZ matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm12B match_nolit_end_encodeSnappyBlockAsm12B: ADDL R10, CX @@ -10991,17 +12662,36 @@ memmove_emit_remainder_encodeSnappyBlockAsm12B: MOVL SI, BX // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_3 CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8 + JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_4through7 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_1or2: + MOVB (CX), SI + MOVB 
-1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8through16: @@ -11084,7 +12774,7 @@ emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B: RET // func encodeSnappyBlockAsm10B(dst []byte, src []byte) int -// Requires: SSE2 +// Requires: BMI, SSE2 TEXT ·encodeSnappyBlockAsm10B(SB), $4120-56 MOVQ dst_base+0(FP), AX MOVQ $0x00000020, CX @@ -11301,35 +12991,57 @@ emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B: // matchLen XORL R11, R11 CMPL R8, $0x08 - JL matchlen_single_repeat_extend_encodeSnappyBlockAsm10B + JL matchlen_match4_repeat_extend_encodeSnappyBlockAsm10B matchlen_loopback_repeat_extend_encodeSnappyBlockAsm10B: MOVQ (R9)(R11*1), R10 XORQ (SI)(R11*1), R10 TESTQ R10, R10 JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm10B - BSFQ R10, R10 - SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 - JMP repeat_extend_forward_end_encodeSnappyBlockAsm10B + +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP repeat_extend_forward_end_encodeSnappyBlockAsm10B matchlen_loop_repeat_extend_encodeSnappyBlockAsm10B: LEAL -8(R8), R8 LEAL 8(R11), R11 CMPL R8, $0x08 JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm10B - -matchlen_single_repeat_extend_encodeSnappyBlockAsm10B: - TESTL R8, R8 - JZ repeat_extend_forward_end_encodeSnappyBlockAsm10B - -matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm10B: + JZ repeat_extend_forward_end_encodeSnappyBlockAsm10B + +matchlen_match4_repeat_extend_encodeSnappyBlockAsm10B: + CMPL R8, $0x04 + JL matchlen_match2_repeat_extend_encodeSnappyBlockAsm10B + MOVL (R9)(R11*1), R10 + CMPL (SI)(R11*1), R10 + JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm10B + SUBL $0x04, R8 + LEAL 4(R11), R11 + +matchlen_match2_repeat_extend_encodeSnappyBlockAsm10B: + CMPL R8, $0x02 + JL matchlen_match1_repeat_extend_encodeSnappyBlockAsm10B + MOVW (R9)(R11*1), R10 + CMPW (SI)(R11*1), R10 + JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm10B + SUBL $0x02, R8 + LEAL 2(R11), R11 + +matchlen_match1_repeat_extend_encodeSnappyBlockAsm10B: + CMPL R8, $0x01 + JL repeat_extend_forward_end_encodeSnappyBlockAsm10B MOVB (R9)(R11*1), R10 CMPB (SI)(R11*1), R10 JNE repeat_extend_forward_end_encodeSnappyBlockAsm10B LEAL 1(R11), R11 - DECL R8 - JNZ matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm10B repeat_extend_forward_end_encodeSnappyBlockAsm10B: ADDL R11, CX @@ -11559,35 +13271,57 @@ match_nolit_loop_encodeSnappyBlockAsm10B: // matchLen XORL R10, R10 CMPL DI, $0x08 - JL matchlen_single_match_nolit_encodeSnappyBlockAsm10B + JL matchlen_match4_match_nolit_encodeSnappyBlockAsm10B matchlen_loopback_match_nolit_encodeSnappyBlockAsm10B: MOVQ (R8)(R10*1), R9 XORQ (SI)(R10*1), R9 TESTQ R9, R9 JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm10B - BSFQ R9, R9 - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP match_nolit_end_encodeSnappyBlockAsm10B + +#ifdef GOAMD64_v3 + TZCNTQ R9, R9 + +#else + BSFQ R9, R9 + +#endif + SARQ $0x03, R9 + LEAL (R10)(R9*1), 
R10 + JMP match_nolit_end_encodeSnappyBlockAsm10B matchlen_loop_match_nolit_encodeSnappyBlockAsm10B: LEAL -8(DI), DI LEAL 8(R10), R10 CMPL DI, $0x08 JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm10B - -matchlen_single_match_nolit_encodeSnappyBlockAsm10B: - TESTL DI, DI - JZ match_nolit_end_encodeSnappyBlockAsm10B - -matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm10B: + JZ match_nolit_end_encodeSnappyBlockAsm10B + +matchlen_match4_match_nolit_encodeSnappyBlockAsm10B: + CMPL DI, $0x04 + JL matchlen_match2_match_nolit_encodeSnappyBlockAsm10B + MOVL (R8)(R10*1), R9 + CMPL (SI)(R10*1), R9 + JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm10B + SUBL $0x04, DI + LEAL 4(R10), R10 + +matchlen_match2_match_nolit_encodeSnappyBlockAsm10B: + CMPL DI, $0x02 + JL matchlen_match1_match_nolit_encodeSnappyBlockAsm10B + MOVW (R8)(R10*1), R9 + CMPW (SI)(R10*1), R9 + JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm10B + SUBL $0x02, DI + LEAL 2(R10), R10 + +matchlen_match1_match_nolit_encodeSnappyBlockAsm10B: + CMPL DI, $0x01 + JL match_nolit_end_encodeSnappyBlockAsm10B MOVB (R8)(R10*1), R9 CMPB (SI)(R10*1), R9 JNE match_nolit_end_encodeSnappyBlockAsm10B LEAL 1(R10), R10 - DECL DI - JNZ matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm10B match_nolit_end_encodeSnappyBlockAsm10B: ADDL R10, CX @@ -11703,17 +13437,36 @@ memmove_emit_remainder_encodeSnappyBlockAsm10B: MOVL SI, BX // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_3 CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8 + JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_4through7 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8through16: @@ -11796,7 +13549,7 @@ emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B: RET // func encodeSnappyBlockAsm8B(dst []byte, src []byte) int -// Requires: SSE2 +// Requires: BMI, SSE2 TEXT ·encodeSnappyBlockAsm8B(SB), $1048-56 MOVQ dst_base+0(FP), AX MOVQ $0x00000008, CX @@ -12013,35 +13766,57 @@ emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B: // matchLen XORL R11, R11 CMPL R8, $0x08 - JL matchlen_single_repeat_extend_encodeSnappyBlockAsm8B + JL matchlen_match4_repeat_extend_encodeSnappyBlockAsm8B matchlen_loopback_repeat_extend_encodeSnappyBlockAsm8B: MOVQ (R9)(R11*1), R10 XORQ (SI)(R11*1), R10 TESTQ R10, R10 JZ 
matchlen_loop_repeat_extend_encodeSnappyBlockAsm8B - BSFQ R10, R10 - SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 - JMP repeat_extend_forward_end_encodeSnappyBlockAsm8B + +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP repeat_extend_forward_end_encodeSnappyBlockAsm8B matchlen_loop_repeat_extend_encodeSnappyBlockAsm8B: LEAL -8(R8), R8 LEAL 8(R11), R11 CMPL R8, $0x08 JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm8B - -matchlen_single_repeat_extend_encodeSnappyBlockAsm8B: - TESTL R8, R8 - JZ repeat_extend_forward_end_encodeSnappyBlockAsm8B - -matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm8B: + JZ repeat_extend_forward_end_encodeSnappyBlockAsm8B + +matchlen_match4_repeat_extend_encodeSnappyBlockAsm8B: + CMPL R8, $0x04 + JL matchlen_match2_repeat_extend_encodeSnappyBlockAsm8B + MOVL (R9)(R11*1), R10 + CMPL (SI)(R11*1), R10 + JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm8B + SUBL $0x04, R8 + LEAL 4(R11), R11 + +matchlen_match2_repeat_extend_encodeSnappyBlockAsm8B: + CMPL R8, $0x02 + JL matchlen_match1_repeat_extend_encodeSnappyBlockAsm8B + MOVW (R9)(R11*1), R10 + CMPW (SI)(R11*1), R10 + JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm8B + SUBL $0x02, R8 + LEAL 2(R11), R11 + +matchlen_match1_repeat_extend_encodeSnappyBlockAsm8B: + CMPL R8, $0x01 + JL repeat_extend_forward_end_encodeSnappyBlockAsm8B MOVB (R9)(R11*1), R10 CMPB (SI)(R11*1), R10 JNE repeat_extend_forward_end_encodeSnappyBlockAsm8B LEAL 1(R11), R11 - DECL R8 - JNZ matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm8B repeat_extend_forward_end_encodeSnappyBlockAsm8B: ADDL R11, CX @@ -12269,35 +14044,57 @@ match_nolit_loop_encodeSnappyBlockAsm8B: // matchLen XORL R10, R10 CMPL DI, $0x08 - JL matchlen_single_match_nolit_encodeSnappyBlockAsm8B + JL matchlen_match4_match_nolit_encodeSnappyBlockAsm8B matchlen_loopback_match_nolit_encodeSnappyBlockAsm8B: MOVQ (R8)(R10*1), R9 XORQ (SI)(R10*1), R9 TESTQ R9, R9 JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm8B - BSFQ R9, R9 - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP match_nolit_end_encodeSnappyBlockAsm8B + +#ifdef GOAMD64_v3 + TZCNTQ R9, R9 + +#else + BSFQ R9, R9 + +#endif + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP match_nolit_end_encodeSnappyBlockAsm8B matchlen_loop_match_nolit_encodeSnappyBlockAsm8B: LEAL -8(DI), DI LEAL 8(R10), R10 CMPL DI, $0x08 JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm8B - -matchlen_single_match_nolit_encodeSnappyBlockAsm8B: - TESTL DI, DI - JZ match_nolit_end_encodeSnappyBlockAsm8B - -matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm8B: + JZ match_nolit_end_encodeSnappyBlockAsm8B + +matchlen_match4_match_nolit_encodeSnappyBlockAsm8B: + CMPL DI, $0x04 + JL matchlen_match2_match_nolit_encodeSnappyBlockAsm8B + MOVL (R8)(R10*1), R9 + CMPL (SI)(R10*1), R9 + JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm8B + SUBL $0x04, DI + LEAL 4(R10), R10 + +matchlen_match2_match_nolit_encodeSnappyBlockAsm8B: + CMPL DI, $0x02 + JL matchlen_match1_match_nolit_encodeSnappyBlockAsm8B + MOVW (R8)(R10*1), R9 + CMPW (SI)(R10*1), R9 + JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm8B + SUBL $0x02, DI + LEAL 2(R10), R10 + +matchlen_match1_match_nolit_encodeSnappyBlockAsm8B: + CMPL DI, $0x01 + JL match_nolit_end_encodeSnappyBlockAsm8B MOVB (R8)(R10*1), R9 CMPB (SI)(R10*1), R9 JNE match_nolit_end_encodeSnappyBlockAsm8B LEAL 1(R10), R10 - DECL DI - JNZ matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm8B 
match_nolit_end_encodeSnappyBlockAsm8B: ADDL R10, CX @@ -12411,17 +14208,36 @@ memmove_emit_remainder_encodeSnappyBlockAsm8B: MOVL SI, BX // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_3 CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8 + JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_4through7 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8through16: @@ -12504,10 +14320,10 @@ emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B: RET // func encodeSnappyBetterBlockAsm(dst []byte, src []byte) int -// Requires: SSE2 -TEXT ·encodeSnappyBetterBlockAsm(SB), $327704-56 +// Requires: BMI, SSE2 +TEXT ·encodeSnappyBetterBlockAsm(SB), $589848-56 MOVQ dst_base+0(FP), AX - MOVQ $0x00000a00, CX + MOVQ $0x00001200, CX LEAQ 24(SP), DX PXOR X0, X0 @@ -12559,27 +14375,37 @@ check_maxskip_cont_encodeSnappyBetterBlockAsm: MOVQ DI, R11 SHLQ $0x08, R10 IMULQ R9, R10 - SHRQ $0x30, R10 + SHRQ $0x2f, R10 SHLQ $0x20, R11 IMULQ SI, R11 SHRQ $0x32, R11 MOVL 24(SP)(R10*4), SI - MOVL 262168(SP)(R11*4), R8 + MOVL 524312(SP)(R11*4), R8 MOVL CX, 24(SP)(R10*4) - MOVL CX, 262168(SP)(R11*4) - CMPL (DX)(SI*1), DI + MOVL CX, 524312(SP)(R11*4) + MOVQ (DX)(SI*1), R10 + MOVQ (DX)(R8*1), R11 + CMPQ R10, DI JEQ candidate_match_encodeSnappyBetterBlockAsm - CMPL (DX)(R8*1), DI - JEQ candidateS_match_encodeSnappyBetterBlockAsm - MOVL 20(SP), CX - JMP search_loop_encodeSnappyBetterBlockAsm + CMPQ R11, DI + JNE no_short_found_encodeSnappyBetterBlockAsm + MOVL R8, SI + JMP candidate_match_encodeSnappyBetterBlockAsm + +no_short_found_encodeSnappyBetterBlockAsm: + CMPL R10, DI + JEQ candidate_match_encodeSnappyBetterBlockAsm + CMPL R11, DI + JEQ candidateS_match_encodeSnappyBetterBlockAsm + MOVL 20(SP), CX + JMP search_loop_encodeSnappyBetterBlockAsm candidateS_match_encodeSnappyBetterBlockAsm: SHRQ $0x08, DI MOVQ DI, R10 SHLQ $0x08, R10 IMULQ R9, R10 - SHRQ $0x30, R10 + SHRQ $0x2f, R10 MOVL 24(SP)(R10*4), SI INCL CX MOVL CX, 24(SP)(R10*4) @@ -12626,35 +14452,57 @@ match_dst_size_check_encodeSnappyBetterBlockAsm: // matchLen XORL R12, R12 CMPL R8, $0x08 - JL matchlen_single_match_nolit_encodeSnappyBetterBlockAsm + JL matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm: MOVQ (R9)(R12*1), R11 XORQ (R10)(R12*1), R11 TESTQ R11, R11 JZ 
matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm - BSFQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP match_nolit_end_encodeSnappyBetterBlockAsm + +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP match_nolit_end_encodeSnappyBetterBlockAsm matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm: LEAL -8(R8), R8 LEAL 8(R12), R12 CMPL R8, $0x08 JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm - -matchlen_single_match_nolit_encodeSnappyBetterBlockAsm: - TESTL R8, R8 - JZ match_nolit_end_encodeSnappyBetterBlockAsm - -matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm: + JZ match_nolit_end_encodeSnappyBetterBlockAsm + +matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm: + CMPL R8, $0x04 + JL matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm + MOVL (R9)(R12*1), R11 + CMPL (R10)(R12*1), R11 + JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm + SUBL $0x04, R8 + LEAL 4(R12), R12 + +matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm: + CMPL R8, $0x02 + JL matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm + MOVW (R9)(R12*1), R11 + CMPW (R10)(R12*1), R11 + JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm + SUBL $0x02, R8 + LEAL 2(R12), R12 + +matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm: + CMPL R8, $0x01 + JL match_nolit_end_encodeSnappyBetterBlockAsm MOVB (R9)(R12*1), R11 CMPB (R10)(R12*1), R11 JNE match_nolit_end_encodeSnappyBetterBlockAsm LEAL 1(R12), R12 - DECL R8 - JNZ matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm match_nolit_end_encodeSnappyBetterBlockAsm: MOVL CX, R8 @@ -12881,52 +14729,49 @@ match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm: match_nolit_dst_ok_encodeSnappyBetterBlockAsm: MOVQ $0x00cf1bbcdcbfa563, SI MOVQ $0x9e3779b1, R8 - INCL DI - MOVQ (DX)(DI*1), R9 - MOVQ R9, R10 - MOVQ R9, R11 - MOVQ R9, R12 - SHRQ $0x08, R11 - MOVQ R11, R13 - SHRQ $0x10, R12 - LEAL 1(DI), R14 - LEAL 2(DI), R15 - MOVQ -2(DX)(CX*1), R9 + LEAQ 1(DI), DI + LEAQ -2(CX), R9 + MOVQ (DX)(DI*1), R10 + MOVQ 1(DX)(DI*1), R11 + MOVQ (DX)(R9*1), R12 + MOVQ 1(DX)(R9*1), R13 SHLQ $0x08, R10 IMULQ SI, R10 - SHRQ $0x30, R10 - SHLQ $0x08, R13 - IMULQ SI, R13 - SHRQ $0x30, R13 + SHRQ $0x2f, R10 SHLQ $0x20, R11 IMULQ R8, R11 SHRQ $0x32, R11 - SHLQ $0x20, R12 - IMULQ R8, R12 - SHRQ $0x32, R12 + SHLQ $0x08, R12 + IMULQ SI, R12 + SHRQ $0x2f, R12 + SHLQ $0x20, R13 + IMULQ R8, R13 + SHRQ $0x32, R13 + LEAQ 1(DI), R8 + LEAQ 1(R9), R14 MOVL DI, 24(SP)(R10*4) - MOVL R14, 24(SP)(R13*4) - MOVL R14, 262168(SP)(R11*4) - MOVL R15, 262168(SP)(R12*4) - MOVQ R9, R10 - MOVQ R9, R11 - SHRQ $0x08, R11 - MOVQ R11, R13 - LEAL -2(CX), R9 - LEAL -1(CX), DI + MOVL R9, 24(SP)(R12*4) + MOVL R8, 524312(SP)(R11*4) + MOVL R14, 524312(SP)(R13*4) + ADDQ $0x01, DI + SUBQ $0x01, R9 + +index_loop_encodeSnappyBetterBlockAsm: + CMPQ DI, R9 + JAE search_loop_encodeSnappyBetterBlockAsm + MOVQ (DX)(DI*1), R8 + MOVQ (DX)(R9*1), R10 + SHLQ $0x08, R8 + IMULQ SI, R8 + SHRQ $0x2f, R8 SHLQ $0x08, R10 IMULQ SI, R10 - SHRQ $0x30, R10 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x32, R11 - SHLQ $0x08, R13 - IMULQ SI, R13 - SHRQ $0x30, R13 + SHRQ $0x2f, R10 + MOVL DI, 24(SP)(R8*4) MOVL R9, 24(SP)(R10*4) - MOVL DI, 262168(SP)(R11*4) - MOVL DI, 24(SP)(R13*4) - JMP search_loop_encodeSnappyBetterBlockAsm + ADDQ $0x02, DI + SUBQ $0x02, R9 + JMP index_loop_encodeSnappyBetterBlockAsm emit_remainder_encodeSnappyBetterBlockAsm: MOVQ src_len+32(FP), CX @@ -12993,17 +14838,36 @@ 
memmove_emit_remainder_encodeSnappyBetterBlockAsm: MOVL SI, BX // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_3 CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8 + JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_4through7 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8through16: @@ -13086,7 +14950,7 @@ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm: RET // func encodeSnappyBetterBlockAsm64K(dst []byte, src []byte) int -// Requires: SSE2 +// Requires: BMI, SSE2 TEXT ·encodeSnappyBetterBlockAsm64K(SB), $327704-56 MOVQ dst_base+0(FP), AX MOVQ $0x00000a00, CX @@ -13141,12 +15005,22 @@ search_loop_encodeSnappyBetterBlockAsm64K: MOVL 262168(SP)(R11*4), R8 MOVL CX, 24(SP)(R10*4) MOVL CX, 262168(SP)(R11*4) - CMPL (DX)(SI*1), DI + MOVQ (DX)(SI*1), R10 + MOVQ (DX)(R8*1), R11 + CMPQ R10, DI JEQ candidate_match_encodeSnappyBetterBlockAsm64K - CMPL (DX)(R8*1), DI - JEQ candidateS_match_encodeSnappyBetterBlockAsm64K - MOVL 20(SP), CX - JMP search_loop_encodeSnappyBetterBlockAsm64K + CMPQ R11, DI + JNE no_short_found_encodeSnappyBetterBlockAsm64K + MOVL R8, SI + JMP candidate_match_encodeSnappyBetterBlockAsm64K + +no_short_found_encodeSnappyBetterBlockAsm64K: + CMPL R10, DI + JEQ candidate_match_encodeSnappyBetterBlockAsm64K + CMPL R11, DI + JEQ candidateS_match_encodeSnappyBetterBlockAsm64K + MOVL 20(SP), CX + JMP search_loop_encodeSnappyBetterBlockAsm64K candidateS_match_encodeSnappyBetterBlockAsm64K: SHRQ $0x08, DI @@ -13200,35 +15074,57 @@ match_dst_size_check_encodeSnappyBetterBlockAsm64K: // matchLen XORL R12, R12 CMPL R8, $0x08 - JL matchlen_single_match_nolit_encodeSnappyBetterBlockAsm64K + JL matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm64K matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm64K: MOVQ (R9)(R12*1), R11 XORQ (R10)(R12*1), R11 TESTQ R11, R11 JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm64K - BSFQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP match_nolit_end_encodeSnappyBetterBlockAsm64K + +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP match_nolit_end_encodeSnappyBetterBlockAsm64K matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm64K: LEAL -8(R8), R8 LEAL 8(R12), R12 CMPL R8, 
$0x08 JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm64K - -matchlen_single_match_nolit_encodeSnappyBetterBlockAsm64K: - TESTL R8, R8 - JZ match_nolit_end_encodeSnappyBetterBlockAsm64K - -matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm64K: + JZ match_nolit_end_encodeSnappyBetterBlockAsm64K + +matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm64K: + CMPL R8, $0x04 + JL matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm64K + MOVL (R9)(R12*1), R11 + CMPL (R10)(R12*1), R11 + JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm64K + SUBL $0x04, R8 + LEAL 4(R12), R12 + +matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm64K: + CMPL R8, $0x02 + JL matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm64K + MOVW (R9)(R12*1), R11 + CMPW (R10)(R12*1), R11 + JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm64K + SUBL $0x02, R8 + LEAL 2(R12), R12 + +matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm64K: + CMPL R8, $0x01 + JL match_nolit_end_encodeSnappyBetterBlockAsm64K MOVB (R9)(R12*1), R11 CMPB (R10)(R12*1), R11 JNE match_nolit_end_encodeSnappyBetterBlockAsm64K LEAL 1(R12), R12 - DECL R8 - JNZ matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm64K match_nolit_end_encodeSnappyBetterBlockAsm64K: MOVL CX, R8 @@ -13403,52 +15299,49 @@ match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm64K: match_nolit_dst_ok_encodeSnappyBetterBlockAsm64K: MOVQ $0x00cf1bbcdcbfa563, SI MOVQ $0x9e3779b1, R8 - INCL DI - MOVQ (DX)(DI*1), R9 - MOVQ R9, R10 - MOVQ R9, R11 - MOVQ R9, R12 - SHRQ $0x08, R11 - MOVQ R11, R13 - SHRQ $0x10, R12 - LEAL 1(DI), R14 - LEAL 2(DI), R15 - MOVQ -2(DX)(CX*1), R9 + LEAQ 1(DI), DI + LEAQ -2(CX), R9 + MOVQ (DX)(DI*1), R10 + MOVQ 1(DX)(DI*1), R11 + MOVQ (DX)(R9*1), R12 + MOVQ 1(DX)(R9*1), R13 SHLQ $0x08, R10 IMULQ SI, R10 SHRQ $0x30, R10 - SHLQ $0x08, R13 - IMULQ SI, R13 - SHRQ $0x30, R13 SHLQ $0x20, R11 IMULQ R8, R11 SHRQ $0x32, R11 - SHLQ $0x20, R12 - IMULQ R8, R12 - SHRQ $0x32, R12 + SHLQ $0x08, R12 + IMULQ SI, R12 + SHRQ $0x30, R12 + SHLQ $0x20, R13 + IMULQ R8, R13 + SHRQ $0x32, R13 + LEAQ 1(DI), R8 + LEAQ 1(R9), R14 MOVL DI, 24(SP)(R10*4) - MOVL R14, 24(SP)(R13*4) - MOVL R14, 262168(SP)(R11*4) - MOVL R15, 262168(SP)(R12*4) - MOVQ R9, R10 - MOVQ R9, R11 - SHRQ $0x08, R11 - MOVQ R11, R13 - LEAL -2(CX), R9 - LEAL -1(CX), DI + MOVL R9, 24(SP)(R12*4) + MOVL R8, 262168(SP)(R11*4) + MOVL R14, 262168(SP)(R13*4) + ADDQ $0x01, DI + SUBQ $0x01, R9 + +index_loop_encodeSnappyBetterBlockAsm64K: + CMPQ DI, R9 + JAE search_loop_encodeSnappyBetterBlockAsm64K + MOVQ (DX)(DI*1), R8 + MOVQ (DX)(R9*1), R10 + SHLQ $0x08, R8 + IMULQ SI, R8 + SHRQ $0x30, R8 SHLQ $0x08, R10 IMULQ SI, R10 SHRQ $0x30, R10 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x32, R11 - SHLQ $0x08, R13 - IMULQ SI, R13 - SHRQ $0x30, R13 + MOVL DI, 24(SP)(R8*4) MOVL R9, 24(SP)(R10*4) - MOVL DI, 262168(SP)(R11*4) - MOVL DI, 24(SP)(R13*4) - JMP search_loop_encodeSnappyBetterBlockAsm64K + ADDQ $0x02, DI + SUBQ $0x02, R9 + JMP index_loop_encodeSnappyBetterBlockAsm64K emit_remainder_encodeSnappyBetterBlockAsm64K: MOVQ src_len+32(FP), CX @@ -13496,17 +15389,36 @@ memmove_emit_remainder_encodeSnappyBetterBlockAsm64K: MOVL SI, BX // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_3 CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8 + JB 
emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_4through7 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8through16: @@ -13589,7 +15501,7 @@ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K: RET // func encodeSnappyBetterBlockAsm12B(dst []byte, src []byte) int -// Requires: SSE2 +// Requires: BMI, SSE2 TEXT ·encodeSnappyBetterBlockAsm12B(SB), $81944-56 MOVQ dst_base+0(FP), AX MOVQ $0x00000280, CX @@ -13644,12 +15556,22 @@ search_loop_encodeSnappyBetterBlockAsm12B: MOVL 65560(SP)(R11*4), R8 MOVL CX, 24(SP)(R10*4) MOVL CX, 65560(SP)(R11*4) - CMPL (DX)(SI*1), DI + MOVQ (DX)(SI*1), R10 + MOVQ (DX)(R8*1), R11 + CMPQ R10, DI JEQ candidate_match_encodeSnappyBetterBlockAsm12B - CMPL (DX)(R8*1), DI - JEQ candidateS_match_encodeSnappyBetterBlockAsm12B - MOVL 20(SP), CX - JMP search_loop_encodeSnappyBetterBlockAsm12B + CMPQ R11, DI + JNE no_short_found_encodeSnappyBetterBlockAsm12B + MOVL R8, SI + JMP candidate_match_encodeSnappyBetterBlockAsm12B + +no_short_found_encodeSnappyBetterBlockAsm12B: + CMPL R10, DI + JEQ candidate_match_encodeSnappyBetterBlockAsm12B + CMPL R11, DI + JEQ candidateS_match_encodeSnappyBetterBlockAsm12B + MOVL 20(SP), CX + JMP search_loop_encodeSnappyBetterBlockAsm12B candidateS_match_encodeSnappyBetterBlockAsm12B: SHRQ $0x08, DI @@ -13703,35 +15625,57 @@ match_dst_size_check_encodeSnappyBetterBlockAsm12B: // matchLen XORL R12, R12 CMPL R8, $0x08 - JL matchlen_single_match_nolit_encodeSnappyBetterBlockAsm12B + JL matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm12B matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm12B: MOVQ (R9)(R12*1), R11 XORQ (R10)(R12*1), R11 TESTQ R11, R11 JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm12B - BSFQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP match_nolit_end_encodeSnappyBetterBlockAsm12B + +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP match_nolit_end_encodeSnappyBetterBlockAsm12B matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm12B: LEAL -8(R8), R8 LEAL 8(R12), R12 CMPL R8, $0x08 JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm12B - -matchlen_single_match_nolit_encodeSnappyBetterBlockAsm12B: - TESTL R8, R8 - JZ match_nolit_end_encodeSnappyBetterBlockAsm12B - -matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm12B: + JZ match_nolit_end_encodeSnappyBetterBlockAsm12B + 
+matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm12B: + CMPL R8, $0x04 + JL matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm12B + MOVL (R9)(R12*1), R11 + CMPL (R10)(R12*1), R11 + JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm12B + SUBL $0x04, R8 + LEAL 4(R12), R12 + +matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm12B: + CMPL R8, $0x02 + JL matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm12B + MOVW (R9)(R12*1), R11 + CMPW (R10)(R12*1), R11 + JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm12B + SUBL $0x02, R8 + LEAL 2(R12), R12 + +matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm12B: + CMPL R8, $0x01 + JL match_nolit_end_encodeSnappyBetterBlockAsm12B MOVB (R9)(R12*1), R11 CMPB (R10)(R12*1), R11 JNE match_nolit_end_encodeSnappyBetterBlockAsm12B LEAL 1(R12), R12 - DECL R8 - JNZ matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm12B match_nolit_end_encodeSnappyBetterBlockAsm12B: MOVL CX, R8 @@ -13906,52 +15850,49 @@ match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm12B: match_nolit_dst_ok_encodeSnappyBetterBlockAsm12B: MOVQ $0x0000cf1bbcdcbf9b, SI MOVQ $0x9e3779b1, R8 - INCL DI - MOVQ (DX)(DI*1), R9 - MOVQ R9, R10 - MOVQ R9, R11 - MOVQ R9, R12 - SHRQ $0x08, R11 - MOVQ R11, R13 - SHRQ $0x10, R12 - LEAL 1(DI), R14 - LEAL 2(DI), R15 - MOVQ -2(DX)(CX*1), R9 + LEAQ 1(DI), DI + LEAQ -2(CX), R9 + MOVQ (DX)(DI*1), R10 + MOVQ 1(DX)(DI*1), R11 + MOVQ (DX)(R9*1), R12 + MOVQ 1(DX)(R9*1), R13 SHLQ $0x10, R10 IMULQ SI, R10 SHRQ $0x32, R10 - SHLQ $0x10, R13 - IMULQ SI, R13 - SHRQ $0x32, R13 SHLQ $0x20, R11 IMULQ R8, R11 SHRQ $0x34, R11 - SHLQ $0x20, R12 - IMULQ R8, R12 - SHRQ $0x34, R12 + SHLQ $0x10, R12 + IMULQ SI, R12 + SHRQ $0x32, R12 + SHLQ $0x20, R13 + IMULQ R8, R13 + SHRQ $0x34, R13 + LEAQ 1(DI), R8 + LEAQ 1(R9), R14 MOVL DI, 24(SP)(R10*4) - MOVL R14, 24(SP)(R13*4) - MOVL R14, 65560(SP)(R11*4) - MOVL R15, 65560(SP)(R12*4) - MOVQ R9, R10 - MOVQ R9, R11 - SHRQ $0x08, R11 - MOVQ R11, R13 - LEAL -2(CX), R9 - LEAL -1(CX), DI + MOVL R9, 24(SP)(R12*4) + MOVL R8, 65560(SP)(R11*4) + MOVL R14, 65560(SP)(R13*4) + ADDQ $0x01, DI + SUBQ $0x01, R9 + +index_loop_encodeSnappyBetterBlockAsm12B: + CMPQ DI, R9 + JAE search_loop_encodeSnappyBetterBlockAsm12B + MOVQ (DX)(DI*1), R8 + MOVQ (DX)(R9*1), R10 + SHLQ $0x10, R8 + IMULQ SI, R8 + SHRQ $0x32, R8 SHLQ $0x10, R10 IMULQ SI, R10 SHRQ $0x32, R10 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x34, R11 - SHLQ $0x10, R13 - IMULQ SI, R13 - SHRQ $0x32, R13 + MOVL DI, 24(SP)(R8*4) MOVL R9, 24(SP)(R10*4) - MOVL DI, 65560(SP)(R11*4) - MOVL DI, 24(SP)(R13*4) - JMP search_loop_encodeSnappyBetterBlockAsm12B + ADDQ $0x02, DI + SUBQ $0x02, R9 + JMP index_loop_encodeSnappyBetterBlockAsm12B emit_remainder_encodeSnappyBetterBlockAsm12B: MOVQ src_len+32(FP), CX @@ -13999,17 +15940,36 @@ memmove_emit_remainder_encodeSnappyBetterBlockAsm12B: MOVL SI, BX // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_3 CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8 + JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_4through7 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_17through32 JMP 
emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8through16: @@ -14092,7 +16052,7 @@ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B: RET // func encodeSnappyBetterBlockAsm10B(dst []byte, src []byte) int -// Requires: SSE2 +// Requires: BMI, SSE2 TEXT ·encodeSnappyBetterBlockAsm10B(SB), $20504-56 MOVQ dst_base+0(FP), AX MOVQ $0x000000a0, CX @@ -14147,12 +16107,22 @@ search_loop_encodeSnappyBetterBlockAsm10B: MOVL 16408(SP)(R11*4), R8 MOVL CX, 24(SP)(R10*4) MOVL CX, 16408(SP)(R11*4) - CMPL (DX)(SI*1), DI + MOVQ (DX)(SI*1), R10 + MOVQ (DX)(R8*1), R11 + CMPQ R10, DI JEQ candidate_match_encodeSnappyBetterBlockAsm10B - CMPL (DX)(R8*1), DI - JEQ candidateS_match_encodeSnappyBetterBlockAsm10B - MOVL 20(SP), CX - JMP search_loop_encodeSnappyBetterBlockAsm10B + CMPQ R11, DI + JNE no_short_found_encodeSnappyBetterBlockAsm10B + MOVL R8, SI + JMP candidate_match_encodeSnappyBetterBlockAsm10B + +no_short_found_encodeSnappyBetterBlockAsm10B: + CMPL R10, DI + JEQ candidate_match_encodeSnappyBetterBlockAsm10B + CMPL R11, DI + JEQ candidateS_match_encodeSnappyBetterBlockAsm10B + MOVL 20(SP), CX + JMP search_loop_encodeSnappyBetterBlockAsm10B candidateS_match_encodeSnappyBetterBlockAsm10B: SHRQ $0x08, DI @@ -14206,35 +16176,57 @@ match_dst_size_check_encodeSnappyBetterBlockAsm10B: // matchLen XORL R12, R12 CMPL R8, $0x08 - JL matchlen_single_match_nolit_encodeSnappyBetterBlockAsm10B + JL matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm10B matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm10B: MOVQ (R9)(R12*1), R11 XORQ (R10)(R12*1), R11 TESTQ R11, R11 JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm10B - BSFQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP match_nolit_end_encodeSnappyBetterBlockAsm10B + +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP match_nolit_end_encodeSnappyBetterBlockAsm10B matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm10B: LEAL -8(R8), R8 LEAL 8(R12), R12 CMPL R8, $0x08 JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm10B - -matchlen_single_match_nolit_encodeSnappyBetterBlockAsm10B: - TESTL R8, R8 - JZ match_nolit_end_encodeSnappyBetterBlockAsm10B - -matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm10B: + JZ match_nolit_end_encodeSnappyBetterBlockAsm10B + +matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm10B: + CMPL R8, $0x04 + JL matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm10B + MOVL (R9)(R12*1), R11 + CMPL (R10)(R12*1), R11 + JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm10B + SUBL $0x04, R8 + LEAL 4(R12), R12 + 
+matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm10B: + CMPL R8, $0x02 + JL matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm10B + MOVW (R9)(R12*1), R11 + CMPW (R10)(R12*1), R11 + JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm10B + SUBL $0x02, R8 + LEAL 2(R12), R12 + +matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm10B: + CMPL R8, $0x01 + JL match_nolit_end_encodeSnappyBetterBlockAsm10B MOVB (R9)(R12*1), R11 CMPB (R10)(R12*1), R11 JNE match_nolit_end_encodeSnappyBetterBlockAsm10B LEAL 1(R12), R12 - DECL R8 - JNZ matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm10B match_nolit_end_encodeSnappyBetterBlockAsm10B: MOVL CX, R8 @@ -14409,52 +16401,49 @@ match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm10B: match_nolit_dst_ok_encodeSnappyBetterBlockAsm10B: MOVQ $0x0000cf1bbcdcbf9b, SI MOVQ $0x9e3779b1, R8 - INCL DI - MOVQ (DX)(DI*1), R9 - MOVQ R9, R10 - MOVQ R9, R11 - MOVQ R9, R12 - SHRQ $0x08, R11 - MOVQ R11, R13 - SHRQ $0x10, R12 - LEAL 1(DI), R14 - LEAL 2(DI), R15 - MOVQ -2(DX)(CX*1), R9 + LEAQ 1(DI), DI + LEAQ -2(CX), R9 + MOVQ (DX)(DI*1), R10 + MOVQ 1(DX)(DI*1), R11 + MOVQ (DX)(R9*1), R12 + MOVQ 1(DX)(R9*1), R13 SHLQ $0x10, R10 IMULQ SI, R10 SHRQ $0x34, R10 - SHLQ $0x10, R13 - IMULQ SI, R13 - SHRQ $0x34, R13 SHLQ $0x20, R11 IMULQ R8, R11 SHRQ $0x36, R11 - SHLQ $0x20, R12 - IMULQ R8, R12 - SHRQ $0x36, R12 + SHLQ $0x10, R12 + IMULQ SI, R12 + SHRQ $0x34, R12 + SHLQ $0x20, R13 + IMULQ R8, R13 + SHRQ $0x36, R13 + LEAQ 1(DI), R8 + LEAQ 1(R9), R14 MOVL DI, 24(SP)(R10*4) - MOVL R14, 24(SP)(R13*4) - MOVL R14, 16408(SP)(R11*4) - MOVL R15, 16408(SP)(R12*4) - MOVQ R9, R10 - MOVQ R9, R11 - SHRQ $0x08, R11 - MOVQ R11, R13 - LEAL -2(CX), R9 - LEAL -1(CX), DI + MOVL R9, 24(SP)(R12*4) + MOVL R8, 16408(SP)(R11*4) + MOVL R14, 16408(SP)(R13*4) + ADDQ $0x01, DI + SUBQ $0x01, R9 + +index_loop_encodeSnappyBetterBlockAsm10B: + CMPQ DI, R9 + JAE search_loop_encodeSnappyBetterBlockAsm10B + MOVQ (DX)(DI*1), R8 + MOVQ (DX)(R9*1), R10 + SHLQ $0x10, R8 + IMULQ SI, R8 + SHRQ $0x34, R8 SHLQ $0x10, R10 IMULQ SI, R10 SHRQ $0x34, R10 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x36, R11 - SHLQ $0x10, R13 - IMULQ SI, R13 - SHRQ $0x34, R13 + MOVL DI, 24(SP)(R8*4) MOVL R9, 24(SP)(R10*4) - MOVL DI, 16408(SP)(R11*4) - MOVL DI, 24(SP)(R13*4) - JMP search_loop_encodeSnappyBetterBlockAsm10B + ADDQ $0x02, DI + SUBQ $0x02, R9 + JMP index_loop_encodeSnappyBetterBlockAsm10B emit_remainder_encodeSnappyBetterBlockAsm10B: MOVQ src_len+32(FP), CX @@ -14502,17 +16491,36 @@ memmove_emit_remainder_encodeSnappyBetterBlockAsm10B: MOVL SI, BX // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_3 CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8 + JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_4through7 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP 
memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL SI, (AX) + MOVL CX, -4(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8through16: @@ -14595,7 +16603,7 @@ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B: RET // func encodeSnappyBetterBlockAsm8B(dst []byte, src []byte) int -// Requires: SSE2 +// Requires: BMI, SSE2 TEXT ·encodeSnappyBetterBlockAsm8B(SB), $5144-56 MOVQ dst_base+0(FP), AX MOVQ $0x00000028, CX @@ -14650,12 +16658,22 @@ search_loop_encodeSnappyBetterBlockAsm8B: MOVL 4120(SP)(R11*4), R8 MOVL CX, 24(SP)(R10*4) MOVL CX, 4120(SP)(R11*4) - CMPL (DX)(SI*1), DI + MOVQ (DX)(SI*1), R10 + MOVQ (DX)(R8*1), R11 + CMPQ R10, DI JEQ candidate_match_encodeSnappyBetterBlockAsm8B - CMPL (DX)(R8*1), DI - JEQ candidateS_match_encodeSnappyBetterBlockAsm8B - MOVL 20(SP), CX - JMP search_loop_encodeSnappyBetterBlockAsm8B + CMPQ R11, DI + JNE no_short_found_encodeSnappyBetterBlockAsm8B + MOVL R8, SI + JMP candidate_match_encodeSnappyBetterBlockAsm8B + +no_short_found_encodeSnappyBetterBlockAsm8B: + CMPL R10, DI + JEQ candidate_match_encodeSnappyBetterBlockAsm8B + CMPL R11, DI + JEQ candidateS_match_encodeSnappyBetterBlockAsm8B + MOVL 20(SP), CX + JMP search_loop_encodeSnappyBetterBlockAsm8B candidateS_match_encodeSnappyBetterBlockAsm8B: SHRQ $0x08, DI @@ -14709,35 +16727,57 @@ match_dst_size_check_encodeSnappyBetterBlockAsm8B: // matchLen XORL R12, R12 CMPL R8, $0x08 - JL matchlen_single_match_nolit_encodeSnappyBetterBlockAsm8B + JL matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm8B matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm8B: MOVQ (R9)(R12*1), R11 XORQ (R10)(R12*1), R11 TESTQ R11, R11 JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm8B - BSFQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP match_nolit_end_encodeSnappyBetterBlockAsm8B + +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL (R12)(R11*1), R12 + JMP match_nolit_end_encodeSnappyBetterBlockAsm8B matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm8B: LEAL -8(R8), R8 LEAL 8(R12), R12 CMPL R8, $0x08 JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm8B - -matchlen_single_match_nolit_encodeSnappyBetterBlockAsm8B: - TESTL R8, R8 - JZ match_nolit_end_encodeSnappyBetterBlockAsm8B - -matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm8B: + JZ match_nolit_end_encodeSnappyBetterBlockAsm8B + +matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm8B: + CMPL R8, $0x04 + JL matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm8B + MOVL (R9)(R12*1), R11 + CMPL (R10)(R12*1), R11 + JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm8B + SUBL $0x04, R8 + LEAL 4(R12), R12 + +matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm8B: + CMPL R8, $0x02 + JL matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm8B + MOVW (R9)(R12*1), R11 + CMPW (R10)(R12*1), R11 + JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm8B + SUBL $0x02, R8 + LEAL 2(R12), R12 + +matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm8B: + CMPL R8, $0x01 + JL 
match_nolit_end_encodeSnappyBetterBlockAsm8B MOVB (R9)(R12*1), R11 CMPB (R10)(R12*1), R11 JNE match_nolit_end_encodeSnappyBetterBlockAsm8B LEAL 1(R12), R12 - DECL R8 - JNZ matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm8B match_nolit_end_encodeSnappyBetterBlockAsm8B: MOVL CX, R8 @@ -14910,52 +16950,49 @@ match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm8B: match_nolit_dst_ok_encodeSnappyBetterBlockAsm8B: MOVQ $0x0000cf1bbcdcbf9b, SI MOVQ $0x9e3779b1, R8 - INCL DI - MOVQ (DX)(DI*1), R9 - MOVQ R9, R10 - MOVQ R9, R11 - MOVQ R9, R12 - SHRQ $0x08, R11 - MOVQ R11, R13 - SHRQ $0x10, R12 - LEAL 1(DI), R14 - LEAL 2(DI), R15 - MOVQ -2(DX)(CX*1), R9 + LEAQ 1(DI), DI + LEAQ -2(CX), R9 + MOVQ (DX)(DI*1), R10 + MOVQ 1(DX)(DI*1), R11 + MOVQ (DX)(R9*1), R12 + MOVQ 1(DX)(R9*1), R13 SHLQ $0x10, R10 IMULQ SI, R10 SHRQ $0x36, R10 - SHLQ $0x10, R13 - IMULQ SI, R13 - SHRQ $0x36, R13 SHLQ $0x20, R11 IMULQ R8, R11 SHRQ $0x38, R11 - SHLQ $0x20, R12 - IMULQ R8, R12 - SHRQ $0x38, R12 + SHLQ $0x10, R12 + IMULQ SI, R12 + SHRQ $0x36, R12 + SHLQ $0x20, R13 + IMULQ R8, R13 + SHRQ $0x38, R13 + LEAQ 1(DI), R8 + LEAQ 1(R9), R14 MOVL DI, 24(SP)(R10*4) - MOVL R14, 24(SP)(R13*4) - MOVL R14, 4120(SP)(R11*4) - MOVL R15, 4120(SP)(R12*4) - MOVQ R9, R10 - MOVQ R9, R11 - SHRQ $0x08, R11 - MOVQ R11, R13 - LEAL -2(CX), R9 - LEAL -1(CX), DI + MOVL R9, 24(SP)(R12*4) + MOVL R8, 4120(SP)(R11*4) + MOVL R14, 4120(SP)(R13*4) + ADDQ $0x01, DI + SUBQ $0x01, R9 + +index_loop_encodeSnappyBetterBlockAsm8B: + CMPQ DI, R9 + JAE search_loop_encodeSnappyBetterBlockAsm8B + MOVQ (DX)(DI*1), R8 + MOVQ (DX)(R9*1), R10 + SHLQ $0x10, R8 + IMULQ SI, R8 + SHRQ $0x36, R8 SHLQ $0x10, R10 IMULQ SI, R10 SHRQ $0x36, R10 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x38, R11 - SHLQ $0x10, R13 - IMULQ SI, R13 - SHRQ $0x36, R13 + MOVL DI, 24(SP)(R8*4) MOVL R9, 24(SP)(R10*4) - MOVL DI, 4120(SP)(R11*4) - MOVL DI, 24(SP)(R13*4) - JMP search_loop_encodeSnappyBetterBlockAsm8B + ADDQ $0x02, DI + SUBQ $0x02, R9 + JMP index_loop_encodeSnappyBetterBlockAsm8B emit_remainder_encodeSnappyBetterBlockAsm8B: MOVQ src_len+32(FP), CX @@ -15003,17 +17040,36 @@ memmove_emit_remainder_encodeSnappyBetterBlockAsm8B: MOVL SI, BX // genMemMoveShort + CMPQ BX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_3 CMPQ BX, $0x08 - JLE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8 + JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_4through7 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8: - MOVQ (CX), SI - MOVQ SI, (AX) +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_1or2: + MOVB (CX), SI + MOVB -1(CX)(BX*1), CL + MOVB SI, (AX) + MOVB CL, -1(AX)(BX*1) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_3: + MOVW (CX), SI + MOVB 2(CX), CL + MOVW SI, (AX) + MOVB CL, 2(AX) + JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_4through7: + MOVL (CX), SI + MOVL -4(CX)(BX*1), CX + MOVL 
SI, (AX) + MOVL CX, -4(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8through16: @@ -15454,6 +17510,97 @@ four_bytes_remain_standalone: two_byte_offset_standalone: CMPL DX, $0x40 JLE two_byte_offset_short_standalone + CMPL CX, $0x00000800 + JAE long_offset_short_standalone + MOVL $0x00000001, SI + LEAL 16(SI), SI + MOVB CL, 1(AX) + MOVL CX, DI + SHRL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) + ADDQ $0x02, BX + ADDQ $0x02, AX + SUBL $0x08, DX + + // emitRepeat + LEAL -4(DX), DX + JMP cant_repeat_two_offset_standalone_emit_copy_short_2b + +emit_repeat_again_standalone_emit_copy_short_2b: + MOVL DX, SI + LEAL -4(DX), DX + CMPL SI, $0x08 + JLE repeat_two_standalone_emit_copy_short_2b + CMPL SI, $0x0c + JGE cant_repeat_two_offset_standalone_emit_copy_short_2b + CMPL CX, $0x00000800 + JLT repeat_two_offset_standalone_emit_copy_short_2b + +cant_repeat_two_offset_standalone_emit_copy_short_2b: + CMPL DX, $0x00000104 + JLT repeat_three_standalone_emit_copy_short_2b + CMPL DX, $0x00010100 + JLT repeat_four_standalone_emit_copy_short_2b + CMPL DX, $0x0100ffff + JLT repeat_five_standalone_emit_copy_short_2b + LEAL -16842747(DX), DX + MOVW $0x001d, (AX) + MOVW $0xfffb, 2(AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + ADDQ $0x05, BX + JMP emit_repeat_again_standalone_emit_copy_short_2b + +repeat_five_standalone_emit_copy_short_2b: + LEAL -65536(DX), DX + MOVL DX, CX + MOVW $0x001d, (AX) + MOVW DX, 2(AX) + SARL $0x10, CX + MOVB CL, 4(AX) + ADDQ $0x05, BX + ADDQ $0x05, AX + JMP gen_emit_copy_end + +repeat_four_standalone_emit_copy_short_2b: + LEAL -256(DX), DX + MOVW $0x0019, (AX) + MOVW DX, 2(AX) + ADDQ $0x04, BX + ADDQ $0x04, AX + JMP gen_emit_copy_end + +repeat_three_standalone_emit_copy_short_2b: + LEAL -4(DX), DX + MOVW $0x0015, (AX) + MOVB DL, 2(AX) + ADDQ $0x03, BX + ADDQ $0x03, AX + JMP gen_emit_copy_end + +repeat_two_standalone_emit_copy_short_2b: + SHLL $0x02, DX + ORL $0x01, DX + MOVW DX, (AX) + ADDQ $0x02, BX + ADDQ $0x02, AX + JMP gen_emit_copy_end + +repeat_two_offset_standalone_emit_copy_short_2b: + XORQ SI, SI + LEAL 1(SI)(DX*4), DX + MOVB CL, 1(AX) + SARL $0x08, CX + SHLL $0x05, CX + ORL CX, DX + MOVB DL, (AX) + ADDQ $0x02, BX + ADDQ $0x02, AX + JMP gen_emit_copy_end + +long_offset_short_standalone: MOVB $0xee, (AX) MOVW CX, 1(AX) LEAL -60(DX), DX @@ -15635,6 +17782,7 @@ gen_emit_copy_end_snappy: RET // func matchLen(a []byte, b []byte) int +// Requires: BMI TEXT ·matchLen(SB), NOSPLIT, $0-56 MOVQ a_base+0(FP), AX MOVQ b_base+24(FP), CX @@ -15643,35 +17791,57 @@ TEXT ·matchLen(SB), NOSPLIT, $0-56 // matchLen XORL SI, SI CMPL DX, $0x08 - JL matchlen_single_standalone + JL matchlen_match4_standalone matchlen_loopback_standalone: MOVQ (AX)(SI*1), BX XORQ (CX)(SI*1), BX TESTQ BX, BX JZ matchlen_loop_standalone - BSFQ BX, BX - SARQ $0x03, BX - LEAL (SI)(BX*1), SI - JMP gen_match_len_end + +#ifdef GOAMD64_v3 + TZCNTQ BX, BX + +#else + BSFQ BX, BX + +#endif + SARQ $0x03, BX + LEAL (SI)(BX*1), SI + JMP gen_match_len_end matchlen_loop_standalone: LEAL -8(DX), DX LEAL 8(SI), SI CMPL DX, $0x08 JGE matchlen_loopback_standalone + JZ gen_match_len_end -matchlen_single_standalone: - TESTL DX, DX - JZ gen_match_len_end - -matchlen_single_loopback_standalone: +matchlen_match4_standalone: + CMPL DX, $0x04 + JL matchlen_match2_standalone + MOVL (AX)(SI*1), BX + CMPL (CX)(SI*1), BX + JNE matchlen_match2_standalone + SUBL $0x04, DX + LEAL 4(SI), SI + +matchlen_match2_standalone: + CMPL DX, $0x02 + 
JL matchlen_match1_standalone + MOVW (AX)(SI*1), BX + CMPW (CX)(SI*1), BX + JNE matchlen_match1_standalone + SUBL $0x02, DX + LEAL 2(SI), SI + +matchlen_match1_standalone: + CMPL DX, $0x01 + JL gen_match_len_end MOVB (AX)(SI*1), BL CMPB (CX)(SI*1), BL JNE gen_match_len_end LEAL 1(SI), SI - DECL DX - JNZ matchlen_single_loopback_standalone gen_match_len_end: MOVQ SI, ret+48(FP) diff --git a/vendor/github.com/klauspost/compress/s2/index.go b/vendor/github.com/klauspost/compress/s2/index.go index fd857682e46d..dd9ecfe71854 100644 --- a/vendor/github.com/klauspost/compress/s2/index.go +++ b/vendor/github.com/klauspost/compress/s2/index.go @@ -10,6 +10,7 @@ import ( "encoding/json" "fmt" "io" + "sort" ) const ( @@ -100,6 +101,15 @@ func (i *Index) Find(offset int64) (compressedOff, uncompressedOff int64, err er if offset > i.TotalUncompressed { return 0, 0, io.ErrUnexpectedEOF } + if len(i.info) > 200 { + n := sort.Search(len(i.info), func(n int) bool { + return i.info[n].uncompressedOffset > offset + }) + if n == 0 { + n = 1 + } + return i.info[n-1].compressedOffset, i.info[n-1].uncompressedOffset, nil + } for _, info := range i.info { if info.uncompressedOffset > offset { break @@ -523,3 +533,66 @@ func (i *Index) JSON() []byte { b, _ := json.MarshalIndent(x, "", " ") return b } + +// RemoveIndexHeaders will trim all headers and trailers from a given index. +// This is expected to save 20 bytes. +// These can be restored using RestoreIndexHeaders. +// This removes a layer of security, but is the most compact representation. +// Returns nil if headers contains errors. +// The returned slice references the provided slice. +func RemoveIndexHeaders(b []byte) []byte { + const save = 4 + len(S2IndexHeader) + len(S2IndexTrailer) + 4 + if len(b) <= save { + return nil + } + if b[0] != ChunkTypeIndex { + return nil + } + chunkLen := int(b[1]) | int(b[2])<<8 | int(b[3])<<16 + b = b[4:] + + // Validate we have enough... + if len(b) < chunkLen { + return nil + } + b = b[:chunkLen] + + if !bytes.Equal(b[:len(S2IndexHeader)], []byte(S2IndexHeader)) { + return nil + } + b = b[len(S2IndexHeader):] + if !bytes.HasSuffix(b, []byte(S2IndexTrailer)) { + return nil + } + b = bytes.TrimSuffix(b, []byte(S2IndexTrailer)) + + if len(b) < 4 { + return nil + } + return b[:len(b)-4] +} + +// RestoreIndexHeaders will index restore headers removed by RemoveIndexHeaders. +// No error checking is performed on the input. +// If a 0 length slice is sent, it is returned without modification. +func RestoreIndexHeaders(in []byte) []byte { + if len(in) == 0 { + return in + } + b := make([]byte, 0, 4+len(S2IndexHeader)+len(in)+len(S2IndexTrailer)+4) + b = append(b, ChunkTypeIndex, 0, 0, 0) + b = append(b, []byte(S2IndexHeader)...) + b = append(b, in...) + + var tmp [4]byte + binary.LittleEndian.PutUint32(tmp[:], uint32(len(b)+4+len(S2IndexTrailer))) + b = append(b, tmp[:4]...) + // Trailer + b = append(b, []byte(S2IndexTrailer)...) 
+ + chunkLen := len(b) - skippableFrameHeader + b[1] = uint8(chunkLen >> 0) + b[2] = uint8(chunkLen >> 8) + b[3] = uint8(chunkLen >> 16) + return b +} diff --git a/vendor/github.com/klauspost/compress/zstd/README.md b/vendor/github.com/klauspost/compress/zstd/README.md index c8f0f16fc1ec..65b38abed805 100644 --- a/vendor/github.com/klauspost/compress/zstd/README.md +++ b/vendor/github.com/klauspost/compress/zstd/README.md @@ -12,6 +12,8 @@ The `zstd` package is provided as open source software using a Go standard licen Currently the package is heavily optimized for 64 bit processors and will be significantly slower on 32 bit processors. +For seekable zstd streams, see [this excellent package](https://github.com/SaveTheRbtz/zstd-seekable-format-go). + ## Installation Install using `go get -u github.com/klauspost/compress`. The package is located in `github.com/klauspost/compress/zstd`. @@ -78,6 +80,9 @@ of a stream. This is independent of the `WithEncoderConcurrency(n)`, but that is in the future. So if you want to limit concurrency for future updates, specify the concurrency you would like. +If you would like stream encoding to be done without spawning async goroutines, use `WithEncoderConcurrency(1)` +which will compress input as each block is completed, blocking on writes until each has completed. + You can specify your desired compression level using `WithEncoderLevel()` option. Currently only pre-defined compression settings can be specified. @@ -104,7 +109,8 @@ and seems to ignore concatenated streams, even though [it is part of the spec](h For compressing small blocks, the returned encoder has a function called `EncodeAll(src, dst []byte) []byte`. `EncodeAll` will encode all input in src and append it to dst. -This function can be called concurrently, but each call will only run on a single goroutine. +This function can be called concurrently. +Each call will only run on a same goroutine as the caller. Encoded blocks can be concatenated and the result will be the combined input stream. Data compressed with EncodeAll can be decoded with the Decoder, using either a stream or `DecodeAll`. @@ -149,10 +155,10 @@ http://sun.aei.polsl.pl/~sdeor/corpus/silesia.zip This package: file out level insize outsize millis mb/s -silesia.tar zskp 1 211947520 73101992 643 313.87 -silesia.tar zskp 2 211947520 67504318 969 208.38 -silesia.tar zskp 3 211947520 64595893 2007 100.68 -silesia.tar zskp 4 211947520 60995370 8825 22.90 +silesia.tar zskp 1 211947520 73821326 634 318.47 +silesia.tar zskp 2 211947520 67655404 1508 133.96 +silesia.tar zskp 3 211947520 64746933 3000 67.37 +silesia.tar zskp 4 211947520 60073508 16926 11.94 cgo zstd: silesia.tar zstd 1 211947520 73605392 543 371.56 @@ -161,94 +167,94 @@ silesia.tar zstd 6 211947520 62916450 1913 105.66 silesia.tar zstd 9 211947520 60212393 5063 39.92 gzip, stdlib/this package: -silesia.tar gzstd 1 211947520 80007735 1654 122.21 -silesia.tar gzkp 1 211947520 80136201 1152 175.45 +silesia.tar gzstd 1 211947520 80007735 1498 134.87 +silesia.tar gzkp 1 211947520 80088272 1009 200.31 GOB stream of binary data. Highly compressible. 
https://files.klauspost.com/compress/gob-stream.7z file out level insize outsize millis mb/s -gob-stream zskp 1 1911399616 235022249 3088 590.30 -gob-stream zskp 2 1911399616 205669791 3786 481.34 -gob-stream zskp 3 1911399616 175034659 9636 189.17 -gob-stream zskp 4 1911399616 165609838 50369 36.19 +gob-stream zskp 1 1911399616 233948096 3230 564.34 +gob-stream zskp 2 1911399616 203997694 4997 364.73 +gob-stream zskp 3 1911399616 173526523 13435 135.68 +gob-stream zskp 4 1911399616 162195235 47559 38.33 gob-stream zstd 1 1911399616 249810424 2637 691.26 gob-stream zstd 3 1911399616 208192146 3490 522.31 gob-stream zstd 6 1911399616 193632038 6687 272.56 gob-stream zstd 9 1911399616 177620386 16175 112.70 -gob-stream gzstd 1 1911399616 357382641 10251 177.82 -gob-stream gzkp 1 1911399616 359753026 5438 335.20 +gob-stream gzstd 1 1911399616 357382013 9046 201.49 +gob-stream gzkp 1 1911399616 359136669 4885 373.08 The test data for the Large Text Compression Benchmark is the first 10^9 bytes of the English Wikipedia dump on Mar. 3, 2006. http://mattmahoney.net/dc/textdata.html file out level insize outsize millis mb/s -enwik9 zskp 1 1000000000 343848582 3609 264.18 -enwik9 zskp 2 1000000000 317276632 5746 165.97 -enwik9 zskp 3 1000000000 292243069 12162 78.41 -enwik9 zskp 4 1000000000 262183768 82837 11.51 +enwik9 zskp 1 1000000000 343833605 3687 258.64 +enwik9 zskp 2 1000000000 317001237 7672 124.29 +enwik9 zskp 3 1000000000 291915823 15923 59.89 +enwik9 zskp 4 1000000000 261710291 77697 12.27 enwik9 zstd 1 1000000000 358072021 3110 306.65 enwik9 zstd 3 1000000000 313734672 4784 199.35 enwik9 zstd 6 1000000000 295138875 10290 92.68 enwik9 zstd 9 1000000000 278348700 28549 33.40 -enwik9 gzstd 1 1000000000 382578136 9604 99.30 -enwik9 gzkp 1 1000000000 383825945 6544 145.73 +enwik9 gzstd 1 1000000000 382578136 8608 110.78 +enwik9 gzkp 1 1000000000 382781160 5628 169.45 Highly compressible JSON file. 
https://files.klauspost.com/compress/github-june-2days-2019.json.zst file out level insize outsize millis mb/s -github-june-2days-2019.json zskp 1 6273951764 699045015 10620 563.40 -github-june-2days-2019.json zskp 2 6273951764 617881763 11687 511.96 -github-june-2days-2019.json zskp 3 6273951764 524340691 34043 175.75 -github-june-2days-2019.json zskp 4 6273951764 470320075 170190 35.16 +github-june-2days-2019.json zskp 1 6273951764 697439532 9789 611.17 +github-june-2days-2019.json zskp 2 6273951764 610876538 18553 322.49 +github-june-2days-2019.json zskp 3 6273951764 517662858 44186 135.41 +github-june-2days-2019.json zskp 4 6273951764 464617114 165373 36.18 github-june-2days-2019.json zstd 1 6273951764 766284037 8450 708.00 github-june-2days-2019.json zstd 3 6273951764 661889476 10927 547.57 github-june-2days-2019.json zstd 6 6273951764 642756859 22996 260.18 github-june-2days-2019.json zstd 9 6273951764 601974523 52413 114.16 -github-june-2days-2019.json gzstd 1 6273951764 1164400847 29948 199.79 -github-june-2days-2019.json gzkp 1 6273951764 1125417694 21788 274.61 +github-june-2days-2019.json gzstd 1 6273951764 1164397768 26793 223.32 +github-june-2days-2019.json gzkp 1 6273951764 1120631856 17693 338.16 VM Image, Linux mint with a few installed applications: https://files.klauspost.com/compress/rawstudio-mint14.7z file out level insize outsize millis mb/s -rawstudio-mint14.tar zskp 1 8558382592 3667489370 20210 403.84 -rawstudio-mint14.tar zskp 2 8558382592 3364592300 31873 256.07 -rawstudio-mint14.tar zskp 3 8558382592 3158085214 77675 105.08 -rawstudio-mint14.tar zskp 4 8558382592 2965110639 857750 9.52 +rawstudio-mint14.tar zskp 1 8558382592 3718400221 18206 448.29 +rawstudio-mint14.tar zskp 2 8558382592 3326118337 37074 220.15 +rawstudio-mint14.tar zskp 3 8558382592 3163842361 87306 93.49 +rawstudio-mint14.tar zskp 4 8558382592 2970480650 783862 10.41 rawstudio-mint14.tar zstd 1 8558382592 3609250104 17136 476.27 rawstudio-mint14.tar zstd 3 8558382592 3341679997 29262 278.92 rawstudio-mint14.tar zstd 6 8558382592 3235846406 77904 104.77 rawstudio-mint14.tar zstd 9 8558382592 3160778861 140946 57.91 -rawstudio-mint14.tar gzstd 1 8558382592 3926257486 57722 141.40 -rawstudio-mint14.tar gzkp 1 8558382592 3962605659 45113 180.92 +rawstudio-mint14.tar gzstd 1 8558382592 3926234992 51345 158.96 +rawstudio-mint14.tar gzkp 1 8558382592 3960117298 36722 222.26 CSV data: https://files.klauspost.com/compress/nyc-taxi-data-10M.csv.zst file out level insize outsize millis mb/s -nyc-taxi-data-10M.csv zskp 1 3325605752 641339945 8925 355.35 -nyc-taxi-data-10M.csv zskp 2 3325605752 591748091 11268 281.44 -nyc-taxi-data-10M.csv zskp 3 3325605752 530289687 25239 125.66 -nyc-taxi-data-10M.csv zskp 4 3325605752 476268884 135958 23.33 +nyc-taxi-data-10M.csv zskp 1 3325605752 641319332 9462 335.17 +nyc-taxi-data-10M.csv zskp 2 3325605752 588976126 17570 180.50 +nyc-taxi-data-10M.csv zskp 3 3325605752 529329260 32432 97.79 +nyc-taxi-data-10M.csv zskp 4 3325605752 474949772 138025 22.98 nyc-taxi-data-10M.csv zstd 1 3325605752 687399637 8233 385.18 nyc-taxi-data-10M.csv zstd 3 3325605752 598514411 10065 315.07 nyc-taxi-data-10M.csv zstd 6 3325605752 570522953 20038 158.27 nyc-taxi-data-10M.csv zstd 9 3325605752 517554797 64565 49.12 -nyc-taxi-data-10M.csv gzstd 1 3325605752 928656485 23876 132.83 -nyc-taxi-data-10M.csv gzkp 1 3325605752 922257165 16780 189.00 +nyc-taxi-data-10M.csv gzstd 1 3325605752 928654908 21270 149.11 +nyc-taxi-data-10M.csv gzkp 1 3325605752 922273214 13929 227.68 ``` ## 
Decompressor @@ -283,8 +289,13 @@ func Decompress(in io.Reader, out io.Writer) error { } ``` -It is important to use the "Close" function when you no longer need the Reader to stop running goroutines. -See "Allocation-less operation" below. +It is important to use the "Close" function when you no longer need the Reader to stop running goroutines, +when running with default settings. +Goroutines will exit once an error has been returned, including `io.EOF` at the end of a stream. + +Streams are decoded concurrently in 4 asynchronous stages to give the best possible throughput. +However, if you prefer synchronous decompression, use `WithDecoderConcurrency(1)` which will decompress data +as it is being requested only. For decoding buffers, it could look something like this: @@ -293,7 +304,7 @@ import "github.com/klauspost/compress/zstd" // Create a reader that caches decompressors. // For this operation type we supply a nil Reader. -var decoder, _ = zstd.NewReader(nil) +var decoder, _ = zstd.NewReader(nil, WithDecoderConcurrency(0)) // Decompress a buffer. We don't supply a destination buffer, // so it will be allocated by the decoder. @@ -303,9 +314,12 @@ func Decompress(src []byte) ([]byte, error) { ``` Both of these cases should provide the functionality needed. -The decoder can be used for *concurrent* decompression of multiple buffers. +The decoder can be used for *concurrent* decompression of multiple buffers. +By default 4 decompressors will be created. + It will only allow a certain number of concurrent operations to run. -To tweak that yourself use the `WithDecoderConcurrency(n)` option when creating the decoder. +To tweak that yourself use the `WithDecoderConcurrency(n)` option when creating the decoder. +It is possible to use `WithDecoderConcurrency(0)` to create GOMAXPROCS decoders. ### Dictionaries @@ -357,62 +371,48 @@ In this case no unneeded allocations should be made. The buffer decoder does everything on the same goroutine and does nothing concurrently. It can however decode several buffers concurrently. Use `WithDecoderConcurrency(n)` to limit that. -The stream decoder operates on +The stream decoder will create goroutines that: -* One goroutine reads input and splits the input to several block decoders. -* A number of decoders will decode blocks. -* A goroutine coordinates these blocks and sends history from one to the next. +1) Reads input and splits the input into blocks. +2) Decompression of literals. +3) Decompression of sequences. +4) Reconstruction of output stream. So effectively this also means the decoder will "read ahead" and prepare data to always be available for output. +The concurrency level will, for streams, determine how many blocks ahead the compression will start. + Since "blocks" are quite dependent on the output of the previous block stream decoding will only have limited concurrency. -In practice this means that concurrency is often limited to utilizing about 2 cores effectively. - - +In practice this means that concurrency is often limited to utilizing about 3 cores effectively. + ### Benchmarks -These are some examples of performance compared to [datadog cgo library](https://github.com/DataDog/zstd). - The first two are streaming decodes and the last are smaller inputs. - + +Running on AMD Ryzen 9 3950X 16-Core Processor. AMD64 assembly used. 
+ ``` -BenchmarkDecoderSilesia-8 3 385000067 ns/op 550.51 MB/s 5498 B/op 8 allocs/op -BenchmarkDecoderSilesiaCgo-8 6 197666567 ns/op 1072.25 MB/s 270672 B/op 8 allocs/op - -BenchmarkDecoderEnwik9-8 1 2027001600 ns/op 493.34 MB/s 10496 B/op 18 allocs/op -BenchmarkDecoderEnwik9Cgo-8 2 979499200 ns/op 1020.93 MB/s 270672 B/op 8 allocs/op - -Concurrent performance: - -BenchmarkDecoder_DecodeAllParallel/kppkn.gtb.zst-16 28915 42469 ns/op 4340.07 MB/s 114 B/op 0 allocs/op -BenchmarkDecoder_DecodeAllParallel/geo.protodata.zst-16 116505 9965 ns/op 11900.16 MB/s 16 B/op 0 allocs/op -BenchmarkDecoder_DecodeAllParallel/plrabn12.txt.zst-16 8952 134272 ns/op 3588.70 MB/s 915 B/op 0 allocs/op -BenchmarkDecoder_DecodeAllParallel/lcet10.txt.zst-16 11820 102538 ns/op 4161.90 MB/s 594 B/op 0 allocs/op -BenchmarkDecoder_DecodeAllParallel/asyoulik.txt.zst-16 34782 34184 ns/op 3661.88 MB/s 60 B/op 0 allocs/op -BenchmarkDecoder_DecodeAllParallel/alice29.txt.zst-16 27712 43447 ns/op 3500.58 MB/s 99 B/op 0 allocs/op -BenchmarkDecoder_DecodeAllParallel/html_x_4.zst-16 62826 18750 ns/op 21845.10 MB/s 104 B/op 0 allocs/op -BenchmarkDecoder_DecodeAllParallel/paper-100k.pdf.zst-16 631545 1794 ns/op 57078.74 MB/s 2 B/op 0 allocs/op -BenchmarkDecoder_DecodeAllParallel/fireworks.jpeg.zst-16 1690140 712 ns/op 172938.13 MB/s 1 B/op 0 allocs/op -BenchmarkDecoder_DecodeAllParallel/urls.10K.zst-16 10432 113593 ns/op 6180.73 MB/s 1143 B/op 0 allocs/op -BenchmarkDecoder_DecodeAllParallel/html.zst-16 113206 10671 ns/op 9596.27 MB/s 15 B/op 0 allocs/op -BenchmarkDecoder_DecodeAllParallel/comp-data.bin.zst-16 1530615 779 ns/op 5229.49 MB/s 0 B/op 0 allocs/op - -BenchmarkDecoder_DecodeAllParallelCgo/kppkn.gtb.zst-16 65217 16192 ns/op 11383.34 MB/s 46 B/op 0 allocs/op -BenchmarkDecoder_DecodeAllParallelCgo/geo.protodata.zst-16 292671 4039 ns/op 29363.19 MB/s 6 B/op 0 allocs/op -BenchmarkDecoder_DecodeAllParallelCgo/plrabn12.txt.zst-16 26314 46021 ns/op 10470.43 MB/s 293 B/op 0 allocs/op -BenchmarkDecoder_DecodeAllParallelCgo/lcet10.txt.zst-16 33897 34900 ns/op 12227.96 MB/s 205 B/op 0 allocs/op -BenchmarkDecoder_DecodeAllParallelCgo/asyoulik.txt.zst-16 104348 11433 ns/op 10949.01 MB/s 20 B/op 0 allocs/op -BenchmarkDecoder_DecodeAllParallelCgo/alice29.txt.zst-16 75949 15510 ns/op 9805.60 MB/s 32 B/op 0 allocs/op -BenchmarkDecoder_DecodeAllParallelCgo/html_x_4.zst-16 173910 6756 ns/op 60624.29 MB/s 37 B/op 0 allocs/op -BenchmarkDecoder_DecodeAllParallelCgo/paper-100k.pdf.zst-16 923076 1339 ns/op 76474.87 MB/s 1 B/op 0 allocs/op -BenchmarkDecoder_DecodeAllParallelCgo/fireworks.jpeg.zst-16 922920 1351 ns/op 91102.57 MB/s 2 B/op 0 allocs/op -BenchmarkDecoder_DecodeAllParallelCgo/urls.10K.zst-16 27649 43618 ns/op 16096.19 MB/s 407 B/op 0 allocs/op -BenchmarkDecoder_DecodeAllParallelCgo/html.zst-16 279073 4160 ns/op 24614.18 MB/s 6 B/op 0 allocs/op -BenchmarkDecoder_DecodeAllParallelCgo/comp-data.bin.zst-16 749938 1579 ns/op 2581.71 MB/s 0 B/op 0 allocs/op +BenchmarkDecoderSilesia-32 5 206878840 ns/op 1024.50 MB/s 49808 B/op 43 allocs/op +BenchmarkDecoderEnwik9-32 1 1271809000 ns/op 786.28 MB/s 72048 B/op 52 allocs/op + +Concurrent blocks, performance: + +BenchmarkDecoder_DecodeAllParallel/kppkn.gtb.zst-32 67356 17857 ns/op 10321.96 MB/s 22.48 pct 102 B/op 0 allocs/op +BenchmarkDecoder_DecodeAllParallel/geo.protodata.zst-32 266656 4421 ns/op 26823.21 MB/s 11.89 pct 19 B/op 0 allocs/op +BenchmarkDecoder_DecodeAllParallel/plrabn12.txt.zst-32 20992 56842 ns/op 8477.17 MB/s 39.90 pct 754 B/op 0 allocs/op 
+BenchmarkDecoder_DecodeAllParallel/lcet10.txt.zst-32 27456 43932 ns/op 9714.01 MB/s 33.27 pct 524 B/op 0 allocs/op +BenchmarkDecoder_DecodeAllParallel/asyoulik.txt.zst-32 78432 15047 ns/op 8319.15 MB/s 40.34 pct 66 B/op 0 allocs/op +BenchmarkDecoder_DecodeAllParallel/alice29.txt.zst-32 65800 18436 ns/op 8249.63 MB/s 37.75 pct 88 B/op 0 allocs/op +BenchmarkDecoder_DecodeAllParallel/html_x_4.zst-32 102993 11523 ns/op 35546.09 MB/s 3.637 pct 143 B/op 0 allocs/op +BenchmarkDecoder_DecodeAllParallel/paper-100k.pdf.zst-32 1000000 1070 ns/op 95720.98 MB/s 80.53 pct 3 B/op 0 allocs/op +BenchmarkDecoder_DecodeAllParallel/fireworks.jpeg.zst-32 749802 1752 ns/op 70272.35 MB/s 100.0 pct 5 B/op 0 allocs/op +BenchmarkDecoder_DecodeAllParallel/urls.10K.zst-32 22640 52934 ns/op 13263.37 MB/s 26.25 pct 1014 B/op 0 allocs/op +BenchmarkDecoder_DecodeAllParallel/html.zst-32 226412 5232 ns/op 19572.27 MB/s 14.49 pct 20 B/op 0 allocs/op +BenchmarkDecoder_DecodeAllParallel/comp-data.bin.zst-32 923041 1276 ns/op 3194.71 MB/s 31.26 pct 0 B/op 0 allocs/op ``` -This reflects the performance around May 2020, but this may be out of date. +This reflects the performance around May 2022, but this may be out of date. ## Zstd inside ZIP files diff --git a/vendor/github.com/klauspost/compress/zstd/bitreader.go b/vendor/github.com/klauspost/compress/zstd/bitreader.go index 753d17df634c..97299d499cf0 100644 --- a/vendor/github.com/klauspost/compress/zstd/bitreader.go +++ b/vendor/github.com/klauspost/compress/zstd/bitreader.go @@ -7,6 +7,7 @@ package zstd import ( "encoding/binary" "errors" + "fmt" "io" "math/bits" ) @@ -62,13 +63,6 @@ func (b *bitReader) get32BitsFast(n uint8) uint32 { return v } -func (b *bitReader) get16BitsFast(n uint8) uint16 { - const regMask = 64 - 1 - v := uint16((b.value << (b.bitsRead & regMask)) >> ((regMask + 1 - n) & regMask)) - b.bitsRead += n - return v -} - // fillFast() will make sure at least 32 bits are available. // There must be at least 4 bytes available. func (b *bitReader) fillFast() { @@ -132,6 +126,9 @@ func (b *bitReader) remain() uint { func (b *bitReader) close() error { // Release reference. b.in = nil + if !b.finished() { + return fmt.Errorf("%d extra bits on block, should be 0", b.remain()) + } if b.bitsRead > 64 { return io.ErrUnexpectedEOF } diff --git a/vendor/github.com/klauspost/compress/zstd/bitwriter.go b/vendor/github.com/klauspost/compress/zstd/bitwriter.go index b36618285095..78b3c61be3ec 100644 --- a/vendor/github.com/klauspost/compress/zstd/bitwriter.go +++ b/vendor/github.com/klauspost/compress/zstd/bitwriter.go @@ -5,8 +5,6 @@ package zstd -import "fmt" - // bitWriter will write bits. // First bit will be LSB of the first byte of output. type bitWriter struct { @@ -73,80 +71,6 @@ func (b *bitWriter) addBits16Clean(value uint16, bits uint8) { b.nBits += bits } -// flush will flush all pending full bytes. -// There will be at least 56 bits available for writing when this has been called. -// Using flush32 is faster, but leaves less space for writing. 
-func (b *bitWriter) flush() { - v := b.nBits >> 3 - switch v { - case 0: - case 1: - b.out = append(b.out, - byte(b.bitContainer), - ) - case 2: - b.out = append(b.out, - byte(b.bitContainer), - byte(b.bitContainer>>8), - ) - case 3: - b.out = append(b.out, - byte(b.bitContainer), - byte(b.bitContainer>>8), - byte(b.bitContainer>>16), - ) - case 4: - b.out = append(b.out, - byte(b.bitContainer), - byte(b.bitContainer>>8), - byte(b.bitContainer>>16), - byte(b.bitContainer>>24), - ) - case 5: - b.out = append(b.out, - byte(b.bitContainer), - byte(b.bitContainer>>8), - byte(b.bitContainer>>16), - byte(b.bitContainer>>24), - byte(b.bitContainer>>32), - ) - case 6: - b.out = append(b.out, - byte(b.bitContainer), - byte(b.bitContainer>>8), - byte(b.bitContainer>>16), - byte(b.bitContainer>>24), - byte(b.bitContainer>>32), - byte(b.bitContainer>>40), - ) - case 7: - b.out = append(b.out, - byte(b.bitContainer), - byte(b.bitContainer>>8), - byte(b.bitContainer>>16), - byte(b.bitContainer>>24), - byte(b.bitContainer>>32), - byte(b.bitContainer>>40), - byte(b.bitContainer>>48), - ) - case 8: - b.out = append(b.out, - byte(b.bitContainer), - byte(b.bitContainer>>8), - byte(b.bitContainer>>16), - byte(b.bitContainer>>24), - byte(b.bitContainer>>32), - byte(b.bitContainer>>40), - byte(b.bitContainer>>48), - byte(b.bitContainer>>56), - ) - default: - panic(fmt.Errorf("bits (%d) > 64", b.nBits)) - } - b.bitContainer >>= v << 3 - b.nBits &= 7 -} - // flush32 will flush out, so there are at least 32 bits available for writing. func (b *bitWriter) flush32() { if b.nBits < 32 { diff --git a/vendor/github.com/klauspost/compress/zstd/blockdec.go b/vendor/github.com/klauspost/compress/zstd/blockdec.go index 8a98c4562e01..f52d1aed6fe6 100644 --- a/vendor/github.com/klauspost/compress/zstd/blockdec.go +++ b/vendor/github.com/klauspost/compress/zstd/blockdec.go @@ -5,9 +5,13 @@ package zstd import ( + "bytes" + "encoding/binary" "errors" "fmt" "io" + "os" + "path/filepath" "sync" "github.com/klauspost/compress/huff0" @@ -38,14 +42,14 @@ const ( // maxCompressedBlockSize is the biggest allowed compressed block size (128KB) maxCompressedBlockSize = 128 << 10 + compressedBlockOverAlloc = 16 + maxCompressedBlockSizeAlloc = 128<<10 + compressedBlockOverAlloc + // Maximum possible block size (all Raw+Uncompressed). maxBlockSize = (1 << 21) - 1 - // https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#literals_section_header - maxCompressedLiteralSize = 1 << 18 - maxRLELiteralSize = 1 << 20 - maxMatchLen = 131074 - maxSequences = 0x7f00 + 0xffff + maxMatchLen = 131074 + maxSequences = 0x7f00 + 0xffff // We support slightly less than the reference decoder to be able to // use ints on 32 bit archs. @@ -76,20 +80,27 @@ type blockDec struct { // Window size of the block. WindowSize uint64 - history chan *history - input chan struct{} - result chan decodeOutput - sequenceBuf []seq - err error - decWG sync.WaitGroup + err error + + // Check against this crc + checkCRC []byte // Frame to use for singlethreaded decoding. // Should not be used by the decoder itself since parent may be another frame. localFrame *frameDec + sequence []seqVals + + async struct { + newHist *history + literals []byte + seqData []byte + seqSize int // Size of uncompressed sequences + fcs uint64 + } + // Block is RLE, this is the size. 
RLESize uint32 - tmp [4]byte Type blockType @@ -109,13 +120,8 @@ func (b *blockDec) String() string { func newBlockDec(lowMem bool) *blockDec { b := blockDec{ - lowMem: lowMem, - result: make(chan decodeOutput, 1), - input: make(chan struct{}, 1), - history: make(chan *history, 1), + lowMem: lowMem, } - b.decWG.Add(1) - go b.startDecoder() return &b } @@ -133,11 +139,17 @@ func (b *blockDec) reset(br byteBuffer, windowSize uint64) error { b.Type = blockType((bh >> 1) & 3) // find size. cSize := int(bh >> 3) - maxSize := maxBlockSize + maxSize := maxCompressedBlockSizeAlloc switch b.Type { case blockTypeReserved: return ErrReservedBlockType case blockTypeRLE: + if cSize > maxCompressedBlockSize || cSize > int(b.WindowSize) { + if debugDecoder { + printf("rle block too big: csize:%d block: %+v\n", uint64(cSize), b) + } + return ErrWindowSizeExceeded + } b.RLESize = uint32(cSize) if b.lowMem { maxSize = cSize @@ -148,9 +160,9 @@ func (b *blockDec) reset(br byteBuffer, windowSize uint64) error { println("Data size on stream:", cSize) } b.RLESize = 0 - maxSize = maxCompressedBlockSize + maxSize = maxCompressedBlockSizeAlloc if windowSize < maxCompressedBlockSize && b.lowMem { - maxSize = int(windowSize) + maxSize = int(windowSize) + compressedBlockOverAlloc } if cSize > maxCompressedBlockSize || uint64(cSize) > b.WindowSize { if debugDecoder { @@ -158,7 +170,19 @@ func (b *blockDec) reset(br byteBuffer, windowSize uint64) error { } return ErrCompressedSizeTooBig } + // Empty compressed blocks must at least be 2 bytes + // for Literals_Block_Type and one for Sequences_Section_Header. + if cSize < 2 { + return ErrBlockTooSmall + } case blockTypeRaw: + if cSize > maxCompressedBlockSize || cSize > int(b.WindowSize) { + if debugDecoder { + printf("rle block too big: csize:%d block: %+v\n", uint64(cSize), b) + } + return ErrWindowSizeExceeded + } + b.RLESize = 0 // We do not need a destination for raw blocks. maxSize = -1 @@ -169,9 +193,9 @@ func (b *blockDec) reset(br byteBuffer, windowSize uint64) error { // Read block data. if cap(b.dataStorage) < cSize { if b.lowMem || cSize > maxCompressedBlockSize { - b.dataStorage = make([]byte, 0, cSize) + b.dataStorage = make([]byte, 0, cSize+compressedBlockOverAlloc) } else { - b.dataStorage = make([]byte, 0, maxCompressedBlockSize) + b.dataStorage = make([]byte, 0, maxCompressedBlockSizeAlloc) } } if cap(b.dst) <= maxSize { @@ -193,85 +217,14 @@ func (b *blockDec) sendErr(err error) { b.Last = true b.Type = blockTypeReserved b.err = err - b.input <- struct{}{} } // Close will release resources. // Closed blockDec cannot be reset. func (b *blockDec) Close() { - close(b.input) - close(b.history) - close(b.result) - b.decWG.Wait() } -// decodeAsync will prepare decoding the block when it receives input. -// This will separate output and history. 
-func (b *blockDec) startDecoder() { - defer b.decWG.Done() - for range b.input { - //println("blockDec: Got block input") - switch b.Type { - case blockTypeRLE: - if cap(b.dst) < int(b.RLESize) { - if b.lowMem { - b.dst = make([]byte, b.RLESize) - } else { - b.dst = make([]byte, maxBlockSize) - } - } - o := decodeOutput{ - d: b, - b: b.dst[:b.RLESize], - err: nil, - } - v := b.data[0] - for i := range o.b { - o.b[i] = v - } - hist := <-b.history - hist.append(o.b) - b.result <- o - case blockTypeRaw: - o := decodeOutput{ - d: b, - b: b.data, - err: nil, - } - hist := <-b.history - hist.append(o.b) - b.result <- o - case blockTypeCompressed: - b.dst = b.dst[:0] - err := b.decodeCompressed(nil) - o := decodeOutput{ - d: b, - b: b.dst, - err: err, - } - if debugDecoder { - println("Decompressed to", len(b.dst), "bytes, error:", err) - } - b.result <- o - case blockTypeReserved: - // Used for returning errors. - <-b.history - b.result <- decodeOutput{ - d: b, - b: nil, - err: b.err, - } - default: - panic("Invalid block type") - } - if debugDecoder { - println("blockDec: Finished block") - } - } -} - -// decodeAsync will prepare decoding the block when it receives the history. -// If history is provided, it will not fetch it from the channel. +// decodeBuf func (b *blockDec) decodeBuf(hist *history) error { switch b.Type { case blockTypeRLE: @@ -294,14 +247,23 @@ func (b *blockDec) decodeBuf(hist *history) error { return nil case blockTypeCompressed: saved := b.dst - b.dst = hist.b - hist.b = nil + // Append directly to history + if hist.ignoreBuffer == 0 { + b.dst = hist.b + hist.b = nil + } else { + b.dst = b.dst[:0] + } err := b.decodeCompressed(hist) if debugDecoder { println("Decompressed to total", len(b.dst), "bytes, hash:", xxhash.Sum64(b.dst), "error:", err) } - hist.b = b.dst - b.dst = saved + if hist.ignoreBuffer == 0 { + hist.b = b.dst + b.dst = saved + } else { + hist.appendKeep(b.dst) + } return err case blockTypeReserved: // Used for returning errors. @@ -311,30 +273,18 @@ func (b *blockDec) decodeBuf(hist *history) error { } } -// decodeCompressed will start decompressing a block. -// If no history is supplied the decoder will decodeAsync as much as possible -// before fetching from blockDec.history -func (b *blockDec) decodeCompressed(hist *history) error { - in := b.data - delayedHistory := hist == nil - - if delayedHistory { - // We must always grab history. - defer func() { - if hist == nil { - <-b.history - } - }() - } +func (b *blockDec) decodeLiterals(in []byte, hist *history) (remain []byte, err error) { // There must be at least one byte for Literals_Block_Type and one for Sequences_Section_Header if len(in) < 2 { - return ErrBlockTooSmall + return in, ErrBlockTooSmall } + litType := literalsBlockType(in[0] & 3) var litRegenSize int var litCompSize int sizeFormat := (in[0] >> 2) & 3 var fourStreams bool + var literals []byte switch litType { case literalsBlockRaw, literalsBlockRLE: switch sizeFormat { @@ -350,7 +300,7 @@ func (b *blockDec) decodeCompressed(hist *history) error { // Regenerated_Size uses 20 bits (0-1048575). Literals_Section_Header uses 3 bytes. if len(in) < 3 { println("too small: litType:", litType, " sizeFormat", sizeFormat, len(in)) - return ErrBlockTooSmall + return in, ErrBlockTooSmall } litRegenSize = int(in[0]>>4) + (int(in[1]) << 4) + (int(in[2]) << 12) in = in[3:] @@ -361,7 +311,7 @@ func (b *blockDec) decodeCompressed(hist *history) error { // Both Regenerated_Size and Compressed_Size use 10 bits (0-1023). 
if len(in) < 3 { println("too small: litType:", litType, " sizeFormat", sizeFormat, len(in)) - return ErrBlockTooSmall + return in, ErrBlockTooSmall } n := uint64(in[0]>>4) + (uint64(in[1]) << 4) + (uint64(in[2]) << 12) litRegenSize = int(n & 1023) @@ -372,7 +322,7 @@ func (b *blockDec) decodeCompressed(hist *history) error { fourStreams = true if len(in) < 4 { println("too small: litType:", litType, " sizeFormat", sizeFormat, len(in)) - return ErrBlockTooSmall + return in, ErrBlockTooSmall } n := uint64(in[0]>>4) + (uint64(in[1]) << 4) + (uint64(in[2]) << 12) + (uint64(in[3]) << 20) litRegenSize = int(n & 16383) @@ -382,7 +332,7 @@ func (b *blockDec) decodeCompressed(hist *history) error { fourStreams = true if len(in) < 5 { println("too small: litType:", litType, " sizeFormat", sizeFormat, len(in)) - return ErrBlockTooSmall + return in, ErrBlockTooSmall } n := uint64(in[0]>>4) + (uint64(in[1]) << 4) + (uint64(in[2]) << 12) + (uint64(in[3]) << 20) + (uint64(in[4]) << 28) litRegenSize = int(n & 262143) @@ -393,13 +343,15 @@ func (b *blockDec) decodeCompressed(hist *history) error { if debugDecoder { println("literals type:", litType, "litRegenSize:", litRegenSize, "litCompSize:", litCompSize, "sizeFormat:", sizeFormat, "4X:", fourStreams) } - var literals []byte - var huff *huff0.Scratch + if litRegenSize > int(b.WindowSize) || litRegenSize > maxCompressedBlockSize { + return in, ErrWindowSizeExceeded + } + switch litType { case literalsBlockRaw: if len(in) < litRegenSize { println("too small: litType:", litType, " sizeFormat", sizeFormat, "remain:", len(in), "want:", litRegenSize) - return ErrBlockTooSmall + return in, ErrBlockTooSmall } literals = in[:litRegenSize] in = in[litRegenSize:] @@ -407,19 +359,13 @@ func (b *blockDec) decodeCompressed(hist *history) error { case literalsBlockRLE: if len(in) < 1 { println("too small: litType:", litType, " sizeFormat", sizeFormat, "remain:", len(in), "want:", 1) - return ErrBlockTooSmall + return in, ErrBlockTooSmall } if cap(b.literalBuf) < litRegenSize { if b.lowMem { - b.literalBuf = make([]byte, litRegenSize) + b.literalBuf = make([]byte, litRegenSize, litRegenSize+compressedBlockOverAlloc) } else { - if litRegenSize > maxCompressedLiteralSize { - // Exceptional - b.literalBuf = make([]byte, litRegenSize) - } else { - b.literalBuf = make([]byte, litRegenSize, maxCompressedLiteralSize) - - } + b.literalBuf = make([]byte, litRegenSize, maxCompressedBlockSize+compressedBlockOverAlloc) } } literals = b.literalBuf[:litRegenSize] @@ -434,7 +380,7 @@ func (b *blockDec) decodeCompressed(hist *history) error { case literalsBlockTreeless: if len(in) < litCompSize { println("too small: litType:", litType, " sizeFormat", sizeFormat, "remain:", len(in), "want:", litCompSize) - return ErrBlockTooSmall + return in, ErrBlockTooSmall } // Store compressed literals, so we defer decoding until we get history. literals = in[:litCompSize] @@ -442,31 +388,65 @@ func (b *blockDec) decodeCompressed(hist *history) error { if debugDecoder { printf("Found %d compressed literals\n", litCompSize) } + huff := hist.huffTree + if huff == nil { + return in, errors.New("literal block was treeless, but no history was defined") + } + // Ensure we have space to store it. + if cap(b.literalBuf) < litRegenSize { + if b.lowMem { + b.literalBuf = make([]byte, 0, litRegenSize+compressedBlockOverAlloc) + } else { + b.literalBuf = make([]byte, 0, maxCompressedBlockSize+compressedBlockOverAlloc) + } + } + var err error + // Use our out buffer. 
+ huff.MaxDecodedSize = litRegenSize + if fourStreams { + literals, err = huff.Decoder().Decompress4X(b.literalBuf[:0:litRegenSize], literals) + } else { + literals, err = huff.Decoder().Decompress1X(b.literalBuf[:0:litRegenSize], literals) + } + // Make sure we don't leak our literals buffer + if err != nil { + println("decompressing literals:", err) + return in, err + } + if len(literals) != litRegenSize { + return in, fmt.Errorf("literal output size mismatch want %d, got %d", litRegenSize, len(literals)) + } + case literalsBlockCompressed: if len(in) < litCompSize { println("too small: litType:", litType, " sizeFormat", sizeFormat, "remain:", len(in), "want:", litCompSize) - return ErrBlockTooSmall + return in, ErrBlockTooSmall } literals = in[:litCompSize] in = in[litCompSize:] - huff = huffDecoderPool.Get().(*huff0.Scratch) - var err error // Ensure we have space to store it. if cap(b.literalBuf) < litRegenSize { if b.lowMem { - b.literalBuf = make([]byte, 0, litRegenSize) + b.literalBuf = make([]byte, 0, litRegenSize+compressedBlockOverAlloc) } else { - b.literalBuf = make([]byte, 0, maxCompressedLiteralSize) + b.literalBuf = make([]byte, 0, maxCompressedBlockSize+compressedBlockOverAlloc) } } - if huff == nil { - huff = &huff0.Scratch{} + huff := hist.huffTree + if huff == nil || (hist.dict != nil && huff == hist.dict.litEnc) { + huff = huffDecoderPool.Get().(*huff0.Scratch) + if huff == nil { + huff = &huff0.Scratch{} + } } + var err error huff, literals, err = huff0.ReadTable(literals, huff) if err != nil { println("reading huffman table:", err) - return err + return in, err } + hist.huffTree = huff + huff.MaxDecodedSize = litRegenSize // Use our out buffer. if fourStreams { literals, err = huff.Decoder().Decompress4X(b.literalBuf[:0:litRegenSize], literals) @@ -475,27 +455,63 @@ func (b *blockDec) decodeCompressed(hist *history) error { } if err != nil { println("decoding compressed literals:", err) - return err + return in, err } // Make sure we don't leak our literals buffer if len(literals) != litRegenSize { - return fmt.Errorf("literal output size mismatch want %d, got %d", litRegenSize, len(literals)) + return in, fmt.Errorf("literal output size mismatch want %d, got %d", litRegenSize, len(literals)) } + // Re-cap to get extra size. + literals = b.literalBuf[:len(literals)] if debugDecoder { printf("Decompressed %d literals into %d bytes\n", litCompSize, litRegenSize) } } + hist.decoders.literals = literals + return in, nil +} + +// decodeCompressed will start decompressing a block. +func (b *blockDec) decodeCompressed(hist *history) error { + in := b.data + in, err := b.decodeLiterals(in, hist) + if err != nil { + return err + } + err = b.prepareSequences(in, hist) + if err != nil { + return err + } + if hist.decoders.nSeqs == 0 { + b.dst = append(b.dst, hist.decoders.literals...) 
+ return nil + } + before := len(hist.decoders.out) + err = hist.decoders.decodeSync(hist.b[hist.ignoreBuffer:]) + if err != nil { + return err + } + if hist.decoders.maxSyncLen > 0 { + hist.decoders.maxSyncLen += uint64(before) + hist.decoders.maxSyncLen -= uint64(len(hist.decoders.out)) + } + b.dst = hist.decoders.out + hist.recentOffsets = hist.decoders.prevOffset + return nil +} +func (b *blockDec) prepareSequences(in []byte, hist *history) (err error) { + if debugDecoder { + printf("prepareSequences: %d byte(s) input\n", len(in)) + } // Decode Sequences // https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#sequences-section if len(in) < 1 { return ErrBlockTooSmall } + var nSeqs int seqHeader := in[0] - nSeqs := 0 switch { - case seqHeader == 0: - in = in[1:] case seqHeader < 128: nSeqs = int(seqHeader) in = in[1:] @@ -512,19 +528,16 @@ func (b *blockDec) decodeCompressed(hist *history) error { nSeqs = 0x7f00 + int(in[1]) + (int(in[2]) << 8) in = in[3:] } - // Allocate sequences - if cap(b.sequenceBuf) < nSeqs { - if b.lowMem { - b.sequenceBuf = make([]seq, nSeqs) - } else { - // Allocate max - b.sequenceBuf = make([]seq, nSeqs, maxSequences) + if nSeqs == 0 && len(in) != 0 { + // When no sequences, there should not be any more data... + if debugDecoder { + printf("prepareSequences: 0 sequences, but %d byte(s) left on stream\n", len(in)) } - } else { - // Reuse buffer - b.sequenceBuf = b.sequenceBuf[:nSeqs] + return ErrUnexpectedBlockSize } - var seqs = &sequenceDecs{} + + var seqs = &hist.decoders + seqs.nSeqs = nSeqs if nSeqs > 0 { if len(in) < 1 { return ErrBlockTooSmall @@ -553,6 +566,9 @@ func (b *blockDec) decodeCompressed(hist *history) error { } switch mode { case compModePredefined: + if seq.fse != nil && !seq.fse.preDefined { + fseDecoderPool.Put(seq.fse) + } seq.fse = &fsePredef[i] case compModeRLE: if br.remain() < 1 { @@ -560,34 +576,36 @@ func (b *blockDec) decodeCompressed(hist *history) error { } v := br.Uint8() br.advance(1) - dec := fseDecoderPool.Get().(*fseDecoder) + if seq.fse == nil || seq.fse.preDefined { + seq.fse = fseDecoderPool.Get().(*fseDecoder) + } symb, err := decSymbolValue(v, symbolTableX[i]) if err != nil { printf("RLE Transform table (%v) error: %v", tableIndex(i), err) return err } - dec.setRLE(symb) - seq.fse = dec + seq.fse.setRLE(symb) if debugDecoder { printf("RLE set to %+v, code: %v", symb, v) } case compModeFSE: println("Reading table for", tableIndex(i)) - dec := fseDecoderPool.Get().(*fseDecoder) - err := dec.readNCount(&br, uint16(maxTableSymbol[i])) + if seq.fse == nil || seq.fse.preDefined { + seq.fse = fseDecoderPool.Get().(*fseDecoder) + } + err := seq.fse.readNCount(&br, uint16(maxTableSymbol[i])) if err != nil { println("Read table error:", err) return err } - err = dec.transform(symbolTableX[i]) + err = seq.fse.transform(symbolTableX[i]) if err != nil { println("Transform table error:", err) return err } if debugDecoder { - println("Read table ok", "symbolLen:", dec.symbolLen) + println("Read table ok", "symbolLen:", seq.fse.symbolLen) } - seq.fse = dec case compModeRepeat: seq.repeat = true } @@ -597,140 +615,106 @@ func (b *blockDec) decodeCompressed(hist *history) error { } in = br.unread() } - - // Wait for history. - // All time spent after this is critical since it is strictly sequential. - if hist == nil { - hist = <-b.history - if hist.error { - return ErrDecoderClosed - } - } - - // Decode treeless literal block. 
- if litType == literalsBlockTreeless { - // TODO: We could send the history early WITHOUT the stream history. - // This would allow decoding treeless literals before the byte history is available. - // Silencia stats: Treeless 4393, with: 32775, total: 37168, 11% treeless. - // So not much obvious gain here. - - if hist.huffTree == nil { - return errors.New("literal block was treeless, but no history was defined") - } - // Ensure we have space to store it. - if cap(b.literalBuf) < litRegenSize { - if b.lowMem { - b.literalBuf = make([]byte, 0, litRegenSize) - } else { - b.literalBuf = make([]byte, 0, maxCompressedLiteralSize) - } - } - var err error - // Use our out buffer. - huff = hist.huffTree - if fourStreams { - literals, err = huff.Decoder().Decompress4X(b.literalBuf[:0:litRegenSize], literals) - } else { - literals, err = huff.Decoder().Decompress1X(b.literalBuf[:0:litRegenSize], literals) - } - // Make sure we don't leak our literals buffer - if err != nil { - println("decompressing literals:", err) - return err - } - if len(literals) != litRegenSize { - return fmt.Errorf("literal output size mismatch want %d, got %d", litRegenSize, len(literals)) - } - } else { - if hist.huffTree != nil && huff != nil { - if hist.dict == nil || hist.dict.litEnc != hist.huffTree { - huffDecoderPool.Put(hist.huffTree) - } - hist.huffTree = nil - } - } - if huff != nil { - hist.huffTree = huff - } if debugDecoder { - println("Final literals:", len(literals), "hash:", xxhash.Sum64(literals), "and", nSeqs, "sequences.") + println("Literals:", len(seqs.literals), "hash:", xxhash.Sum64(seqs.literals), "and", seqs.nSeqs, "sequences.") } if nSeqs == 0 { - // Decompressed content is defined entirely as Literals Section content. - b.dst = append(b.dst, literals...) - if delayedHistory { - hist.append(literals) + if len(b.sequence) > 0 { + b.sequence = b.sequence[:0] } return nil } + br := seqs.br + if br == nil { + br = &bitReader{} + } + if err := br.init(in); err != nil { + return err + } - seqs, err := seqs.mergeHistory(&hist.decoders) - if err != nil { + if err := seqs.initialize(br, hist, b.dst); err != nil { + println("initializing sequences:", err) return err } - if debugDecoder { - println("History merged ok") + // Extract blocks... 
+ if false && hist.dict == nil { + fatalErr := func(err error) { + if err != nil { + panic(err) + } + } + fn := fmt.Sprintf("n-%d-lits-%d-prev-%d-%d-%d-win-%d.blk", hist.decoders.nSeqs, len(hist.decoders.literals), hist.recentOffsets[0], hist.recentOffsets[1], hist.recentOffsets[2], hist.windowSize) + var buf bytes.Buffer + fatalErr(binary.Write(&buf, binary.LittleEndian, hist.decoders.litLengths.fse)) + fatalErr(binary.Write(&buf, binary.LittleEndian, hist.decoders.matchLengths.fse)) + fatalErr(binary.Write(&buf, binary.LittleEndian, hist.decoders.offsets.fse)) + buf.Write(in) + os.WriteFile(filepath.Join("testdata", "seqs", fn), buf.Bytes(), os.ModePerm) } - br := &bitReader{} - if err := br.init(in); err != nil { - return err + + return nil +} + +func (b *blockDec) decodeSequences(hist *history) error { + if cap(b.sequence) < hist.decoders.nSeqs { + if b.lowMem { + b.sequence = make([]seqVals, 0, hist.decoders.nSeqs) + } else { + b.sequence = make([]seqVals, 0, 0x7F00+0xffff) + } } + b.sequence = b.sequence[:hist.decoders.nSeqs] + if hist.decoders.nSeqs == 0 { + hist.decoders.seqSize = len(hist.decoders.literals) + return nil + } + hist.decoders.windowSize = hist.windowSize + hist.decoders.prevOffset = hist.recentOffsets - // TODO: Investigate if sending history without decoders are faster. - // This would allow the sequences to be decoded async and only have to construct stream history. - // If only recent offsets were not transferred, this would be an obvious win. - // Also, if first 3 sequences don't reference recent offsets, all sequences can be decoded. + err := hist.decoders.decode(b.sequence) + hist.recentOffsets = hist.decoders.prevOffset + return err +} +func (b *blockDec) executeSequences(hist *history) error { hbytes := hist.b if len(hbytes) > hist.windowSize { hbytes = hbytes[len(hbytes)-hist.windowSize:] - // We do not need history any more. + // We do not need history anymore. if hist.dict != nil { hist.dict.content = nil } } - - if err := seqs.initialize(br, hist, literals, b.dst); err != nil { - println("initializing sequences:", err) - return err - } - - err = seqs.decode(nSeqs, br, hbytes) + hist.decoders.windowSize = hist.windowSize + hist.decoders.out = b.dst[:0] + err := hist.decoders.execute(b.sequence, hbytes) if err != nil { return err } - if !br.finished() { - return fmt.Errorf("%d extra bits on block, should be 0", br.remain()) - } + return b.updateHistory(hist) +} - err = br.close() - if err != nil { - printf("Closing sequences: %v, %+v\n", err, *br) - } +func (b *blockDec) updateHistory(hist *history) error { if len(b.data) > maxCompressedBlockSize { return fmt.Errorf("compressed block size too large (%d)", len(b.data)) } // Set output and release references. - b.dst = seqs.out - seqs.out, seqs.literals, seqs.hist = nil, nil, nil + b.dst = hist.decoders.out + hist.recentOffsets = hist.decoders.prevOffset - if !delayedHistory { - // If we don't have delayed history, no need to update. - hist.recentOffsets = seqs.prevOffset - return nil - } if b.Last { // if last block we don't care about history. println("Last block, no history returned") hist.b = hist.b[:0] return nil + } else { + hist.append(b.dst) + if debugDecoder { + println("Finished block with ", len(b.sequence), "sequences. 
Added", len(b.dst), "to history, now length", len(hist.b)) + } } - hist.append(b.dst) - hist.recentOffsets = seqs.prevOffset - if debugDecoder { - println("Finished block with literals:", len(literals), "and", nSeqs, "sequences.") - } + hist.decoders.out, hist.decoders.literals = nil, nil return nil } diff --git a/vendor/github.com/klauspost/compress/zstd/bytebuf.go b/vendor/github.com/klauspost/compress/zstd/bytebuf.go index aab71c6cf851..176788f25976 100644 --- a/vendor/github.com/klauspost/compress/zstd/bytebuf.go +++ b/vendor/github.com/klauspost/compress/zstd/bytebuf.go @@ -7,7 +7,6 @@ package zstd import ( "fmt" "io" - "io/ioutil" ) type byteBuffer interface { @@ -23,7 +22,7 @@ type byteBuffer interface { readByte() (byte, error) // Skip n bytes. - skipN(n int) error + skipN(n int64) error } // in-memory buffer @@ -52,10 +51,6 @@ func (b *byteBuf) readBig(n int, dst []byte) ([]byte, error) { return r, nil } -func (b *byteBuf) remain() []byte { - return *b -} - func (b *byteBuf) readByte() (byte, error) { bb := *b if len(bb) < 1 { @@ -66,9 +61,12 @@ func (b *byteBuf) readByte() (byte, error) { return r, nil } -func (b *byteBuf) skipN(n int) error { +func (b *byteBuf) skipN(n int64) error { bb := *b - if len(bb) < n { + if n < 0 { + return fmt.Errorf("negative skip (%d) requested", n) + } + if int64(len(bb)) < n { return io.ErrUnexpectedEOF } *b = bb[n:] @@ -113,6 +111,9 @@ func (r *readerWrapper) readBig(n int, dst []byte) ([]byte, error) { func (r *readerWrapper) readByte() (byte, error) { n2, err := r.r.Read(r.tmp[:1]) if err != nil { + if err == io.EOF { + err = io.ErrUnexpectedEOF + } return 0, err } if n2 != 1 { @@ -121,9 +122,9 @@ func (r *readerWrapper) readByte() (byte, error) { return r.tmp[0], nil } -func (r *readerWrapper) skipN(n int) error { - n2, err := io.CopyN(ioutil.Discard, r.r, int64(n)) - if n2 != int64(n) { +func (r *readerWrapper) skipN(n int64) error { + n2, err := io.CopyN(io.Discard, r.r, n) + if n2 != n { err = io.ErrUnexpectedEOF } return err diff --git a/vendor/github.com/klauspost/compress/zstd/bytereader.go b/vendor/github.com/klauspost/compress/zstd/bytereader.go index 2c4fca17fa1d..0e59a242d8dc 100644 --- a/vendor/github.com/klauspost/compress/zstd/bytereader.go +++ b/vendor/github.com/klauspost/compress/zstd/bytereader.go @@ -13,12 +13,6 @@ type byteReader struct { off int } -// init will initialize the reader and set the input. -func (b *byteReader) init(in []byte) { - b.b = in - b.off = 0 -} - // advance the stream b n bytes. func (b *byteReader) advance(n uint) { b.off += int(n) diff --git a/vendor/github.com/klauspost/compress/zstd/decodeheader.go b/vendor/github.com/klauspost/compress/zstd/decodeheader.go index 69736e8d4bb8..5022e71c8363 100644 --- a/vendor/github.com/klauspost/compress/zstd/decodeheader.go +++ b/vendor/github.com/klauspost/compress/zstd/decodeheader.go @@ -5,6 +5,7 @@ package zstd import ( "bytes" + "encoding/binary" "errors" "io" ) @@ -15,18 +16,50 @@ const HeaderMaxSize = 14 + 3 // Header contains information about the first frame and block within that. type Header struct { - // Window Size the window of data to keep while decoding. - // Will only be set if HasFCS is false. - WindowSize uint64 + // SingleSegment specifies whether the data is to be decompressed into a + // single contiguous memory segment. + // It implies that WindowSize is invalid and that FrameContentSize is valid. + SingleSegment bool - // Frame content size. - // Expected size of the entire frame. 
- FrameContentSize uint64 + // WindowSize is the window of data to keep while decoding. + // Will only be set if SingleSegment is false. + WindowSize uint64 // Dictionary ID. // If 0, no dictionary. DictionaryID uint32 + // HasFCS specifies whether FrameContentSize has a valid value. + HasFCS bool + + // FrameContentSize is the expected uncompressed size of the entire frame. + FrameContentSize uint64 + + // Skippable will be true if the frame is meant to be skipped. + // This implies that FirstBlock.OK is false. + Skippable bool + + // SkippableID is the user-specific ID for the skippable frame. + // Valid values are between 0 to 15, inclusive. + SkippableID int + + // SkippableSize is the length of the user data to skip following + // the header. + SkippableSize uint32 + + // HeaderSize is the raw size of the frame header. + // + // For normal frames, it includes the size of the magic number and + // the size of the header (per section 3.1.1.1). + // It does not include the size for any data blocks (section 3.1.1.2) nor + // the size for the trailing content checksum. + // + // For skippable frames, this counts the size of the magic number + // along with the size of the size field of the payload. + // It does not include the size of the skippable payload itself. + // The total frame size is the HeaderSize plus the SkippableSize. + HeaderSize int + // First block information. FirstBlock struct { // OK will be set if first block could be decoded. @@ -51,17 +84,9 @@ type Header struct { CompressedSize int } - // Skippable will be true if the frame is meant to be skipped. - // No other information will be populated. - Skippable bool - // If set there is a checksum present for the block content. + // The checksum field at the end is always 4 bytes long. HasCheckSum bool - - // If this is true FrameContentSize will have a valid value - HasFCS bool - - SingleSegment bool } // Decode the header from the beginning of the stream. @@ -71,39 +96,46 @@ type Header struct { // If there isn't enough input, io.ErrUnexpectedEOF is returned. // The FirstBlock.OK will indicate if enough information was available to decode the first block header. 
func (h *Header) Decode(in []byte) error { + *h = Header{} if len(in) < 4 { return io.ErrUnexpectedEOF } + h.HeaderSize += 4 b, in := in[:4], in[4:] if !bytes.Equal(b, frameMagic) { if !bytes.Equal(b[1:4], skippableFrameMagic) || b[0]&0xf0 != 0x50 { return ErrMagicMismatch } - *h = Header{Skippable: true} + if len(in) < 4 { + return io.ErrUnexpectedEOF + } + h.HeaderSize += 4 + h.Skippable = true + h.SkippableID = int(b[0] & 0xf) + h.SkippableSize = binary.LittleEndian.Uint32(in) return nil } + + // Read Window_Descriptor + // https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#window_descriptor if len(in) < 1 { return io.ErrUnexpectedEOF } - - // Clear output - *h = Header{} fhd, in := in[0], in[1:] + h.HeaderSize++ h.SingleSegment = fhd&(1<<5) != 0 h.HasCheckSum = fhd&(1<<2) != 0 - if fhd&(1<<3) != 0 { return errors.New("reserved bit set on frame header") } - // Read Window_Descriptor - // https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#window_descriptor if !h.SingleSegment { if len(in) < 1 { return io.ErrUnexpectedEOF } var wd byte wd, in = in[0], in[1:] + h.HeaderSize++ windowLog := 10 + (wd >> 3) windowBase := uint64(1) << windowLog windowAdd := (windowBase / 8) * uint64(wd&0x7) @@ -120,9 +152,7 @@ func (h *Header) Decode(in []byte) error { return io.ErrUnexpectedEOF } b, in = in[:size], in[size:] - if b == nil { - return io.ErrUnexpectedEOF - } + h.HeaderSize += int(size) switch size { case 1: h.DictionaryID = uint32(b[0]) @@ -152,9 +182,7 @@ func (h *Header) Decode(in []byte) error { return io.ErrUnexpectedEOF } b, in = in[:fcsSize], in[fcsSize:] - if b == nil { - return io.ErrUnexpectedEOF - } + h.HeaderSize += int(fcsSize) switch fcsSize { case 1: h.FrameContentSize = uint64(b[0]) diff --git a/vendor/github.com/klauspost/compress/zstd/decoder.go b/vendor/github.com/klauspost/compress/zstd/decoder.go index f430f58b5726..78c10755f88a 100644 --- a/vendor/github.com/klauspost/compress/zstd/decoder.go +++ b/vendor/github.com/klauspost/compress/zstd/decoder.go @@ -5,9 +5,13 @@ package zstd import ( - "errors" + "bytes" + "context" + "encoding/binary" "io" "sync" + + "github.com/klauspost/compress/zstd/internal/xxhash" ) // Decoder provides decoding of zstandard streams. @@ -22,12 +26,20 @@ type Decoder struct { // Unreferenced decoders, ready for use. decoders chan *blockDec - // Streams ready to be decoded. - stream chan decodeStream - // Current read position used for Reader functionality. current decoderState + // sync stream decoding + syncStream struct { + decodedFrame uint64 + br readerWrapper + enabled bool + inFrame bool + dstBuf []byte + } + + frame *frameDec + // Custom dictionaries. // Always uses copies. dicts map[uint32]dict @@ -46,7 +58,10 @@ type decoderState struct { output chan decodeOutput // cancel remaining output. 
- cancel chan struct{} + cancel context.CancelFunc + + // crc of current frame + crc *xxhash.Digest flushed bool } @@ -81,7 +96,7 @@ func NewReader(r io.Reader, opts ...DOption) (*Decoder, error) { return nil, err } } - d.current.output = make(chan decodeOutput, d.o.concurrent) + d.current.crc = xxhash.New() d.current.flushed = true if r == nil { @@ -130,7 +145,7 @@ func (d *Decoder) Read(p []byte) (int, error) { break } if !d.nextBlock(n == 0) { - return n, nil + return n, d.current.err } } } @@ -162,6 +177,7 @@ func (d *Decoder) Reset(r io.Reader) error { d.drainOutput() + d.syncStream.br.r = nil if r == nil { d.current.err = ErrDecoderNilInput if len(d.current.b) > 0 { @@ -172,21 +188,23 @@ func (d *Decoder) Reset(r io.Reader) error { } // If bytes buffer and < 5MB, do sync decoding anyway. - if bb, ok := r.(byter); ok && bb.Len() < 5<<20 { + if bb, ok := r.(byter); ok && bb.Len() < d.o.decodeBufsBelow && !d.o.limitToCap { bb2 := bb if debugDecoder { println("*bytes.Buffer detected, doing sync decode, len:", bb.Len()) } b := bb2.Bytes() var dst []byte - if cap(d.current.b) > 0 { - dst = d.current.b + if cap(d.syncStream.dstBuf) > 0 { + dst = d.syncStream.dstBuf[:0] } - dst, err := d.DecodeAll(b, dst[:0]) + dst, err := d.DecodeAll(b, dst) if err == nil { err = io.EOF } + // Save output buffer + d.syncStream.dstBuf = dst d.current.b = dst d.current.err = err d.current.flushed = true @@ -195,33 +213,40 @@ func (d *Decoder) Reset(r io.Reader) error { } return nil } - - if d.stream == nil { - d.stream = make(chan decodeStream, 1) - d.streamWg.Add(1) - go d.startStreamDecoder(d.stream) - } - // Remove current block. + d.stashDecoder() d.current.decodeOutput = decodeOutput{} d.current.err = nil - d.current.cancel = make(chan struct{}) d.current.flushed = false d.current.d = nil + d.syncStream.dstBuf = nil - d.stream <- decodeStream{ - r: r, - output: d.current.output, - cancel: d.current.cancel, + // Ensure no-one else is still running... + d.streamWg.Wait() + if d.frame == nil { + d.frame = newFrameDec(d.o) } + + if d.o.concurrent == 1 { + return d.startSyncDecoder(r) + } + + d.current.output = make(chan decodeOutput, d.o.concurrent) + ctx, cancel := context.WithCancel(context.Background()) + d.current.cancel = cancel + d.streamWg.Add(1) + go d.startStreamDecoder(ctx, r, d.current.output) + return nil } // drainOutput will drain the output until errEndOfStream is sent. func (d *Decoder) drainOutput() { if d.current.cancel != nil { - println("cancelling current") - close(d.current.cancel) + if debugDecoder { + println("cancelling current") + } + d.current.cancel() d.current.cancel = nil } if d.current.d != nil { @@ -243,12 +268,9 @@ func (d *Decoder) drainOutput() { } d.decoders <- v.d } - if v.err == errEndOfStream { - println("current flushed") - d.current.flushed = true - return - } } + d.current.output = nil + d.current.flushed = true } // WriteTo writes data to w until there's no more data to write or when an error occurs. @@ -287,19 +309,23 @@ func (d *Decoder) WriteTo(w io.Writer) (int64, error) { // DecodeAll can be used concurrently. // The Decoder concurrency limits will be respected. func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) { - if d.current.err == ErrDecoderClosed { + if d.decoders == nil { return dst, ErrDecoderClosed } // Grab a block decoder and frame decoder. 
block := <-d.decoders frame := block.localFrame + initialSize := len(dst) defer func() { if debugDecoder { printf("re-adding decoder: %p", block) } frame.rawInput = nil frame.bBuf = nil + if frame.history.decoders.br != nil { + frame.history.decoders.br.in = nil + } d.decoders <- block }() frame.bBuf = input @@ -307,34 +333,52 @@ func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) { for { frame.history.reset() err := frame.reset(&frame.bBuf) - if err == io.EOF { - if debugDecoder { - println("frame reset return EOF") + if err != nil { + if err == io.EOF { + if debugDecoder { + println("frame reset return EOF") + } + return dst, nil } - return dst, nil + return dst, err } if frame.DictionaryID != nil { dict, ok := d.dicts[*frame.DictionaryID] if !ok { return nil, ErrUnknownDictionary } + if debugDecoder { + println("setting dict", frame.DictionaryID) + } frame.history.setDict(&dict) } - if err != nil { - return dst, err - } - if frame.FrameContentSize > d.o.maxDecodedSize-uint64(len(dst)) { - return dst, ErrDecoderSizeExceeded + if frame.WindowSize > d.o.maxWindowSize { + if debugDecoder { + println("window size exceeded:", frame.WindowSize, ">", d.o.maxWindowSize) + } + return dst, ErrWindowSizeExceeded } - if frame.FrameContentSize > 0 && frame.FrameContentSize < 1<<30 { - // Never preallocate moe than 1 GB up front. + if frame.FrameContentSize != fcsUnknown { + if frame.FrameContentSize > d.o.maxDecodedSize-uint64(len(dst)-initialSize) { + if debugDecoder { + println("decoder size exceeded; fcs:", frame.FrameContentSize, "> mcs:", d.o.maxDecodedSize-uint64(len(dst)-initialSize), "len:", len(dst)) + } + return dst, ErrDecoderSizeExceeded + } + if d.o.limitToCap && frame.FrameContentSize > uint64(cap(dst)-len(dst)) { + if debugDecoder { + println("decoder size exceeded; fcs:", frame.FrameContentSize, "> (cap-len)", cap(dst)-len(dst)) + } + return dst, ErrDecoderSizeExceeded + } if cap(dst)-len(dst) < int(frame.FrameContentSize) { - dst2 := make([]byte, len(dst), len(dst)+int(frame.FrameContentSize)) + dst2 := make([]byte, len(dst), len(dst)+int(frame.FrameContentSize)+compressedBlockOverAlloc) copy(dst2, dst) dst = dst2 } } - if cap(dst) == 0 { + + if cap(dst) == 0 && !d.o.limitToCap { // Allocate len(input) * 2 by default if nothing is provided // and we didn't get frame content size. size := len(input) * 2 @@ -352,6 +396,9 @@ func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) { if err != nil { return dst, err } + if uint64(len(dst)-initialSize) > d.o.maxDecodedSize { + return dst, ErrDecoderSizeExceeded + } if len(frame.bBuf) == 0 { if debugDecoder { println("frame dbuf empty") @@ -368,33 +415,176 @@ func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) { // If non-blocking mode is used the returned boolean will be false // if no data was available without blocking. func (d *Decoder) nextBlock(blocking bool) (ok bool) { - if d.current.d != nil { - if debugDecoder { - printf("re-adding current decoder %p", d.current.d) - } - d.decoders <- d.current.d - d.current.d = nil - } if d.current.err != nil { // Keep error state. 
- return blocking + return false + } + d.current.b = d.current.b[:0] + + // SYNC: + if d.syncStream.enabled { + if !blocking { + return false + } + ok = d.nextBlockSync() + if !ok { + d.stashDecoder() + } + return ok } + //ASYNC: + d.stashDecoder() if blocking { - d.current.decodeOutput = <-d.current.output + d.current.decodeOutput, ok = <-d.current.output } else { select { - case d.current.decodeOutput = <-d.current.output: + case d.current.decodeOutput, ok = <-d.current.output: default: return false } } + if !ok { + // This should not happen, so signal error state... + d.current.err = io.ErrUnexpectedEOF + return false + } + next := d.current.decodeOutput + if next.d != nil && next.d.async.newHist != nil { + d.current.crc.Reset() + } if debugDecoder { - println("got", len(d.current.b), "bytes, error:", d.current.err) + var tmp [4]byte + binary.LittleEndian.PutUint32(tmp[:], uint32(xxhash.Sum64(next.b))) + println("got", len(d.current.b), "bytes, error:", d.current.err, "data crc:", tmp) } + + if !d.o.ignoreChecksum && len(next.b) > 0 { + n, err := d.current.crc.Write(next.b) + if err == nil { + if n != len(next.b) { + d.current.err = io.ErrShortWrite + } + } + } + if next.err == nil && next.d != nil && len(next.d.checkCRC) != 0 { + got := d.current.crc.Sum64() + var tmp [4]byte + binary.LittleEndian.PutUint32(tmp[:], uint32(got)) + if !d.o.ignoreChecksum && !bytes.Equal(tmp[:], next.d.checkCRC) { + if debugDecoder { + println("CRC Check Failed:", tmp[:], " (got) !=", next.d.checkCRC, "(on stream)") + } + d.current.err = ErrCRCMismatch + } else { + if debugDecoder { + println("CRC ok", tmp[:]) + } + } + } + return true } +func (d *Decoder) nextBlockSync() (ok bool) { + if d.current.d == nil { + d.current.d = <-d.decoders + } + for len(d.current.b) == 0 { + if !d.syncStream.inFrame { + d.frame.history.reset() + d.current.err = d.frame.reset(&d.syncStream.br) + if d.current.err != nil { + return false + } + if d.frame.DictionaryID != nil { + dict, ok := d.dicts[*d.frame.DictionaryID] + if !ok { + d.current.err = ErrUnknownDictionary + return false + } else { + d.frame.history.setDict(&dict) + } + } + if d.frame.WindowSize > d.o.maxDecodedSize || d.frame.WindowSize > d.o.maxWindowSize { + d.current.err = ErrDecoderSizeExceeded + return false + } + + d.syncStream.decodedFrame = 0 + d.syncStream.inFrame = true + } + d.current.err = d.frame.next(d.current.d) + if d.current.err != nil { + return false + } + d.frame.history.ensureBlock() + if debugDecoder { + println("History trimmed:", len(d.frame.history.b), "decoded already:", d.syncStream.decodedFrame) + } + histBefore := len(d.frame.history.b) + d.current.err = d.current.d.decodeBuf(&d.frame.history) + + if d.current.err != nil { + println("error after:", d.current.err) + return false + } + d.current.b = d.frame.history.b[histBefore:] + if debugDecoder { + println("history after:", len(d.frame.history.b)) + } + + // Check frame size (before CRC) + d.syncStream.decodedFrame += uint64(len(d.current.b)) + if d.syncStream.decodedFrame > d.frame.FrameContentSize { + if debugDecoder { + printf("DecodedFrame (%d) > FrameContentSize (%d)\n", d.syncStream.decodedFrame, d.frame.FrameContentSize) + } + d.current.err = ErrFrameSizeExceeded + return false + } + + // Check FCS + if d.current.d.Last && d.frame.FrameContentSize != fcsUnknown && d.syncStream.decodedFrame != d.frame.FrameContentSize { + if debugDecoder { + printf("DecodedFrame (%d) != FrameContentSize (%d)\n", d.syncStream.decodedFrame, d.frame.FrameContentSize) + } + d.current.err = 
ErrFrameSizeMismatch + return false + } + + // Update/Check CRC + if d.frame.HasCheckSum { + if !d.o.ignoreChecksum { + d.frame.crc.Write(d.current.b) + } + if d.current.d.Last { + if !d.o.ignoreChecksum { + d.current.err = d.frame.checkCRC() + } else { + d.current.err = d.frame.consumeCRC() + } + if d.current.err != nil { + println("CRC error:", d.current.err) + return false + } + } + } + d.syncStream.inFrame = !d.current.d.Last + } + return true +} + +func (d *Decoder) stashDecoder() { + if d.current.d != nil { + if debugDecoder { + printf("re-adding current decoder %p", d.current.d) + } + d.decoders <- d.current.d + d.current.d = nil + } +} + // Close will release all resources. // It is NOT possible to reuse the decoder after this. func (d *Decoder) Close() { @@ -402,10 +592,10 @@ func (d *Decoder) Close() { return } d.drainOutput() - if d.stream != nil { - close(d.stream) + if d.current.cancel != nil { + d.current.cancel() d.streamWg.Wait() - d.stream = nil + d.current.cancel = nil } if d.decoders != nil { close(d.decoders) @@ -456,100 +646,305 @@ type decodeOutput struct { err error } -type decodeStream struct { - r io.Reader - - // Blocks ready to be written to output. - output chan decodeOutput - - // cancel reading from the input - cancel chan struct{} +func (d *Decoder) startSyncDecoder(r io.Reader) error { + d.frame.history.reset() + d.syncStream.br = readerWrapper{r: r} + d.syncStream.inFrame = false + d.syncStream.enabled = true + d.syncStream.decodedFrame = 0 + return nil } -// errEndOfStream indicates that everything from the stream was read. -var errEndOfStream = errors.New("end-of-stream") - // Create Decoder: -// Spawn n block decoders. These accept tasks to decode a block. -// Create goroutine that handles stream processing, this will send history to decoders as they are available. -// Decoders update the history as they decode. -// When a block is returned: -// a) history is sent to the next decoder, -// b) content written to CRC. -// c) return data to WRITER. -// d) wait for next block to return data. -// Once WRITTEN, the decoders reused by the writer frame decoder for re-use. -func (d *Decoder) startStreamDecoder(inStream chan decodeStream) { +// ASYNC: +// Spawn 3 go routines. +// 0: Read frames and decode block literals. +// 1: Decode sequences. +// 2: Execute sequences, send to output. +func (d *Decoder) startStreamDecoder(ctx context.Context, r io.Reader, output chan decodeOutput) { defer d.streamWg.Done() - frame := newFrameDec(d.o) - for stream := range inStream { - if debugDecoder { - println("got new stream") + br := readerWrapper{r: r} + + var seqDecode = make(chan *blockDec, d.o.concurrent) + var seqExecute = make(chan *blockDec, d.o.concurrent) + + // Async 1: Decode sequences... 
+ go func() { + var hist history + var hasErr bool + + for block := range seqDecode { + if hasErr { + if block != nil { + seqExecute <- block + } + continue + } + if block.async.newHist != nil { + if debugDecoder { + println("Async 1: new history, recent:", block.async.newHist.recentOffsets) + } + hist.reset() + hist.decoders = block.async.newHist.decoders + hist.recentOffsets = block.async.newHist.recentOffsets + hist.windowSize = block.async.newHist.windowSize + if block.async.newHist.dict != nil { + hist.setDict(block.async.newHist.dict) + } + } + if block.err != nil || block.Type != blockTypeCompressed { + hasErr = block.err != nil + seqExecute <- block + continue + } + + hist.decoders.literals = block.async.literals + block.err = block.prepareSequences(block.async.seqData, &hist) + if debugDecoder && block.err != nil { + println("prepareSequences returned:", block.err) + } + hasErr = block.err != nil + if block.err == nil { + block.err = block.decodeSequences(&hist) + if debugDecoder && block.err != nil { + println("decodeSequences returned:", block.err) + } + hasErr = block.err != nil + // block.async.sequence = hist.decoders.seq[:hist.decoders.nSeqs] + block.async.seqSize = hist.decoders.seqSize + } + seqExecute <- block } - br := readerWrapper{r: stream.r} - decodeStream: - for { - frame.history.reset() - err := frame.reset(&br) - if debugDecoder && err != nil { - println("Frame decoder returned", err) + close(seqExecute) + hist.reset() + }() + + var wg sync.WaitGroup + wg.Add(1) + + // Async 3: Execute sequences... + frameHistCache := d.frame.history.b + go func() { + var hist history + var decodedFrame uint64 + var fcs uint64 + var hasErr bool + for block := range seqExecute { + out := decodeOutput{err: block.err, d: block} + if block.err != nil || hasErr { + hasErr = true + output <- out + continue } - if err == nil && frame.DictionaryID != nil { - dict, ok := d.dicts[*frame.DictionaryID] - if !ok { - err = ErrUnknownDictionary + if block.async.newHist != nil { + if debugDecoder { + println("Async 2: new history") + } + hist.reset() + hist.windowSize = block.async.newHist.windowSize + hist.allocFrameBuffer = block.async.newHist.allocFrameBuffer + if block.async.newHist.dict != nil { + hist.setDict(block.async.newHist.dict) + } + + if cap(hist.b) < hist.allocFrameBuffer { + if cap(frameHistCache) >= hist.allocFrameBuffer { + hist.b = frameHistCache + } else { + hist.b = make([]byte, 0, hist.allocFrameBuffer) + println("Alloc history sized", hist.allocFrameBuffer) + } + } + hist.b = hist.b[:0] + fcs = block.async.fcs + decodedFrame = 0 + } + do := decodeOutput{err: block.err, d: block} + switch block.Type { + case blockTypeRLE: + if debugDecoder { + println("add rle block length:", block.RLESize) + } + + if cap(block.dst) < int(block.RLESize) { + if block.lowMem { + block.dst = make([]byte, block.RLESize) + } else { + block.dst = make([]byte, maxBlockSize) + } + } + block.dst = block.dst[:block.RLESize] + v := block.data[0] + for i := range block.dst { + block.dst[i] = v + } + hist.append(block.dst) + do.b = block.dst + case blockTypeRaw: + if debugDecoder { + println("add raw block length:", len(block.data)) + } + hist.append(block.data) + do.b = block.data + case blockTypeCompressed: + if debugDecoder { + println("execute with history length:", len(hist.b), "window:", hist.windowSize) + } + hist.decoders.seqSize = block.async.seqSize + hist.decoders.literals = block.async.literals + do.err = block.executeSequences(&hist) + hasErr = do.err != nil + if debugDecoder && hasErr { + 
println("executeSequences returned:", do.err) + } + do.b = block.dst + } + if !hasErr { + decodedFrame += uint64(len(do.b)) + if decodedFrame > fcs { + println("fcs exceeded", block.Last, fcs, decodedFrame) + do.err = ErrFrameSizeExceeded + hasErr = true + } else if block.Last && fcs != fcsUnknown && decodedFrame != fcs { + do.err = ErrFrameSizeMismatch + hasErr = true } else { - frame.history.setDict(&dict) + if debugDecoder { + println("fcs ok", block.Last, fcs, decodedFrame) + } } } - if err != nil { - stream.output <- decodeOutput{ - err: err, + output <- do + } + close(output) + frameHistCache = hist.b + wg.Done() + if debugDecoder { + println("decoder goroutines finished") + } + hist.reset() + }() + + var hist history +decodeStream: + for { + var hasErr bool + hist.reset() + decodeBlock := func(block *blockDec) { + if hasErr { + if block != nil { + seqDecode <- block } - break + return + } + if block.err != nil || block.Type != blockTypeCompressed { + hasErr = block.err != nil + seqDecode <- block + return } + + remain, err := block.decodeLiterals(block.data, &hist) + block.err = err + hasErr = block.err != nil + if err == nil { + block.async.literals = hist.decoders.literals + block.async.seqData = remain + } else if debugDecoder { + println("decodeLiterals error:", err) + } + seqDecode <- block + } + frame := d.frame + if debugDecoder { + println("New frame...") + } + var historySent bool + frame.history.reset() + err := frame.reset(&br) + if debugDecoder && err != nil { + println("Frame decoder returned", err) + } + if err == nil && frame.DictionaryID != nil { + dict, ok := d.dicts[*frame.DictionaryID] + if !ok { + err = ErrUnknownDictionary + } else { + frame.history.setDict(&dict) + } + } + if err == nil && d.frame.WindowSize > d.o.maxWindowSize { if debugDecoder { - println("starting frame decoder") - } - - // This goroutine will forward history between frames. - frame.frameDone.Add(1) - frame.initAsync() - - go frame.startDecoder(stream.output) - decodeFrame: - // Go through all blocks of the frame. - for { - dec := <-d.decoders - select { - case <-stream.cancel: - if !frame.sendErr(dec, io.EOF) { - // To not let the decoder dangle, send it back. - stream.output <- decodeOutput{d: dec} - } - break decodeStream - default: + println("decoder size exceeded, fws:", d.frame.WindowSize, "> mws:", d.o.maxWindowSize) + } + + err = ErrDecoderSizeExceeded + } + if err != nil { + select { + case <-ctx.Done(): + case dec := <-d.decoders: + dec.sendErr(err) + decodeBlock(dec) + } + break decodeStream + } + + // Go through all blocks of the frame. + for { + var dec *blockDec + select { + case <-ctx.Done(): + break decodeStream + case dec = <-d.decoders: + // Once we have a decoder, we MUST return it. 
+ } + err := frame.next(dec) + if !historySent { + h := frame.history + if debugDecoder { + println("Alloc History:", h.allocFrameBuffer) + } + hist.reset() + if h.dict != nil { + hist.setDict(h.dict) + } + dec.async.newHist = &h + dec.async.fcs = frame.FrameContentSize + historySent = true + } else { + dec.async.newHist = nil + } + if debugDecoder && err != nil { + println("next block returned error:", err) + } + dec.err = err + dec.checkCRC = nil + if dec.Last && frame.HasCheckSum && err == nil { + crc, err := frame.rawInput.readSmall(4) + if err != nil { + println("CRC missing?", err) + dec.err = err } - err := frame.next(dec) - switch err { - case io.EOF: - // End of current frame, no error - println("EOF on next block") - break decodeFrame - case nil: - continue - default: - println("block decoder returned", err) - break decodeStream + var tmp [4]byte + copy(tmp[:], crc) + dec.checkCRC = tmp[:] + if debugDecoder { + println("found crc to check:", dec.checkCRC) } } - // All blocks have started decoding, check if there are more frames. - println("waiting for done") - frame.frameDone.Wait() - println("done waiting...") + err = dec.err + last := dec.Last + decodeBlock(dec) + if err != nil { + break decodeStream + } + if last { + break + } } - frame.frameDone.Wait() - println("Sending EOS") - stream.output <- decodeOutput{err: errEndOfStream} } + close(seqDecode) + wg.Wait() + hist.reset() + d.frame.history.b = frameHistCache } diff --git a/vendor/github.com/klauspost/compress/zstd/decoder_options.go b/vendor/github.com/klauspost/compress/zstd/decoder_options.go index 95cc9b8b81f2..f42448e69c95 100644 --- a/vendor/github.com/klauspost/compress/zstd/decoder_options.go +++ b/vendor/github.com/klauspost/compress/zstd/decoder_options.go @@ -14,21 +14,28 @@ type DOption func(*decoderOptions) error // options retains accumulated state of multiple options. type decoderOptions struct { - lowMem bool - concurrent int - maxDecodedSize uint64 - maxWindowSize uint64 - dicts []dict + lowMem bool + concurrent int + maxDecodedSize uint64 + maxWindowSize uint64 + dicts []dict + ignoreChecksum bool + limitToCap bool + decodeBufsBelow int } func (o *decoderOptions) setDefault() { *o = decoderOptions{ // use less ram: true for now, but may change. - lowMem: true, - concurrent: runtime.GOMAXPROCS(0), - maxWindowSize: MaxWindowSize, + lowMem: true, + concurrent: runtime.GOMAXPROCS(0), + maxWindowSize: MaxWindowSize, + decodeBufsBelow: 128 << 10, } - o.maxDecodedSize = 1 << 63 + if o.concurrent > 4 { + o.concurrent = 4 + } + o.maxDecodedSize = 64 << 30 } // WithDecoderLowmem will set whether to use a lower amount of memory, @@ -37,16 +44,25 @@ func WithDecoderLowmem(b bool) DOption { return func(o *decoderOptions) error { o.lowMem = b; return nil } } -// WithDecoderConcurrency will set the concurrency, -// meaning the maximum number of decoders to run concurrently. -// The value supplied must be at least 1. -// By default this will be set to GOMAXPROCS. +// WithDecoderConcurrency sets the number of created decoders. +// When decoding block with DecodeAll, this will limit the number +// of possible concurrently running decodes. +// When decoding streams, this will limit the number of +// inflight blocks. +// When decoding streams and setting maximum to 1, +// no async decoding will be done. +// When a value of 0 is provided GOMAXPROCS will be used. +// By default this will be set to 4 or GOMAXPROCS, whatever is lower. 
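Reviewer note: with this change the decoder picks the new synchronous stream path when concurrency is 1, and the default decoded-size cap drops from 1<<63 to 64 GiB. A hedged usage sketch of the options touched in this hunk (`WithDecoderConcurrency`, `WithDecoderMaxMemory`); the input file and the 1 GiB limit are illustrative only:

```go
package main

import (
	"io"
	"log"
	"os"

	"github.com/klauspost/compress/zstd"
)

func main() {
	f, err := os.Open("example.zst") // hypothetical input
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	// Concurrency 1 uses the synchronous stream decoder (no per-block
	// goroutines); the max-memory option bounds total decoded output.
	dec, err := zstd.NewReader(f,
		zstd.WithDecoderConcurrency(1),
		zstd.WithDecoderMaxMemory(1<<30), // refuse to decode more than 1 GiB
	)
	if err != nil {
		log.Fatal(err)
	}
	defer dec.Close()

	n, err := io.Copy(io.Discard, dec)
	if err != nil {
		log.Fatal(err)
	}
	log.Printf("decoded %d bytes", n)
}
```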
func WithDecoderConcurrency(n int) DOption { return func(o *decoderOptions) error { - if n <= 0 { + if n < 0 { return errors.New("concurrency must be at least 1") } - o.concurrent = n + if n == 0 { + o.concurrent = runtime.GOMAXPROCS(0) + } else { + o.concurrent = n + } return nil } } @@ -54,7 +70,7 @@ func WithDecoderConcurrency(n int) DOption { // WithDecoderMaxMemory allows to set a maximum decoded size for in-memory // non-streaming operations or maximum window size for streaming operations. // This can be used to control memory usage of potentially hostile content. -// Maximum and default is 1 << 63 bytes. +// Maximum is 1 << 63 bytes. Default is 64GiB. func WithDecoderMaxMemory(n uint64) DOption { return func(o *decoderOptions) error { if n == 0 { @@ -100,3 +116,34 @@ func WithDecoderMaxWindow(size uint64) DOption { return nil } } + +// WithDecodeAllCapLimit will limit DecodeAll to decoding cap(dst)-len(dst) bytes, +// or any size set in WithDecoderMaxMemory. +// This can be used to limit decoding to a specific maximum output size. +// Disabled by default. +func WithDecodeAllCapLimit(b bool) DOption { + return func(o *decoderOptions) error { + o.limitToCap = b + return nil + } +} + +// WithDecodeBuffersBelow will fully decode readers that have a +// `Bytes() []byte` and `Len() int` interface similar to bytes.Buffer. +// This typically uses less allocations but will have the full decompressed object in memory. +// Note that DecodeAllCapLimit will disable this, as well as giving a size of 0 or less. +// Default is 128KiB. +func WithDecodeBuffersBelow(size int) DOption { + return func(o *decoderOptions) error { + o.decodeBufsBelow = size + return nil + } +} + +// IgnoreChecksum allows to forcibly ignore checksum checking. +func IgnoreChecksum(b bool) DOption { + return func(o *decoderOptions) error { + o.ignoreChecksum = b + return nil + } +} diff --git a/vendor/github.com/klauspost/compress/zstd/enc_best.go b/vendor/github.com/klauspost/compress/zstd/enc_best.go index 96028ecd8366..dbbb88d92b39 100644 --- a/vendor/github.com/klauspost/compress/zstd/enc_best.go +++ b/vendor/github.com/klauspost/compress/zstd/enc_best.go @@ -32,6 +32,7 @@ type match struct { length int32 rep int32 est int32 + _ [12]byte // Aligned size to cache line: 4+4+4+4+4 bytes + 12 bytes padding = 32 bytes } const highScore = 25000 diff --git a/vendor/github.com/klauspost/compress/zstd/enc_better.go b/vendor/github.com/klauspost/compress/zstd/enc_better.go index 602c05ee0c4c..d70e3fd3d3e9 100644 --- a/vendor/github.com/klauspost/compress/zstd/enc_better.go +++ b/vendor/github.com/klauspost/compress/zstd/enc_better.go @@ -156,8 +156,8 @@ encodeLoop: panic("offset0 was 0") } - nextHashS := hashLen(cv, betterShortTableBits, betterShortLen) nextHashL := hashLen(cv, betterLongTableBits, betterLongLen) + nextHashS := hashLen(cv, betterShortTableBits, betterShortLen) candidateL := e.longTable[nextHashL] candidateS := e.table[nextHashS] @@ -416,15 +416,23 @@ encodeLoop: // Try to find a better match by searching for a long match at the end of the current best match if s+matched < sLimit { + // Allow some bytes at the beginning to mismatch. + // Sweet spot is around 3 bytes, but depends on input. + // The skipped bytes are tested in Extend backwards, + // and still picked up as part of the match if they do. 
+ const skipBeginning = 3 + nextHashL := hashLen(load6432(src, s+matched), betterLongTableBits, betterLongLen) - cv := load3232(src, s) + s2 := s + skipBeginning + cv := load3232(src, s2) candidateL := e.longTable[nextHashL] - coffsetL := candidateL.offset - e.cur - matched - if coffsetL >= 0 && coffsetL < s && s-coffsetL < e.maxMatchOff && cv == load3232(src, coffsetL) { + coffsetL := candidateL.offset - e.cur - matched + skipBeginning + if coffsetL >= 0 && coffsetL < s2 && s2-coffsetL < e.maxMatchOff && cv == load3232(src, coffsetL) { // Found a long match, at least 4 bytes. - matchedNext := e.matchlen(s+4, coffsetL+4, src) + 4 + matchedNext := e.matchlen(s2+4, coffsetL+4, src) + 4 if matchedNext > matched { t = coffsetL + s = s2 matched = matchedNext if debugMatches { println("long match at end-of-match") @@ -434,12 +442,13 @@ encodeLoop: // Check prev long... if true { - coffsetL = candidateL.prev - e.cur - matched - if coffsetL >= 0 && coffsetL < s && s-coffsetL < e.maxMatchOff && cv == load3232(src, coffsetL) { + coffsetL = candidateL.prev - e.cur - matched + skipBeginning + if coffsetL >= 0 && coffsetL < s2 && s2-coffsetL < e.maxMatchOff && cv == load3232(src, coffsetL) { // Found a long match, at least 4 bytes. - matchedNext := e.matchlen(s+4, coffsetL+4, src) + 4 + matchedNext := e.matchlen(s2+4, coffsetL+4, src) + 4 if matchedNext > matched { t = coffsetL + s = s2 matched = matchedNext if debugMatches { println("prev long match at end-of-match") @@ -518,8 +527,8 @@ encodeLoop: } // Store this, since we have it. - nextHashS := hashLen(cv, betterShortTableBits, betterShortLen) nextHashL := hashLen(cv, betterLongTableBits, betterLongLen) + nextHashS := hashLen(cv, betterShortTableBits, betterShortLen) // We have at least 4 byte match. // No need to check backwards. We come straight from a match @@ -674,8 +683,8 @@ encodeLoop: panic("offset0 was 0") } - nextHashS := hashLen(cv, betterShortTableBits, betterShortLen) nextHashL := hashLen(cv, betterLongTableBits, betterLongLen) + nextHashS := hashLen(cv, betterShortTableBits, betterShortLen) candidateL := e.longTable[nextHashL] candidateS := e.table[nextHashS] @@ -1047,8 +1056,8 @@ encodeLoop: } // Store this, since we have it. - nextHashS := hashLen(cv, betterShortTableBits, betterShortLen) nextHashL := hashLen(cv, betterLongTableBits, betterLongLen) + nextHashS := hashLen(cv, betterShortTableBits, betterShortLen) // We have at least 4 byte match. // No need to check backwards. 
We come straight from a match diff --git a/vendor/github.com/klauspost/compress/zstd/enc_dfast.go b/vendor/github.com/klauspost/compress/zstd/enc_dfast.go index d6b3104240b0..1f4a9a245563 100644 --- a/vendor/github.com/klauspost/compress/zstd/enc_dfast.go +++ b/vendor/github.com/klauspost/compress/zstd/enc_dfast.go @@ -127,8 +127,8 @@ encodeLoop: panic("offset0 was 0") } - nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen) nextHashL := hashLen(cv, dFastLongTableBits, dFastLongLen) + nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen) candidateL := e.longTable[nextHashL] candidateS := e.table[nextHashS] @@ -439,8 +439,8 @@ encodeLoop: var t int32 for { - nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen) nextHashL := hashLen(cv, dFastLongTableBits, dFastLongLen) + nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen) candidateL := e.longTable[nextHashL] candidateS := e.table[nextHashS] @@ -785,8 +785,8 @@ encodeLoop: panic("offset0 was 0") } - nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen) nextHashL := hashLen(cv, dFastLongTableBits, dFastLongLen) + nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen) candidateL := e.longTable[nextHashL] candidateS := e.table[nextHashS] @@ -969,7 +969,7 @@ encodeLoop: te0 := tableEntry{offset: index0 + e.cur, val: uint32(cv0)} te1 := tableEntry{offset: index1 + e.cur, val: uint32(cv1)} longHash1 := hashLen(cv0, dFastLongTableBits, dFastLongLen) - longHash2 := hashLen(cv0, dFastLongTableBits, dFastLongLen) + longHash2 := hashLen(cv1, dFastLongTableBits, dFastLongLen) e.longTable[longHash1] = te0 e.longTable[longHash2] = te1 e.markLongShardDirty(longHash1) @@ -1002,8 +1002,8 @@ encodeLoop: } // Store this, since we have it. - nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen) nextHashL := hashLen(cv, dFastLongTableBits, dFastLongLen) + nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen) // We have at least 4 byte match. // No need to check backwards. We come straight from a match @@ -1103,7 +1103,8 @@ func (e *doubleFastEncoderDict) Reset(d *dict, singleBlock bool) { } if allDirty || dirtyShardCnt > dLongTableShardCnt/2 { - copy(e.longTable[:], e.dictLongTable) + //copy(e.longTable[:], e.dictLongTable) + e.longTable = *(*[dFastLongTableSize]tableEntry)(e.dictLongTable) for i := range e.longTableShardDirty { e.longTableShardDirty[i] = false } @@ -1114,7 +1115,9 @@ func (e *doubleFastEncoderDict) Reset(d *dict, singleBlock bool) { continue } - copy(e.longTable[i*dLongTableShardSize:(i+1)*dLongTableShardSize], e.dictLongTable[i*dLongTableShardSize:(i+1)*dLongTableShardSize]) + // copy(e.longTable[i*dLongTableShardSize:(i+1)*dLongTableShardSize], e.dictLongTable[i*dLongTableShardSize:(i+1)*dLongTableShardSize]) + *(*[dLongTableShardSize]tableEntry)(e.longTable[i*dLongTableShardSize:]) = *(*[dLongTableShardSize]tableEntry)(e.dictLongTable[i*dLongTableShardSize:]) + e.longTableShardDirty[i] = false } } diff --git a/vendor/github.com/klauspost/compress/zstd/enc_fast.go b/vendor/github.com/klauspost/compress/zstd/enc_fast.go index 5f08a2830233..181edc02b6c5 100644 --- a/vendor/github.com/klauspost/compress/zstd/enc_fast.go +++ b/vendor/github.com/klauspost/compress/zstd/enc_fast.go @@ -85,7 +85,7 @@ func (e *fastEncoder) Encode(blk *blockEnc, src []byte) { // TEMPLATE const hashLog = tableBits // seems global, but would be nice to tweak. - const kSearchStrength = 7 + const kSearchStrength = 6 // nextEmit is where in src the next emitLiteral should start from. 
nextEmit := s @@ -334,7 +334,7 @@ func (e *fastEncoder) EncodeNoHist(blk *blockEnc, src []byte) { // TEMPLATE const hashLog = tableBits // seems global, but would be nice to tweak. - const kSearchStrength = 8 + const kSearchStrength = 6 // nextEmit is where in src the next emitLiteral should start from. nextEmit := s @@ -871,7 +871,8 @@ func (e *fastEncoderDict) Reset(d *dict, singleBlock bool) { const shardCnt = tableShardCnt const shardSize = tableShardSize if e.allDirty || dirtyShardCnt > shardCnt*4/6 { - copy(e.table[:], e.dictTable) + //copy(e.table[:], e.dictTable) + e.table = *(*[tableSize]tableEntry)(e.dictTable) for i := range e.tableShardDirty { e.tableShardDirty[i] = false } @@ -883,7 +884,8 @@ func (e *fastEncoderDict) Reset(d *dict, singleBlock bool) { continue } - copy(e.table[i*shardSize:(i+1)*shardSize], e.dictTable[i*shardSize:(i+1)*shardSize]) + //copy(e.table[i*shardSize:(i+1)*shardSize], e.dictTable[i*shardSize:(i+1)*shardSize]) + *(*[shardSize]tableEntry)(e.table[i*shardSize:]) = *(*[shardSize]tableEntry)(e.dictTable[i*shardSize:]) e.tableShardDirty[i] = false } e.allDirty = false diff --git a/vendor/github.com/klauspost/compress/zstd/encoder.go b/vendor/github.com/klauspost/compress/zstd/encoder.go index e6e315969b00..7aaaedb23e58 100644 --- a/vendor/github.com/klauspost/compress/zstd/encoder.go +++ b/vendor/github.com/klauspost/compress/zstd/encoder.go @@ -98,23 +98,25 @@ func (e *Encoder) Reset(w io.Writer) { if cap(s.filling) == 0 { s.filling = make([]byte, 0, e.o.blockSize) } - if cap(s.current) == 0 { - s.current = make([]byte, 0, e.o.blockSize) - } - if cap(s.previous) == 0 { - s.previous = make([]byte, 0, e.o.blockSize) + if e.o.concurrent > 1 { + if cap(s.current) == 0 { + s.current = make([]byte, 0, e.o.blockSize) + } + if cap(s.previous) == 0 { + s.previous = make([]byte, 0, e.o.blockSize) + } + s.current = s.current[:0] + s.previous = s.previous[:0] + if s.writing == nil { + s.writing = &blockEnc{lowMem: e.o.lowMem} + s.writing.init() + } + s.writing.initNewEncode() } if s.encoder == nil { s.encoder = e.o.encoder() } - if s.writing == nil { - s.writing = &blockEnc{lowMem: e.o.lowMem} - s.writing.init() - } - s.writing.initNewEncode() s.filling = s.filling[:0] - s.current = s.current[:0] - s.previous = s.previous[:0] s.encoder.Reset(e.o.dict, false) s.headerWritten = false s.eofWritten = false @@ -258,6 +260,46 @@ func (e *Encoder) nextBlock(final bool) error { return s.err } + // SYNC: + if e.o.concurrent == 1 { + src := s.filling + s.nInput += int64(len(s.filling)) + if debugEncoder { + println("Adding sync block,", len(src), "bytes, final:", final) + } + enc := s.encoder + blk := enc.Block() + blk.reset(nil) + enc.Encode(blk, src) + blk.last = final + if final { + s.eofWritten = true + } + + err := errIncompressible + // If we got the exact same number of literals as input, + // assume the literals cannot be compressed. + if len(src) != len(blk.literals) || len(src) != e.o.blockSize { + err = blk.encode(src, e.o.noEntropy, !e.o.allLitEntropy) + } + switch err { + case errIncompressible: + if debugEncoder { + println("Storing incompressible block as raw") + } + blk.encodeRaw(src) + // In fast mode, we do not transfer offsets, so we don't have to deal with changing the. + case nil: + default: + s.err = err + return err + } + _, s.err = s.w.Write(blk.output) + s.nWritten += int64(len(blk.output)) + s.filling = s.filling[:0] + return s.err + } + // Move blocks forward. 
s.filling, s.current, s.previous = s.previous[:0], s.filling, s.current s.nInput += int64(len(s.current)) @@ -486,8 +528,8 @@ func (e *Encoder) EncodeAll(src, dst []byte) []byte { // If a non-single block is needed the encoder will reset again. e.encoders <- enc }() - // Use single segments when above minimum window and below 1MB. - single := len(src) < 1<<20 && len(src) > MinWindowSize + // Use single segments when above minimum window and below window size. + single := len(src) <= e.o.windowSize && len(src) > MinWindowSize if e.o.single != nil { single = *e.o.single } @@ -509,7 +551,7 @@ func (e *Encoder) EncodeAll(src, dst []byte) []byte { } // If we can do everything in one block, prefer that. - if len(src) <= maxCompressedBlockSize { + if len(src) <= e.o.blockSize { enc.Reset(e.o.dict, true) // Slightly faster with no history and everything in one block. if e.o.crc { diff --git a/vendor/github.com/klauspost/compress/zstd/encoder_options.go b/vendor/github.com/klauspost/compress/zstd/encoder_options.go index 7d29e1d689ee..a7c5e1aac432 100644 --- a/vendor/github.com/klauspost/compress/zstd/encoder_options.go +++ b/vendor/github.com/klauspost/compress/zstd/encoder_options.go @@ -24,6 +24,7 @@ type encoderOptions struct { allLitEntropy bool customWindow bool customALEntropy bool + customBlockSize bool lowMem bool dict *dict } @@ -33,7 +34,7 @@ func (o *encoderOptions) setDefault() { concurrent: runtime.GOMAXPROCS(0), crc: true, single: nil, - blockSize: 1 << 16, + blockSize: maxCompressedBlockSize, windowSize: 8 << 20, level: SpeedDefault, allLitEntropy: true, @@ -75,6 +76,7 @@ func WithEncoderCRC(b bool) EOption { // WithEncoderConcurrency will set the concurrency, // meaning the maximum number of encoders to run concurrently. // The value supplied must be at least 1. +// For streams, setting a value of 1 will disable async compression. // By default this will be set to GOMAXPROCS. func WithEncoderConcurrency(n int) EOption { return func(o *encoderOptions) error { @@ -106,6 +108,7 @@ func WithWindowSize(n int) EOption { o.customWindow = true if o.blockSize > o.windowSize { o.blockSize = o.windowSize + o.customBlockSize = true } return nil } @@ -188,10 +191,9 @@ func EncoderLevelFromZstd(level int) EncoderLevel { return SpeedDefault case level >= 6 && level < 10: return SpeedBetterCompression - case level >= 10: + default: return SpeedBestCompression } - return SpeedDefault } // String provides a string representation of the compression level. @@ -222,6 +224,9 @@ func WithEncoderLevel(l EncoderLevel) EOption { switch o.level { case SpeedFastest: o.windowSize = 4 << 20 + if !o.customBlockSize { + o.blockSize = 1 << 16 + } case SpeedDefault: o.windowSize = 8 << 20 case SpeedBetterCompression: @@ -278,7 +283,7 @@ func WithNoEntropyCompression(b bool) EOption { // a decoder is allowed to reject a compressed frame which requests a memory size beyond decoder's authorized range. // For broader compatibility, decoders are recommended to support memory sizes of at least 8 MB. // This is only a recommendation, each decoder is free to support higher or lower limits, depending on local limitations. -// If this is not specified, block encodes will automatically choose this based on the input size. +// If this is not specified, block encodes will automatically choose this based on the input size and the window size. // This setting has no effect on streamed encodes. 
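Reviewer note: the encoder gains a matching synchronous path, documented above as "setting a value of 1 will disable async compression". A minimal, hedged sketch of a stream encode using only options visible in this hunk; file names are placeholders:

```go
package main

import (
	"io"
	"log"
	"os"

	"github.com/klauspost/compress/zstd"
)

func main() {
	in, err := os.Open("example.txt") // hypothetical input
	if err != nil {
		log.Fatal(err)
	}
	defer in.Close()

	out, err := os.Create("example.txt.zst") // hypothetical output
	if err != nil {
		log.Fatal(err)
	}
	defer out.Close()

	// Concurrency 1 disables async block compression, trading throughput
	// for fewer goroutines and less buffering.
	enc, err := zstd.NewWriter(out,
		zstd.WithEncoderConcurrency(1),
		zstd.WithEncoderLevel(zstd.SpeedDefault),
	)
	if err != nil {
		log.Fatal(err)
	}
	if _, err := io.Copy(enc, in); err != nil {
		enc.Close()
		log.Fatal(err)
	}
	// Close flushes the final block and the frame checksum.
	if err := enc.Close(); err != nil {
		log.Fatal(err)
	}
}
```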
func WithSingleSegment(b bool) EOption { return func(o *encoderOptions) error { diff --git a/vendor/github.com/klauspost/compress/zstd/framedec.go b/vendor/github.com/klauspost/compress/zstd/framedec.go index 989c79f8c315..b6c5054176a6 100644 --- a/vendor/github.com/klauspost/compress/zstd/framedec.go +++ b/vendor/github.com/klauspost/compress/zstd/framedec.go @@ -8,23 +8,17 @@ import ( "bytes" "encoding/hex" "errors" - "hash" "io" - "sync" "github.com/klauspost/compress/zstd/internal/xxhash" ) type frameDec struct { - o decoderOptions - crc hash.Hash64 - offset int64 + o decoderOptions + crc *xxhash.Digest WindowSize uint64 - // In order queue of blocks being decoded. - decoding chan *blockDec - // Frame history passed between blocks history history @@ -34,15 +28,10 @@ type frameDec struct { bBuf byteBuf FrameContentSize uint64 - frameDone sync.WaitGroup DictionaryID *uint32 HasCheckSum bool SingleSegment bool - - // asyncRunning indicates whether the async routine processes input on 'decoding'. - asyncRunningMu sync.Mutex - asyncRunning bool } const ( @@ -117,7 +106,7 @@ func (d *frameDec) reset(br byteBuffer) error { } n := uint32(b[0]) | (uint32(b[1]) << 8) | (uint32(b[2]) << 16) | (uint32(b[3]) << 24) println("Skipping frame with", n, "bytes.") - err = br.skipN(int(n)) + err = br.skipN(int64(n)) if err != nil { if debugDecoder { println("Reading discarded frame", err) @@ -208,7 +197,7 @@ func (d *frameDec) reset(br byteBuffer) error { default: fcsSize = 1 << v } - d.FrameContentSize = 0 + d.FrameContentSize = fcsUnknown if fcsSize > 0 { b, err := br.readSmall(fcsSize) if err != nil { @@ -229,9 +218,10 @@ func (d *frameDec) reset(br byteBuffer) error { d.FrameContentSize = uint64(d1) | (uint64(d2) << 32) } if debugDecoder { - println("field size bits:", v, "fcsSize:", fcsSize, "FrameContentSize:", d.FrameContentSize, hex.EncodeToString(b[:fcsSize]), "singleseg:", d.SingleSegment, "window:", d.WindowSize) + println("Read FCS:", d.FrameContentSize) } } + // Move this to shared. d.HasCheckSum = fhd&(1<<2) != 0 if d.HasCheckSum { @@ -241,20 +231,27 @@ func (d *frameDec) reset(br byteBuffer) error { d.crc.Reset() } + if d.WindowSize > d.o.maxWindowSize { + if debugDecoder { + printf("window size %d > max %d\n", d.WindowSize, d.o.maxWindowSize) + } + return ErrWindowSizeExceeded + } + if d.WindowSize == 0 && d.SingleSegment { // We may not need window in this case. d.WindowSize = d.FrameContentSize if d.WindowSize < MinWindowSize { d.WindowSize = MinWindowSize } - } - - if d.WindowSize > uint64(d.o.maxWindowSize) { - if debugDecoder { - printf("window size %d > max %d\n", d.WindowSize, d.o.maxWindowSize) + if d.WindowSize > d.o.maxDecodedSize { + if debugDecoder { + printf("window size %d > max %d\n", d.WindowSize, d.o.maxWindowSize) + } + return ErrDecoderSizeExceeded } - return ErrWindowSizeExceeded } + // The minimum Window_Size is 1 KB. if d.WindowSize < MinWindowSize { if debugDecoder { @@ -263,11 +260,18 @@ func (d *frameDec) reset(br byteBuffer) error { return ErrWindowSizeTooSmall } d.history.windowSize = int(d.WindowSize) - if d.o.lowMem && d.history.windowSize < maxBlockSize { - d.history.maxSize = d.history.windowSize * 2 + if !d.o.lowMem || d.history.windowSize < maxBlockSize { + // Alloc 2x window size if not low-mem, or very small window size. 
+ d.history.allocFrameBuffer = d.history.windowSize * 2 } else { - d.history.maxSize = d.history.windowSize + maxBlockSize + // Alloc with one additional block + d.history.allocFrameBuffer = d.history.windowSize + maxBlockSize } + + if debugDecoder { + println("Frame: Dict:", d.DictionaryID, "FrameContentSize:", d.FrameContentSize, "singleseg:", d.SingleSegment, "window:", d.WindowSize, "crc:", d.HasCheckSum) + } + // history contains input - maybe we do something d.rawInput = br return nil @@ -276,62 +280,24 @@ func (d *frameDec) reset(br byteBuffer) error { // next will start decoding the next block from stream. func (d *frameDec) next(block *blockDec) error { if debugDecoder { - printf("decoding new block %p:%p", block, block.data) + println("decoding new block") } err := block.reset(d.rawInput, d.WindowSize) if err != nil { println("block error:", err) // Signal the frame decoder we have a problem. - d.sendErr(block, err) + block.sendErr(err) return err } - block.input <- struct{}{} - if debugDecoder { - println("next block:", block) - } - d.asyncRunningMu.Lock() - defer d.asyncRunningMu.Unlock() - if !d.asyncRunning { - return nil - } - if block.Last { - // We indicate the frame is done by sending io.EOF - d.decoding <- block - return io.EOF - } - d.decoding <- block return nil } -// sendEOF will queue an error block on the frame. -// This will cause the frame decoder to return when it encounters the block. -// Returns true if the decoder was added. -func (d *frameDec) sendErr(block *blockDec, err error) bool { - d.asyncRunningMu.Lock() - defer d.asyncRunningMu.Unlock() - if !d.asyncRunning { - return false - } - - println("sending error", err.Error()) - block.sendErr(err) - d.decoding <- block - return true -} - // checkCRC will check the checksum if the frame has one. // Will return ErrCRCMismatch if crc check failed, otherwise nil. func (d *frameDec) checkCRC() error { if !d.HasCheckSum { return nil } - var tmp [4]byte - got := d.crc.Sum64() - // Flip to match file order. - tmp[0] = byte(got >> 0) - tmp[1] = byte(got >> 8) - tmp[2] = byte(got >> 16) - tmp[3] = byte(got >> 24) // We can overwrite upper tmp now want, err := d.rawInput.readSmall(4) @@ -340,6 +306,18 @@ func (d *frameDec) checkCRC() error { return err } + if d.o.ignoreChecksum { + return nil + } + + var tmp [4]byte + got := d.crc.Sum64() + // Flip to match file order. + tmp[0] = byte(got >> 0) + tmp[1] = byte(got >> 8) + tmp[2] = byte(got >> 16) + tmp[3] = byte(got >> 24) + if !bytes.Equal(tmp[:], want) { if debugDecoder { println("CRC Check Failed:", tmp[:], "!=", want) @@ -352,133 +330,52 @@ func (d *frameDec) checkCRC() error { return nil } -func (d *frameDec) initAsync() { - if !d.o.lowMem && !d.SingleSegment { - // set max extra size history to 2MB. - d.history.maxSize = d.history.windowSize + maxBlockSize - } - // re-alloc if more than one extra block size. - if d.o.lowMem && cap(d.history.b) > d.history.maxSize+maxBlockSize { - d.history.b = make([]byte, 0, d.history.maxSize) - } - if cap(d.history.b) < d.history.maxSize { - d.history.b = make([]byte, 0, d.history.maxSize) - } - if cap(d.decoding) < d.o.concurrent { - d.decoding = make(chan *blockDec, d.o.concurrent) - } - if debugDecoder { - h := d.history - printf("history init. len: %d, cap: %d", len(h.b), cap(h.b)) - } - d.asyncRunningMu.Lock() - d.asyncRunning = true - d.asyncRunningMu.Unlock() -} - -// startDecoder will start decoding blocks and write them to the writer. -// The decoder will stop as soon as an error occurs or at end of frame. 
-// When the frame has finished decoding the *bufio.Reader -// containing the remaining input will be sent on frameDec.frameDone. -func (d *frameDec) startDecoder(output chan decodeOutput) { - written := int64(0) - - defer func() { - d.asyncRunningMu.Lock() - d.asyncRunning = false - d.asyncRunningMu.Unlock() - - // Drain the currently decoding. - d.history.error = true - flushdone: - for { - select { - case b := <-d.decoding: - b.history <- &d.history - output <- <-b.result - default: - break flushdone - } - } - println("frame decoder done, signalling done") - d.frameDone.Done() - }() - // Get decoder for first block. - block := <-d.decoding - block.history <- &d.history - for { - var next *blockDec - // Get result - r := <-block.result - if r.err != nil { - println("Result contained error", r.err) - output <- r - return - } - if debugDecoder { - println("got result, from ", d.offset, "to", d.offset+int64(len(r.b))) - d.offset += int64(len(r.b)) - } - if !block.Last { - // Send history to next block - select { - case next = <-d.decoding: - if debugDecoder { - println("Sending ", len(d.history.b), "bytes as history") - } - next.history <- &d.history - default: - // Wait until we have sent the block, so - // other decoders can potentially get the decoder. - next = nil - } - } - - // Add checksum, async to decoding. - if d.HasCheckSum { - n, err := d.crc.Write(r.b) - if err != nil { - r.err = err - if n != len(r.b) { - r.err = io.ErrShortWrite - } - output <- r - return - } - } - written += int64(len(r.b)) - if d.SingleSegment && uint64(written) > d.FrameContentSize { - println("runDecoder: single segment and", uint64(written), ">", d.FrameContentSize) - r.err = ErrFrameSizeExceeded - output <- r - return - } - if block.Last { - r.err = d.checkCRC() - output <- r - return - } - output <- r - if next == nil { - // There was no decoder available, we wait for one now that we have sent to the writer. - if debugDecoder { - println("Sending ", len(d.history.b), " bytes as history") - } - next = <-d.decoding - next.history <- &d.history +// consumeCRC reads the checksum data if the frame has one. +func (d *frameDec) consumeCRC() error { + if d.HasCheckSum { + _, err := d.rawInput.readSmall(4) + if err != nil { + println("CRC missing?", err) + return err } - block = next } + + return nil } -// runDecoder will create a sync decoder that will decode a block of data. +// runDecoder will run the decoder for the remainder of the frame. func (d *frameDec) runDecoder(dst []byte, dec *blockDec) ([]byte, error) { saved := d.history.b // We use the history for output to avoid copying it. d.history.b = dst + d.history.ignoreBuffer = len(dst) // Store input length, so we only check new data. 
crcStart := len(dst) + d.history.decoders.maxSyncLen = 0 + if d.o.limitToCap { + d.history.decoders.maxSyncLen = uint64(cap(dst) - len(dst)) + } + if d.FrameContentSize != fcsUnknown { + if !d.o.limitToCap || d.FrameContentSize+uint64(len(dst)) < d.history.decoders.maxSyncLen { + d.history.decoders.maxSyncLen = d.FrameContentSize + uint64(len(dst)) + } + if d.history.decoders.maxSyncLen > d.o.maxDecodedSize { + if debugDecoder { + println("maxSyncLen:", d.history.decoders.maxSyncLen, "> maxDecodedSize:", d.o.maxDecodedSize) + } + return dst, ErrDecoderSizeExceeded + } + if debugDecoder { + println("maxSyncLen:", d.history.decoders.maxSyncLen) + } + if !d.o.limitToCap && uint64(cap(dst)) < d.history.decoders.maxSyncLen { + // Alloc for output + dst2 := make([]byte, len(dst), d.history.decoders.maxSyncLen+compressedBlockOverAlloc) + copy(dst2, dst) + dst = dst2 + } + } var err error for { err = dec.reset(d.rawInput, d.WindowSize) @@ -489,29 +386,47 @@ func (d *frameDec) runDecoder(dst []byte, dec *blockDec) ([]byte, error) { println("next block:", dec) } err = dec.decodeBuf(&d.history) - if err != nil || dec.Last { + if err != nil { break } - if uint64(len(d.history.b)) > d.o.maxDecodedSize { + if uint64(len(d.history.b)-crcStart) > d.o.maxDecodedSize { + println("runDecoder: maxDecodedSize exceeded", uint64(len(d.history.b)-crcStart), ">", d.o.maxDecodedSize) err = ErrDecoderSizeExceeded break } - if d.SingleSegment && uint64(len(d.history.b)) > d.o.maxDecodedSize { - println("runDecoder: single segment and", uint64(len(d.history.b)), ">", d.o.maxDecodedSize) + if d.o.limitToCap && len(d.history.b) > cap(dst) { + println("runDecoder: cap exceeded", uint64(len(d.history.b)), ">", cap(dst)) + err = ErrDecoderSizeExceeded + break + } + if uint64(len(d.history.b)-crcStart) > d.FrameContentSize { + println("runDecoder: FrameContentSize exceeded", uint64(len(d.history.b)-crcStart), ">", d.FrameContentSize) err = ErrFrameSizeExceeded break } + if dec.Last { + break + } + if debugDecoder { + println("runDecoder: FrameContentSize", uint64(len(d.history.b)-crcStart), "<=", d.FrameContentSize) + } } dst = d.history.b if err == nil { - if d.HasCheckSum { - var n int - n, err = d.crc.Write(dst[crcStart:]) - if err == nil { - if n != len(dst)-crcStart { - err = io.ErrShortWrite - } else { - err = d.checkCRC() + if d.FrameContentSize != fcsUnknown && uint64(len(d.history.b)-crcStart) != d.FrameContentSize { + err = ErrFrameSizeMismatch + } else if d.HasCheckSum { + if d.o.ignoreChecksum { + err = d.consumeCRC() + } else { + var n int + n, err = d.crc.Write(dst[crcStart:]) + if err == nil { + if n != len(dst)-crcStart { + err = io.ErrShortWrite + } else { + err = d.checkCRC() + } } } } diff --git a/vendor/github.com/klauspost/compress/zstd/fse_decoder.go b/vendor/github.com/klauspost/compress/zstd/fse_decoder.go index bb3d4fd6c312..2f8860a722b8 100644 --- a/vendor/github.com/klauspost/compress/zstd/fse_decoder.go +++ b/vendor/github.com/klauspost/compress/zstd/fse_decoder.go @@ -5,8 +5,10 @@ package zstd import ( + "encoding/binary" "errors" "fmt" + "io" ) const ( @@ -178,10 +180,32 @@ func (s *fseDecoder) readNCount(b *byteReader, maxSymbol uint16) error { return fmt.Errorf("corruption detected (total %d != %d)", gotTotal, 1<> 3) - // println(s.norm[:s.symbolLen], s.symbolLen) return s.buildDtable() } +func (s *fseDecoder) mustReadFrom(r io.Reader) { + fatalErr := func(err error) { + if err != nil { + panic(err) + } + } + // dt [maxTablesize]decSymbol // Decompression table. 
+ // symbolLen uint16 // Length of active part of the symbol table. + // actualTableLog uint8 // Selected tablelog. + // maxBits uint8 // Maximum number of additional bits + // // used for table creation to avoid allocations. + // stateTable [256]uint16 + // norm [maxSymbolValue + 1]int16 + // preDefined bool + fatalErr(binary.Read(r, binary.LittleEndian, &s.dt)) + fatalErr(binary.Read(r, binary.LittleEndian, &s.symbolLen)) + fatalErr(binary.Read(r, binary.LittleEndian, &s.actualTableLog)) + fatalErr(binary.Read(r, binary.LittleEndian, &s.maxBits)) + fatalErr(binary.Read(r, binary.LittleEndian, &s.stateTable)) + fatalErr(binary.Read(r, binary.LittleEndian, &s.norm)) + fatalErr(binary.Read(r, binary.LittleEndian, &s.preDefined)) +} + // decSymbol contains information about a state entry, // Including the state offset base, the output symbol and // the number of bits to read for the low part of the destination state. @@ -204,18 +228,10 @@ func (d decSymbol) newState() uint16 { return uint16(d >> 16) } -func (d decSymbol) baseline() uint32 { - return uint32(d >> 32) -} - func (d decSymbol) baselineInt() int { return int(d >> 32) } -func (d *decSymbol) set(nbits, addBits uint8, newState uint16, baseline uint32) { - *d = decSymbol(nbits) | (decSymbol(addBits) << 8) | (decSymbol(newState) << 16) | (decSymbol(baseline) << 32) -} - func (d *decSymbol) setNBits(nBits uint8) { const mask = 0xffffffffffffff00 *d = (*d & mask) | decSymbol(nBits) @@ -231,11 +247,6 @@ func (d *decSymbol) setNewState(state uint16) { *d = (*d & mask) | decSymbol(state)<<16 } -func (d *decSymbol) setBaseline(baseline uint32) { - const mask = 0xffffffff - *d = (*d & mask) | decSymbol(baseline)<<32 -} - func (d *decSymbol) setExt(addBits uint8, baseline uint32) { const mask = 0xffff00ff *d = (*d & mask) | (decSymbol(addBits) << 8) | (decSymbol(baseline) << 32) @@ -257,68 +268,6 @@ func (s *fseDecoder) setRLE(symbol decSymbol) { s.dt[0] = symbol } -// buildDtable will build the decoding table. -func (s *fseDecoder) buildDtable() error { - tableSize := uint32(1 << s.actualTableLog) - highThreshold := tableSize - 1 - symbolNext := s.stateTable[:256] - - // Init, lay down lowprob symbols - { - for i, v := range s.norm[:s.symbolLen] { - if v == -1 { - s.dt[highThreshold].setAddBits(uint8(i)) - highThreshold-- - symbolNext[i] = 1 - } else { - symbolNext[i] = uint16(v) - } - } - } - // Spread symbols - { - tableMask := tableSize - 1 - step := tableStep(tableSize) - position := uint32(0) - for ss, v := range s.norm[:s.symbolLen] { - for i := 0; i < int(v); i++ { - s.dt[position].setAddBits(uint8(ss)) - position = (position + step) & tableMask - for position > highThreshold { - // lowprob area - position = (position + step) & tableMask - } - } - } - if position != 0 { - // position must reach all cells once, otherwise normalizedCounter is incorrect - return errors.New("corrupted input (position != 0)") - } - } - - // Build Decoding table - { - tableSize := uint16(1 << s.actualTableLog) - for u, v := range s.dt[:tableSize] { - symbol := v.addBits() - nextState := symbolNext[symbol] - symbolNext[symbol] = nextState + 1 - nBits := s.actualTableLog - byte(highBits(uint32(nextState))) - s.dt[u&maxTableMask].setNBits(nBits) - newState := (nextState << nBits) - tableSize - if newState > tableSize { - return fmt.Errorf("newState (%d) outside table size (%d)", newState, tableSize) - } - if newState == uint16(u) && nBits == 0 { - // Seems weird that this is possible with nbits > 0. 
- return fmt.Errorf("newState (%d) == oldState (%d) and no bits", newState, u) - } - s.dt[u&maxTableMask].setNewState(newState) - } - } - return nil -} - // transform will transform the decoder table into a table usable for // decoding without having to apply the transformation while decoding. // The state will contain the base value and the number of bits to read. @@ -352,34 +301,7 @@ func (s *fseState) init(br *bitReader, tableLog uint8, dt []decSymbol) { s.state = dt[br.getBits(tableLog)] } -// next returns the current symbol and sets the next state. -// At least tablelog bits must be available in the bit reader. -func (s *fseState) next(br *bitReader) { - lowBits := uint16(br.getBits(s.state.nbBits())) - s.state = s.dt[s.state.newState()+lowBits] -} - -// finished returns true if all bits have been read from the bitstream -// and the next state would require reading bits from the input. -func (s *fseState) finished(br *bitReader) bool { - return br.finished() && s.state.nbBits() > 0 -} - -// final returns the current state symbol without decoding the next. -func (s *fseState) final() (int, uint8) { - return s.state.baselineInt(), s.state.addBits() -} - // final returns the current state symbol without decoding the next. func (s decSymbol) final() (int, uint8) { return s.baselineInt(), s.addBits() } - -// nextFast returns the next symbol and sets the next state. -// This can only be used if no symbols are 0 bits. -// At least tablelog bits must be available in the bit reader. -func (s *fseState) nextFast(br *bitReader) (uint32, uint8) { - lowBits := br.get16BitsFast(s.state.nbBits()) - s.state = s.dt[s.state.newState()+lowBits] - return s.state.baseline(), s.state.addBits() -} diff --git a/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.go b/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.go new file mode 100644 index 000000000000..d04a829b0a0e --- /dev/null +++ b/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.go @@ -0,0 +1,65 @@ +//go:build amd64 && !appengine && !noasm && gc +// +build amd64,!appengine,!noasm,gc + +package zstd + +import ( + "fmt" +) + +type buildDtableAsmContext struct { + // inputs + stateTable *uint16 + norm *int16 + dt *uint64 + + // outputs --- set by the procedure in the case of error; + // for interpretation please see the error handling part below + errParam1 uint64 + errParam2 uint64 +} + +// buildDtable_asm is an x86 assembly implementation of fseDecoder.buildDtable. +// Function returns non-zero exit code on error. +// +//go:noescape +func buildDtable_asm(s *fseDecoder, ctx *buildDtableAsmContext) int + +// please keep in sync with _generate/gen_fse.go +const ( + errorCorruptedNormalizedCounter = 1 + errorNewStateTooBig = 2 + errorNewStateNoBits = 3 +) + +// buildDtable will build the decoding table. 
+func (s *fseDecoder) buildDtable() error { + ctx := buildDtableAsmContext{ + stateTable: &s.stateTable[0], + norm: &s.norm[0], + dt: (*uint64)(&s.dt[0]), + } + code := buildDtable_asm(s, &ctx) + + if code != 0 { + switch code { + case errorCorruptedNormalizedCounter: + position := ctx.errParam1 + return fmt.Errorf("corrupted input (position=%d, expected 0)", position) + + case errorNewStateTooBig: + newState := decSymbol(ctx.errParam1) + size := ctx.errParam2 + return fmt.Errorf("newState (%d) outside table size (%d)", newState, size) + + case errorNewStateNoBits: + newState := decSymbol(ctx.errParam1) + oldState := decSymbol(ctx.errParam2) + return fmt.Errorf("newState (%d) == oldState (%d) and no bits", newState, oldState) + + default: + return fmt.Errorf("buildDtable_asm returned unhandled nonzero code = %d", code) + } + } + return nil +} diff --git a/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.s b/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.s new file mode 100644 index 000000000000..bcde39869535 --- /dev/null +++ b/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.s @@ -0,0 +1,126 @@ +// Code generated by command: go run gen_fse.go -out ../fse_decoder_amd64.s -pkg=zstd. DO NOT EDIT. + +//go:build !appengine && !noasm && gc && !noasm + +// func buildDtable_asm(s *fseDecoder, ctx *buildDtableAsmContext) int +TEXT ·buildDtable_asm(SB), $0-24 + MOVQ ctx+8(FP), CX + MOVQ s+0(FP), DI + + // Load values + MOVBQZX 4098(DI), DX + XORQ AX, AX + BTSQ DX, AX + MOVQ (CX), BX + MOVQ 16(CX), SI + LEAQ -1(AX), R8 + MOVQ 8(CX), CX + MOVWQZX 4096(DI), DI + + // End load values + // Init, lay down lowprob symbols + XORQ R9, R9 + JMP init_main_loop_condition + +init_main_loop: + MOVWQSX (CX)(R9*2), R10 + CMPW R10, $-1 + JNE do_not_update_high_threshold + MOVB R9, 1(SI)(R8*8) + DECQ R8 + MOVQ $0x0000000000000001, R10 + +do_not_update_high_threshold: + MOVW R10, (BX)(R9*2) + INCQ R9 + +init_main_loop_condition: + CMPQ R9, DI + JL init_main_loop + + // Spread symbols + // Calculate table step + MOVQ AX, R9 + SHRQ $0x01, R9 + MOVQ AX, R10 + SHRQ $0x03, R10 + LEAQ 3(R9)(R10*1), R9 + + // Fill add bits values + LEAQ -1(AX), R10 + XORQ R11, R11 + XORQ R12, R12 + JMP spread_main_loop_condition + +spread_main_loop: + XORQ R13, R13 + MOVWQSX (CX)(R12*2), R14 + JMP spread_inner_loop_condition + +spread_inner_loop: + MOVB R12, 1(SI)(R11*8) + +adjust_position: + ADDQ R9, R11 + ANDQ R10, R11 + CMPQ R11, R8 + JG adjust_position + INCQ R13 + +spread_inner_loop_condition: + CMPQ R13, R14 + JL spread_inner_loop + INCQ R12 + +spread_main_loop_condition: + CMPQ R12, DI + JL spread_main_loop + TESTQ R11, R11 + JZ spread_check_ok + MOVQ ctx+8(FP), AX + MOVQ R11, 24(AX) + MOVQ $+1, ret+16(FP) + RET + +spread_check_ok: + // Build Decoding table + XORQ DI, DI + +build_table_main_table: + MOVBQZX 1(SI)(DI*8), CX + MOVWQZX (BX)(CX*2), R8 + LEAQ 1(R8), R9 + MOVW R9, (BX)(CX*2) + MOVQ R8, R9 + BSRQ R9, R9 + MOVQ DX, CX + SUBQ R9, CX + SHLQ CL, R8 + SUBQ AX, R8 + MOVB CL, (SI)(DI*8) + MOVW R8, 2(SI)(DI*8) + CMPQ R8, AX + JLE build_table_check1_ok + MOVQ ctx+8(FP), CX + MOVQ R8, 24(CX) + MOVQ AX, 32(CX) + MOVQ $+2, ret+16(FP) + RET + +build_table_check1_ok: + TESTB CL, CL + JNZ build_table_check2_ok + CMPW R8, DI + JNE build_table_check2_ok + MOVQ ctx+8(FP), AX + MOVQ R8, 24(AX) + MOVQ DI, 32(AX) + MOVQ $+3, ret+16(FP) + RET + +build_table_check2_ok: + INCQ DI + CMPQ DI, AX + JL build_table_main_table + MOVQ $+0, ret+16(FP) + RET diff --git 
a/vendor/github.com/klauspost/compress/zstd/fse_decoder_generic.go b/vendor/github.com/klauspost/compress/zstd/fse_decoder_generic.go new file mode 100644 index 000000000000..332e51fe44fa --- /dev/null +++ b/vendor/github.com/klauspost/compress/zstd/fse_decoder_generic.go @@ -0,0 +1,72 @@ +//go:build !amd64 || appengine || !gc || noasm +// +build !amd64 appengine !gc noasm + +package zstd + +import ( + "errors" + "fmt" +) + +// buildDtable will build the decoding table. +func (s *fseDecoder) buildDtable() error { + tableSize := uint32(1 << s.actualTableLog) + highThreshold := tableSize - 1 + symbolNext := s.stateTable[:256] + + // Init, lay down lowprob symbols + { + for i, v := range s.norm[:s.symbolLen] { + if v == -1 { + s.dt[highThreshold].setAddBits(uint8(i)) + highThreshold-- + symbolNext[i] = 1 + } else { + symbolNext[i] = uint16(v) + } + } + } + + // Spread symbols + { + tableMask := tableSize - 1 + step := tableStep(tableSize) + position := uint32(0) + for ss, v := range s.norm[:s.symbolLen] { + for i := 0; i < int(v); i++ { + s.dt[position].setAddBits(uint8(ss)) + position = (position + step) & tableMask + for position > highThreshold { + // lowprob area + position = (position + step) & tableMask + } + } + } + if position != 0 { + // position must reach all cells once, otherwise normalizedCounter is incorrect + return errors.New("corrupted input (position != 0)") + } + } + + // Build Decoding table + { + tableSize := uint16(1 << s.actualTableLog) + for u, v := range s.dt[:tableSize] { + symbol := v.addBits() + nextState := symbolNext[symbol] + symbolNext[symbol] = nextState + 1 + nBits := s.actualTableLog - byte(highBits(uint32(nextState))) + s.dt[u&maxTableMask].setNBits(nBits) + newState := (nextState << nBits) - tableSize + if newState > tableSize { + return fmt.Errorf("newState (%d) outside table size (%d)", newState, tableSize) + } + if newState == uint16(u) && nBits == 0 { + // Seems weird that this is possible with nbits > 0. + return fmt.Errorf("newState (%d) == oldState (%d) and no bits", newState, u) + } + s.dt[u&maxTableMask].setNewState(newState) + } + } + return nil +} diff --git a/vendor/github.com/klauspost/compress/zstd/fse_encoder.go b/vendor/github.com/klauspost/compress/zstd/fse_encoder.go index 5442061b18df..ab26326a8ff8 100644 --- a/vendor/github.com/klauspost/compress/zstd/fse_encoder.go +++ b/vendor/github.com/klauspost/compress/zstd/fse_encoder.go @@ -76,21 +76,6 @@ func (s *fseEncoder) HistogramFinished(maxSymbol uint8, maxCount int) { s.clearCount = maxCount != 0 } -// prepare will prepare and allocate scratch tables used for both compression and decompression. -func (s *fseEncoder) prepare() (*fseEncoder, error) { - if s == nil { - s = &fseEncoder{} - } - s.useRLE = false - if s.clearCount && s.maxCount == 0 { - for i := range s.count { - s.count[i] = 0 - } - s.clearCount = false - } - return s, nil -} - // allocCtable will allocate tables needed for compression. // If existing tables a re big enough, they are simply re-used. func (s *fseEncoder) allocCtable() { @@ -709,14 +694,6 @@ func (c *cState) init(bw *bitWriter, ct *cTable, first symbolTransform) { c.state = c.stateTable[lu] } -// encode the output symbol provided and write it to the bitstream. 
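The generic `buildDtable` above spreads symbols through the table with a fixed stride and treats `position != 0` at the end as corruption. A small sketch of why that check works, assuming the standard FSE step formula used by this package: the stride is odd, hence coprime with the power-of-two table size, so the walk visits every slot exactly once before returning to zero.

```go
package main

import "fmt"

// tableStep is the stride used while spreading symbols. Being odd, it is
// coprime with the power-of-two table size, so the walk below is a full
// permutation of the table slots.
func tableStep(tableSize uint32) uint32 {
	return (tableSize >> 1) + (tableSize >> 3) + 3
}

func main() {
	tableSize := uint32(1) << 6 // actualTableLog = 6
	tableMask := tableSize - 1
	step := tableStep(tableSize)

	seen := make([]bool, tableSize)
	pos := uint32(0)
	for i := uint32(0); i < tableSize; i++ {
		seen[pos] = true
		pos = (pos + step) & tableMask
	}
	all := true
	for _, v := range seen {
		all = all && v
	}
	// buildDtable relies on this: with consistent normalized counts the
	// spread ends with position == 0 and every cell filled exactly once.
	fmt.Println(pos == 0, all) // true true
}
```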
-func (c *cState) encode(symbolTT symbolTransform) { - nbBitsOut := (uint32(c.state) + symbolTT.deltaNbBits) >> 16 - dstState := int32(c.state>>(nbBitsOut&15)) + int32(symbolTT.deltaFindState) - c.bw.addBits16NC(c.state, uint8(nbBitsOut)) - c.state = c.stateTable[dstState] -} - // flush will write the tablelog to the output and flush the remaining full bytes. func (c *cState) flush(tableLog uint8) { c.bw.flush32() diff --git a/vendor/github.com/klauspost/compress/zstd/hash.go b/vendor/github.com/klauspost/compress/zstd/hash.go index cf33f29a1b48..5d73c21ebdd4 100644 --- a/vendor/github.com/klauspost/compress/zstd/hash.go +++ b/vendor/github.com/klauspost/compress/zstd/hash.go @@ -33,9 +33,3 @@ func hashLen(u uint64, length, mls uint8) uint32 { return (uint32(u) * prime4bytes) >> (32 - length) } } - -// hash3 returns the hash of the lower 3 bytes of u to fit in a hash table with h bits. -// Preferably h should be a constant and should always be <32. -func hash3(u uint32, h uint8) uint32 { - return ((u << (32 - 24)) * prime3bytes) >> ((32 - h) & 31) -} diff --git a/vendor/github.com/klauspost/compress/zstd/history.go b/vendor/github.com/klauspost/compress/zstd/history.go index f783e32d251b..09164856d222 100644 --- a/vendor/github.com/klauspost/compress/zstd/history.go +++ b/vendor/github.com/klauspost/compress/zstd/history.go @@ -10,40 +10,48 @@ import ( // history contains the information transferred between blocks. type history struct { - b []byte - huffTree *huff0.Scratch - recentOffsets [3]int + // Literal decompression + huffTree *huff0.Scratch + + // Sequence decompression decoders sequenceDecs - windowSize int - maxSize int - error bool - dict *dict + recentOffsets [3]int + + // History buffer... + b []byte + + // ignoreBuffer is meant to ignore a number of bytes + // when checking for matches in history + ignoreBuffer int + + windowSize int + allocFrameBuffer int // needed? + error bool + dict *dict } // reset will reset the history to initial state of a frame. // The history must already have been initialized to the desired size. func (h *history) reset() { h.b = h.b[:0] + h.ignoreBuffer = 0 h.error = false h.recentOffsets = [3]int{1, 4, 8} - if f := h.decoders.litLengths.fse; f != nil && !f.preDefined { - fseDecoderPool.Put(f) - } - if f := h.decoders.offsets.fse; f != nil && !f.preDefined { - fseDecoderPool.Put(f) - } - if f := h.decoders.matchLengths.fse; f != nil && !f.preDefined { - fseDecoderPool.Put(f) - } - h.decoders = sequenceDecs{} + h.decoders.freeDecoders() + h.decoders = sequenceDecs{br: h.decoders.br} + h.freeHuffDecoder() + h.huffTree = nil + h.dict = nil + //printf("history created: %+v (l: %d, c: %d)", *h, len(h.b), cap(h.b)) +} + +func (h *history) freeHuffDecoder() { if h.huffTree != nil { if h.dict == nil || h.dict.litEnc != h.huffTree { huffDecoderPool.Put(h.huffTree) + h.huffTree = nil } } - h.huffTree = nil - h.dict = nil - //printf("history created: %+v (l: %d, c: %d)", *h, len(h.b), cap(h.b)) } func (h *history) setDict(dict *dict) { @@ -54,6 +62,7 @@ func (h *history) setDict(dict *dict) { h.decoders.litLengths = dict.llDec h.decoders.offsets = dict.ofDec h.decoders.matchLengths = dict.mlDec + h.decoders.dict = dict.content h.recentOffsets = dict.offsets h.huffTree = dict.litEnc } @@ -83,6 +92,24 @@ func (h *history) append(b []byte) { copy(h.b[h.windowSize-len(b):], b) } +// ensureBlock will ensure there is space for at least one block... 
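`history.append`, whose tail is visible as context above, guarantees that at most `windowSize` bytes of back-reference history are kept. A simplified sketch of that contract using a hypothetical `appendWindow` helper; the real method shifts bytes in place to avoid reallocating:

```go
package main

import "fmt"

// appendWindow keeps only the most recent windowSize bytes, the contract
// history.append provides for the zstd back-reference window (sketch only).
func appendWindow(hist, b []byte, windowSize int) []byte {
	if len(b) >= windowSize {
		// The new block alone fills the window; keep only its tail.
		return append(hist[:0], b[len(b)-windowSize:]...)
	}
	hist = append(hist, b...)
	if len(hist) > windowSize {
		// Discard the oldest bytes so at most windowSize remain.
		hist = append(hist[:0], hist[len(hist)-windowSize:]...)
	}
	return hist
}

func main() {
	var hist []byte
	hist = appendWindow(hist, []byte("abcdef"), 4)
	fmt.Println(string(hist)) // "cdef"
	hist = appendWindow(hist, []byte("gh"), 4)
	fmt.Println(string(hist)) // "efgh"
}
```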
+func (h *history) ensureBlock() { + if cap(h.b) < h.allocFrameBuffer { + h.b = make([]byte, 0, h.allocFrameBuffer) + return + } + + avail := cap(h.b) - len(h.b) + if avail >= h.windowSize || avail > maxCompressedBlockSize { + return + } + // Move data down so we only have window size left. + // We know we have less than window size in b at this point. + discard := len(h.b) - h.windowSize + copy(h.b, h.b[discard:]) + h.b = h.b[:h.windowSize] +} + // append bytes to history without ever discarding anything. func (h *history) appendKeep(b []byte) { h.b = append(h.b, b...) diff --git a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_amd64.s b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_amd64.s index be8db5bf7960..cea178561970 100644 --- a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_amd64.s +++ b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_amd64.s @@ -1,6 +1,7 @@ // +build !appengine // +build gc // +build !purego +// +build !noasm #include "textflag.h" diff --git a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_arm64.s b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_arm64.s index 662609589079..4d64a17d69c1 100644 --- a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_arm64.s +++ b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_arm64.s @@ -1,13 +1,13 @@ -// +build gc,!purego +// +build gc,!purego,!noasm #include "textflag.h" // Register allocation. #define digest R1 -#define h R2 // Return value. -#define p R3 // Input pointer. +#define h R2 // Return value. +#define p R3 // Input pointer. #define len R4 -#define nblocks R5 // len / 32. +#define nblocks R5 // len / 32. #define prime1 R7 #define prime2 R8 #define prime3 R9 @@ -22,50 +22,48 @@ #define x3 R22 #define x4 R23 -#define round(acc, x) \ - MADD prime2, acc, x, acc \ - ROR $64-31, acc \ - MUL prime1, acc \ +#define round(acc, x) \ + MADD prime2, acc, x, acc \ + ROR $64-31, acc \ + MUL prime1, acc \ // x = round(0, x). -#define round0(x) \ - MUL prime2, x \ - ROR $64-31, x \ - MUL prime1, x \ +#define round0(x) \ + MUL prime2, x \ + ROR $64-31, x \ + MUL prime1, x \ -#define mergeRound(x) \ - round0(x) \ - EOR x, h \ - MADD h, prime4, prime1, h \ +#define mergeRound(x) \ + round0(x) \ + EOR x, h \ + MADD h, prime4, prime1, h \ // Update v[1-4] with 32-byte blocks. Assumes len >= 32. -#define blocksLoop() \ - LSR $5, len, nblocks \ - PCALIGN $16 \ -loop: \ - LDP.P 32(p), (x1, x2) \ - round(v1, x1) \ - LDP -16(p), (x3, x4) \ - round(v2, x2) \ - SUB $1, nblocks \ - round(v3, x3) \ - round(v4, x4) \ - CBNZ nblocks, loop \ - +#define blocksLoop() \ + LSR $5, len, nblocks \ + PCALIGN $16 \ + loop: \ + LDP.P 32(p), (x1, x2) \ + round(v1, x1) \ + LDP -16(p), (x3, x4) \ + round(v2, x2) \ + SUB $1, nblocks \ + round(v3, x3) \ + round(v4, x4) \ + CBNZ nblocks, loop \ // The primes are repeated here to ensure that they're stored // in a contiguous array, so we can load them with LDP. 
-DATA primes<> +0(SB)/8, $11400714785074694791 -DATA primes<> +8(SB)/8, $14029467366897019727 -DATA primes<>+16(SB)/8, $1609587929392839161 -DATA primes<>+24(SB)/8, $9650029242287828579 -DATA primes<>+32(SB)/8, $2870177450012600261 +DATA primes<> +0(SB)/8, $11400714785074694791 +DATA primes<> +8(SB)/8, $14029467366897019727 +DATA primes<>+16(SB)/8, $1609587929392839161 +DATA primes<>+24(SB)/8, $9650029242287828579 +DATA primes<>+32(SB)/8, $2870177450012600261 GLOBL primes<>(SB), NOPTR+RODATA, $40 - // func Sum64(b []byte) uint64 TEXT ·Sum64(SB), NOFRAME+NOSPLIT, $0-32 - LDP b_base+0(FP), (p, len) + LDP b_base+0(FP), (p, len) LDP primes<> +0(SB), (prime1, prime2) LDP primes<>+16(SB), (prime3, prime4) @@ -156,24 +154,23 @@ try1: end: EOR h >> 33, h - MUL prime2, h + MUL prime2, h EOR h >> 29, h - MUL prime3, h + MUL prime3, h EOR h >> 32, h MOVD h, ret+24(FP) RET - // func writeBlocks(d *Digest, b []byte) int // // Assumes len(b) >= 32. TEXT ·writeBlocks(SB), NOFRAME+NOSPLIT, $0-40 - LDP primes<>(SB), (prime1, prime2) + LDP primes<>(SB), (prime1, prime2) // Load state. Assume v[1-4] are stored contiguously. MOVD d+0(FP), digest - LDP 0(digest), (v1, v2) + LDP 0(digest), (v1, v2) LDP 16(digest), (v3, v4) LDP b_base+8(FP), (p, len) @@ -181,7 +178,7 @@ TEXT ·writeBlocks(SB), NOFRAME+NOSPLIT, $0-40 blocksLoop() // Store updated state. - STP (v1, v2), 0(digest) + STP (v1, v2), 0(digest) STP (v3, v4), 16(digest) BIC $31, len diff --git a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_asm.go b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_asm.go index 9216e0a40c1a..1a1fac9c2613 100644 --- a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_asm.go +++ b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_asm.go @@ -1,8 +1,9 @@ -//go:build (amd64 || arm64) && !appengine && gc && !purego +//go:build (amd64 || arm64) && !appengine && gc && !purego && !noasm // +build amd64 arm64 // +build !appengine // +build gc // +build !purego +// +build !noasm package xxhash diff --git a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_other.go b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_other.go index 2deb1ca75532..209cb4a999c3 100644 --- a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_other.go +++ b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_other.go @@ -1,5 +1,5 @@ -//go:build (!amd64 && !arm64) || appengine || !gc || purego -// +build !amd64,!arm64 appengine !gc purego +//go:build (!amd64 && !arm64) || appengine || !gc || purego || noasm +// +build !amd64,!arm64 appengine !gc purego noasm package xxhash diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec.go b/vendor/github.com/klauspost/compress/zstd/seqdec.go index bc731e4cb69a..f833d1541f98 100644 --- a/vendor/github.com/klauspost/compress/zstd/seqdec.go +++ b/vendor/github.com/klauspost/compress/zstd/seqdec.go @@ -20,6 +20,10 @@ type seq struct { llCode, mlCode, ofCode uint8 } +type seqVals struct { + ll, ml, mo int +} + func (s seq) String() string { if s.offset <= 3 { if s.offset == 0 { @@ -61,16 +65,19 @@ type sequenceDecs struct { offsets sequenceDec matchLengths sequenceDec prevOffset [3]int - hist []byte dict []byte literals []byte out []byte + nSeqs int + br *bitReader + seqSize int windowSize int maxBits uint8 + maxSyncLen uint64 } // initialize all 3 decoders from the stream input. 
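The new `seqVals` struct splits decoding into two phases: `decode` materializes every sequence as `{ll, ml, mo}`, and `execute` later replays them against the output buffer. A sketch of the execute side for the simplest case, a match entirely inside the current block with no history or dictionary; `executeOne` is a hypothetical helper, not a function from the package:

```go
package main

import "fmt"

// seqVals mirrors the struct added to seqdec.go: a fully decoded sequence.
type seqVals struct {
	ll, ml, mo int // literal length, match length, match offset
}

// executeOne applies a single sequence the way executeSimple does when the
// match lies entirely inside the current output.
func executeOne(out, literals []byte, s seqVals) ([]byte, []byte) {
	out = append(out, literals[:s.ll]...) // literals come first
	literals = literals[s.ll:]
	start := len(out) - s.mo
	for i := 0; i < s.ml; i++ { // byte-by-byte copy handles overlapping matches
		out = append(out, out[start+i])
	}
	return out, literals
}

func main() {
	out, lit := []byte(nil), []byte("abc!")
	// Literals "abc", then a 6-byte match at offset 3 repeats them twice over.
	out, lit = executeOne(out, lit, seqVals{ll: 3, ml: 6, mo: 3})
	out = append(out, lit...) // final literals
	fmt.Println(string(out))  // "abcabcabc!"
}
```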
-func (s *sequenceDecs) initialize(br *bitReader, hist *history, literals, out []byte) error { +func (s *sequenceDecs) initialize(br *bitReader, hist *history, out []byte) error { if err := s.litLengths.init(br); err != nil { return errors.New("litLengths:" + err.Error()) } @@ -80,8 +87,7 @@ func (s *sequenceDecs) initialize(br *bitReader, hist *history, literals, out [] if err := s.matchLengths.init(br); err != nil { return errors.New("matchLengths:" + err.Error()) } - s.literals = literals - s.hist = hist.b + s.br = br s.prevOffset = hist.recentOffsets s.maxBits = s.litLengths.fse.maxBits + s.offsets.fse.maxBits + s.matchLengths.fse.maxBits s.windowSize = hist.windowSize @@ -93,12 +99,142 @@ func (s *sequenceDecs) initialize(br *bitReader, hist *history, literals, out [] return nil } +func (s *sequenceDecs) freeDecoders() { + if f := s.litLengths.fse; f != nil && !f.preDefined { + fseDecoderPool.Put(f) + s.litLengths.fse = nil + } + if f := s.offsets.fse; f != nil && !f.preDefined { + fseDecoderPool.Put(f) + s.offsets.fse = nil + } + if f := s.matchLengths.fse; f != nil && !f.preDefined { + fseDecoderPool.Put(f) + s.matchLengths.fse = nil + } +} + +// execute will execute the decoded sequence with the provided history. +// The sequence must be evaluated before being sent. +func (s *sequenceDecs) execute(seqs []seqVals, hist []byte) error { + if len(s.dict) == 0 { + return s.executeSimple(seqs, hist) + } + + // Ensure we have enough output size... + if len(s.out)+s.seqSize > cap(s.out) { + addBytes := s.seqSize + len(s.out) + s.out = append(s.out, make([]byte, addBytes)...) + s.out = s.out[:len(s.out)-addBytes] + } + + if debugDecoder { + printf("Execute %d seqs with hist %d, dict %d, literals: %d into %d bytes\n", len(seqs), len(hist), len(s.dict), len(s.literals), s.seqSize) + } + + var t = len(s.out) + out := s.out[:t+s.seqSize] + + for _, seq := range seqs { + // Add literals + copy(out[t:], s.literals[:seq.ll]) + t += seq.ll + s.literals = s.literals[seq.ll:] + + // Copy from dictionary... + if seq.mo > t+len(hist) || seq.mo > s.windowSize { + if len(s.dict) == 0 { + return fmt.Errorf("match offset (%d) bigger than current history (%d)", seq.mo, t+len(hist)) + } + + // we may be in dictionary. + dictO := len(s.dict) - (seq.mo - (t + len(hist))) + if dictO < 0 || dictO >= len(s.dict) { + return fmt.Errorf("match offset (%d) bigger than current history+dict (%d)", seq.mo, t+len(hist)+len(s.dict)) + } + end := dictO + seq.ml + if end > len(s.dict) { + n := len(s.dict) - dictO + copy(out[t:], s.dict[dictO:]) + t += n + seq.ml -= n + } else { + copy(out[t:], s.dict[dictO:end]) + t += end - dictO + continue + } + } + + // Copy from history. + if v := seq.mo - t; v > 0 { + // v is the start position in history from end. + start := len(hist) - v + if seq.ml > v { + // Some goes into current block. + // Copy remainder of history + copy(out[t:], hist[start:]) + t += v + seq.ml -= v + } else { + copy(out[t:], hist[start:start+seq.ml]) + t += seq.ml + continue + } + } + // We must be in current buffer now + if seq.ml > 0 { + start := t - seq.mo + if seq.ml <= t-start { + // No overlap + copy(out[t:], out[start:start+seq.ml]) + t += seq.ml + continue + } else { + // Overlapping copy + // Extend destination slice and copy one byte at the time. + src := out[start : start+seq.ml] + dst := out[t:] + dst = dst[:len(src)] + t += len(src) + // Destination is the space we just added. 
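When a match offset reaches beyond everything produced so far, `execute` resolves the head of the match from the dictionary, as in the code above. A sketch of just that offset arithmetic; the `dictMatch` helper is hypothetical:

```go
package main

import (
	"errors"
	"fmt"
)

// dictMatch resolves the part of a match that reaches behind the history into
// the dictionary, mirroring the offset arithmetic in sequenceDecs.execute.
// produced = bytes already written (current block plus history).
func dictMatch(dict []byte, produced, mo, ml int) (fromDict []byte, remaining int, err error) {
	// The offset counts back from the end of produced data; whatever lies
	// beyond it must come from the dictionary.
	dictO := len(dict) - (mo - produced)
	if dictO < 0 || dictO >= len(dict) {
		return nil, 0, errors.New("match offset bigger than history+dict")
	}
	end := dictO + ml
	if end > len(dict) {
		// Match starts in the dictionary and continues into produced data.
		return dict[dictO:], ml - (len(dict) - dictO), nil
	}
	return dict[dictO:end], 0, nil
}

func main() {
	dict := []byte("0123456789")
	// 5 bytes already produced, offset 8 reaches 3 bytes into the dictionary.
	fromDict, rem, err := dictMatch(dict, 5, 8, 6)
	fmt.Println(string(fromDict), rem, err) // "789" 3 <nil>
}
```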
+ for i := range src { + dst[i] = src[i] + } + } + } + } + + // Add final literals + copy(out[t:], s.literals) + if debugDecoder { + t += len(s.literals) + if t != len(out) { + panic(fmt.Errorf("length mismatch, want %d, got %d, ss: %d", len(out), t, s.seqSize)) + } + } + s.out = out + + return nil +} + // decode sequences from the stream with the provided history. -func (s *sequenceDecs) decode(seqs int, br *bitReader, hist []byte) error { +func (s *sequenceDecs) decodeSync(hist []byte) error { + supported, err := s.decodeSyncSimple(hist) + if supported { + return err + } + + br := s.br + seqs := s.nSeqs startSize := len(s.out) // Grab full sizes tables, to avoid bounds checks. llTable, mlTable, ofTable := s.litLengths.fse.dt[:maxTablesize], s.matchLengths.fse.dt[:maxTablesize], s.offsets.fse.dt[:maxTablesize] llState, mlState, ofState := s.litLengths.state.state, s.matchLengths.state.state, s.offsets.state.state + out := s.out + maxBlockSize := maxCompressedBlockSize + if s.windowSize < maxBlockSize { + maxBlockSize = s.windowSize + } for i := seqs - 1; i >= 0; i-- { if br.overread() { @@ -151,7 +287,7 @@ func (s *sequenceDecs) decode(seqs int, br *bitReader, hist []byte) error { if temp == 0 { // 0 is not valid; input is corrupted; force offset to 1 - println("temp was 0") + println("WARNING: temp was 0") temp = 1 } @@ -176,51 +312,52 @@ func (s *sequenceDecs) decode(seqs int, br *bitReader, hist []byte) error { if ll > len(s.literals) { return fmt.Errorf("unexpected literal count, want %d bytes, but only %d is available", ll, len(s.literals)) } - size := ll + ml + len(s.out) + size := ll + ml + len(out) if size-startSize > maxBlockSize { - return fmt.Errorf("output (%d) bigger than max block size", size) + if size-startSize == 424242 { + panic("here") + } + return fmt.Errorf("output bigger than max block size (%d)", maxBlockSize) } - if size > cap(s.out) { + if size > cap(out) { // Not enough size, which can happen under high volume block streaming conditions // but could be if destination slice is too small for sync operations. // over-allocating here can create a large amount of GC pressure so we try to keep // it as contained as possible - used := len(s.out) - startSize + used := len(out) - startSize addBytes := 256 + ll + ml + used>>2 // Clamp to max block size. if used+addBytes > maxBlockSize { addBytes = maxBlockSize - used } - s.out = append(s.out, make([]byte, addBytes)...) - s.out = s.out[:len(s.out)-addBytes] + out = append(out, make([]byte, addBytes)...) + out = out[:len(out)-addBytes] } if ml > maxMatchLen { return fmt.Errorf("match len (%d) bigger than max allowed length", ml) } // Add literals - s.out = append(s.out, s.literals[:ll]...) + out = append(out, s.literals[:ll]...) s.literals = s.literals[ll:] - out := s.out if mo == 0 && ml > 0 { return fmt.Errorf("zero matchoff and matchlen (%d) > 0", ml) } - if mo > len(s.out)+len(hist) || mo > s.windowSize { + if mo > len(out)+len(hist) || mo > s.windowSize { if len(s.dict) == 0 { - return fmt.Errorf("match offset (%d) bigger than current history (%d)", mo, len(s.out)+len(hist)) + return fmt.Errorf("match offset (%d) bigger than current history (%d)", mo, len(out)+len(hist)-startSize) } // we may be in dictionary. 
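`decodeSync` grows the output lazily instead of reserving a whole block up front, to keep GC pressure low while streaming. A sketch of the growth heuristic used above, wrapped in a hypothetical `growOutput` helper with the constants copied from the diff:

```go
package main

import "fmt"

// growOutput over-allocates a little: 256 bytes plus the pending sequence plus
// a quarter of what the block has already produced, clamped so the block never
// grows past the maximum block size.
func growOutput(out []byte, startSize, ll, ml, maxBlockSize int) []byte {
	used := len(out) - startSize
	addBytes := 256 + ll + ml + used>>2
	if used+addBytes > maxBlockSize {
		addBytes = maxBlockSize - used
	}
	out = append(out, make([]byte, addBytes)...)
	return out[:len(out)-addBytes] // length unchanged, capacity grown
}

func main() {
	out := make([]byte, 1000)
	out = growOutput(out, 0, 40, 60, 128<<10)
	fmt.Println(len(out), cap(out) >= 1606) // 1000 true
}
```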
- dictO := len(s.dict) - (mo - (len(s.out) + len(hist))) + dictO := len(s.dict) - (mo - (len(out) + len(hist))) if dictO < 0 || dictO >= len(s.dict) { - return fmt.Errorf("match offset (%d) bigger than current history (%d)", mo, len(s.out)+len(hist)) + return fmt.Errorf("match offset (%d) bigger than current history (%d)", mo, len(out)+len(hist)-startSize) } end := dictO + ml if end > len(s.dict) { out = append(out, s.dict[dictO:]...) - mo -= len(s.dict) - dictO ml -= len(s.dict) - dictO } else { out = append(out, s.dict[dictO:end]...) @@ -231,26 +368,25 @@ func (s *sequenceDecs) decode(seqs int, br *bitReader, hist []byte) error { // Copy from history. // TODO: Blocks without history could be made to ignore this completely. - if v := mo - len(s.out); v > 0 { + if v := mo - len(out); v > 0 { // v is the start position in history from end. - start := len(s.hist) - v + start := len(hist) - v if ml > v { // Some goes into current block. // Copy remainder of history - out = append(out, s.hist[start:]...) - mo -= v + out = append(out, hist[start:]...) ml -= v } else { - out = append(out, s.hist[start:start+ml]...) + out = append(out, hist[start:start+ml]...) ml = 0 } } // We must be in current buffer now if ml > 0 { - start := len(s.out) - mo - if ml <= len(s.out)-start { + start := len(out) - mo + if ml <= len(out)-start { // No overlap - out = append(out, s.out[start:start+ml]...) + out = append(out, out[start:start+ml]...) } else { // Overlapping copy // Extend destination slice and copy one byte at the time. @@ -264,7 +400,6 @@ func (s *sequenceDecs) decode(seqs int, br *bitReader, hist []byte) error { } } } - s.out = out if i == 0 { // This is the last sequence, so we shouldn't update state. break @@ -279,6 +414,7 @@ func (s *sequenceDecs) decode(seqs int, br *bitReader, hist []byte) error { ofState = ofTable[ofState.newState()&maxTableMask] } else { bits := br.get32BitsFast(nBits) + lowBits := uint16(bits >> ((ofState.nbBits() + mlState.nbBits()) & 31)) llState = llTable[(llState.newState()+lowBits)&maxTableMask] @@ -291,19 +427,14 @@ func (s *sequenceDecs) decode(seqs int, br *bitReader, hist []byte) error { } } - // Add final literals - s.out = append(s.out, s.literals...) - return nil -} + // Check if space for literals + if size := len(s.literals) + len(s.out) - startSize; size > maxBlockSize { + return fmt.Errorf("output bigger than max block size (%d)", maxBlockSize) + } -// update states, at least 27 bits must be available. -func (s *sequenceDecs) update(br *bitReader) { - // Max 8 bits - s.litLengths.state.next(br) - // Max 9 bits - s.matchLengths.state.next(br) - // Max 8 bits - s.offsets.state.next(br) + // Add final literals + s.out = append(out, s.literals...) + return br.close() } var bitMask [16]uint16 @@ -314,87 +445,6 @@ func init() { } } -// update states, at least 27 bits must be available. -func (s *sequenceDecs) updateAlt(br *bitReader) { - // Update all 3 states at once. Approx 20% faster. 
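A match whose offset reaches behind the current block is served partly from the history buffer, as the code above does. A sketch of that split using a hypothetical `historyMatch` helper:

```go
package main

import "fmt"

// historyMatch copies the part of a match that starts in the previous blocks'
// history, returning those bytes and the match length still to be copied from
// the current block.
func historyMatch(hist, out []byte, mo, ml int) ([]byte, int) {
	v := mo - len(out) // how far the offset reaches back into history
	if v <= 0 {
		return nil, ml // match is entirely within the current output
	}
	start := len(hist) - v
	if ml > v {
		// Match begins in history and continues into the current block.
		return hist[start:], ml - v
	}
	return hist[start : start+ml], 0
}

func main() {
	hist := []byte("HISTORY!")
	out := []byte("abc")
	part, rem := historyMatch(hist, out, 5, 4)
	fmt.Println(string(part), rem) // "Y!" 2
}
```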
- a, b, c := s.litLengths.state.state, s.matchLengths.state.state, s.offsets.state.state - - nBits := a.nbBits() + b.nbBits() + c.nbBits() - if nBits == 0 { - s.litLengths.state.state = s.litLengths.state.dt[a.newState()] - s.matchLengths.state.state = s.matchLengths.state.dt[b.newState()] - s.offsets.state.state = s.offsets.state.dt[c.newState()] - return - } - bits := br.get32BitsFast(nBits) - lowBits := uint16(bits >> ((c.nbBits() + b.nbBits()) & 31)) - s.litLengths.state.state = s.litLengths.state.dt[a.newState()+lowBits] - - lowBits = uint16(bits >> (c.nbBits() & 31)) - lowBits &= bitMask[b.nbBits()&15] - s.matchLengths.state.state = s.matchLengths.state.dt[b.newState()+lowBits] - - lowBits = uint16(bits) & bitMask[c.nbBits()&15] - s.offsets.state.state = s.offsets.state.dt[c.newState()+lowBits] -} - -// nextFast will return new states when there are at least 4 unused bytes left on the stream when done. -func (s *sequenceDecs) nextFast(br *bitReader, llState, mlState, ofState decSymbol) (ll, mo, ml int) { - // Final will not read from stream. - ll, llB := llState.final() - ml, mlB := mlState.final() - mo, moB := ofState.final() - - // extra bits are stored in reverse order. - br.fillFast() - mo += br.getBits(moB) - if s.maxBits > 32 { - br.fillFast() - } - ml += br.getBits(mlB) - ll += br.getBits(llB) - - if moB > 1 { - s.prevOffset[2] = s.prevOffset[1] - s.prevOffset[1] = s.prevOffset[0] - s.prevOffset[0] = mo - return - } - // mo = s.adjustOffset(mo, ll, moB) - // Inlined for rather big speedup - if ll == 0 { - // There is an exception though, when current sequence's literals_length = 0. - // In this case, repeated offsets are shifted by one, so an offset_value of 1 means Repeated_Offset2, - // an offset_value of 2 means Repeated_Offset3, and an offset_value of 3 means Repeated_Offset1 - 1_byte. - mo++ - } - - if mo == 0 { - mo = s.prevOffset[0] - return - } - var temp int - if mo == 3 { - temp = s.prevOffset[0] - 1 - } else { - temp = s.prevOffset[mo] - } - - if temp == 0 { - // 0 is not valid; input is corrupted; force offset to 1 - println("temp was 0") - temp = 1 - } - - if mo != 1 { - s.prevOffset[2] = s.prevOffset[1] - } - s.prevOffset[1] = s.prevOffset[0] - s.prevOffset[0] = temp - mo = temp - return -} - func (s *sequenceDecs) next(br *bitReader, llState, mlState, ofState decSymbol) (ll, mo, ml int) { // Final will not read from stream. ll, llB := llState.final() @@ -457,36 +507,3 @@ func (s *sequenceDecs) adjustOffset(offset, litLen int, offsetB uint8) int { s.prevOffset[0] = temp return temp } - -// mergeHistory will merge history. 
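The removed `updateAlt` refreshed all three FSE states from a single bit-reader read, splitting the value so the offset bits sit lowest, then match length, then literal length; the new assembly follows the same layout. A sketch of that split with a hypothetical `splitStateBits` helper (the original masks the literal-length part implicitly, since only `nBits` bits were read):

```go
package main

import "fmt"

// splitStateBits carves one read of llBits+mlBits+ofBits bits into the low
// bits for the three FSE state updates.
func splitStateBits(bits uint32, llBits, mlBits, ofBits uint8) (ll, ml, of uint16) {
	mask := func(n uint8) uint16 { return uint16(1)<<n - 1 }
	ll = uint16(bits>>(ofBits+mlBits)) & mask(llBits)
	ml = uint16(bits>>ofBits) & mask(mlBits)
	of = uint16(bits) & mask(ofBits)
	return
}

func main() {
	// 2 literal-length bits = 0b10, 3 match-length bits = 0b101, 4 offset bits = 0b0111.
	bits := uint32(0b10_101_0111)
	fmt.Println(splitStateBits(bits, 2, 3, 4)) // 2 5 7
}
```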
-func (s *sequenceDecs) mergeHistory(hist *sequenceDecs) (*sequenceDecs, error) { - for i := uint(0); i < 3; i++ { - var sNew, sHist *sequenceDec - switch i { - default: - // same as "case 0": - sNew = &s.litLengths - sHist = &hist.litLengths - case 1: - sNew = &s.offsets - sHist = &hist.offsets - case 2: - sNew = &s.matchLengths - sHist = &hist.matchLengths - } - if sNew.repeat { - if sHist.fse == nil { - return nil, fmt.Errorf("sequence stream %d, repeat requested, but no history", i) - } - continue - } - if sNew.fse == nil { - return nil, fmt.Errorf("sequence stream %d, no fse found", i) - } - if sHist.fse != nil && !sHist.fse.preDefined { - fseDecoderPool.Put(sHist.fse) - } - sHist.fse = sNew.fse - } - return hist, nil -} diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.go b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.go new file mode 100644 index 000000000000..191384adfd06 --- /dev/null +++ b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.go @@ -0,0 +1,379 @@ +//go:build amd64 && !appengine && !noasm && gc +// +build amd64,!appengine,!noasm,gc + +package zstd + +import ( + "fmt" + + "github.com/klauspost/compress/internal/cpuinfo" +) + +type decodeSyncAsmContext struct { + llTable []decSymbol + mlTable []decSymbol + ofTable []decSymbol + llState uint64 + mlState uint64 + ofState uint64 + iteration int + litRemain int + out []byte + outPosition int + literals []byte + litPosition int + history []byte + windowSize int + ll int // set on error (not for all errors, please refer to _generate/gen.go) + ml int // set on error (not for all errors, please refer to _generate/gen.go) + mo int // set on error (not for all errors, please refer to _generate/gen.go) +} + +// sequenceDecs_decodeSync_amd64 implements the main loop of sequenceDecs.decodeSync in x86 asm. +// +// Please refer to seqdec_generic.go for the reference implementation. +// +//go:noescape +func sequenceDecs_decodeSync_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int + +// sequenceDecs_decodeSync_bmi2 implements the main loop of sequenceDecs.decodeSync in x86 asm with BMI2 extensions. +// +//go:noescape +func sequenceDecs_decodeSync_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int + +// sequenceDecs_decodeSync_safe_amd64 does the same as above, but does not write more than output buffer. +// +//go:noescape +func sequenceDecs_decodeSync_safe_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int + +// sequenceDecs_decodeSync_safe_bmi2 does the same as above, but does not write more than output buffer. +// +//go:noescape +func sequenceDecs_decodeSync_safe_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int + +// decode sequences from the stream with the provided history but without a dictionary. +func (s *sequenceDecs) decodeSyncSimple(hist []byte) (bool, error) { + if len(s.dict) > 0 { + return false, nil + } + if s.maxSyncLen == 0 && cap(s.out)-len(s.out) < maxCompressedBlockSize { + return false, nil + } + + // FIXME: Using unsafe memory copies leads to rare, random crashes + // with fuzz testing. It is therefore disabled for now. 
+ const useSafe = true + /* + useSafe := false + if s.maxSyncLen == 0 && cap(s.out)-len(s.out) < maxCompressedBlockSizeAlloc { + useSafe = true + } + if s.maxSyncLen > 0 && cap(s.out)-len(s.out)-compressedBlockOverAlloc < int(s.maxSyncLen) { + useSafe = true + } + if cap(s.literals) < len(s.literals)+compressedBlockOverAlloc { + useSafe = true + } + */ + + br := s.br + + maxBlockSize := maxCompressedBlockSize + if s.windowSize < maxBlockSize { + maxBlockSize = s.windowSize + } + + ctx := decodeSyncAsmContext{ + llTable: s.litLengths.fse.dt[:maxTablesize], + mlTable: s.matchLengths.fse.dt[:maxTablesize], + ofTable: s.offsets.fse.dt[:maxTablesize], + llState: uint64(s.litLengths.state.state), + mlState: uint64(s.matchLengths.state.state), + ofState: uint64(s.offsets.state.state), + iteration: s.nSeqs - 1, + litRemain: len(s.literals), + out: s.out, + outPosition: len(s.out), + literals: s.literals, + windowSize: s.windowSize, + history: hist, + } + + s.seqSize = 0 + startSize := len(s.out) + + var errCode int + if cpuinfo.HasBMI2() { + if useSafe { + errCode = sequenceDecs_decodeSync_safe_bmi2(s, br, &ctx) + } else { + errCode = sequenceDecs_decodeSync_bmi2(s, br, &ctx) + } + } else { + if useSafe { + errCode = sequenceDecs_decodeSync_safe_amd64(s, br, &ctx) + } else { + errCode = sequenceDecs_decodeSync_amd64(s, br, &ctx) + } + } + switch errCode { + case noError: + break + + case errorMatchLenOfsMismatch: + return true, fmt.Errorf("zero matchoff and matchlen (%d) > 0", ctx.ml) + + case errorMatchLenTooBig: + return true, fmt.Errorf("match len (%d) bigger than max allowed length", ctx.ml) + + case errorMatchOffTooBig: + return true, fmt.Errorf("match offset (%d) bigger than current history (%d)", + ctx.mo, ctx.outPosition+len(hist)-startSize) + + case errorNotEnoughLiterals: + return true, fmt.Errorf("unexpected literal count, want %d bytes, but only %d is available", + ctx.ll, ctx.litRemain+ctx.ll) + + case errorNotEnoughSpace: + size := ctx.outPosition + ctx.ll + ctx.ml + if debugDecoder { + println("msl:", s.maxSyncLen, "cap", cap(s.out), "bef:", startSize, "sz:", size-startSize, "mbs:", maxBlockSize, "outsz:", cap(s.out)-startSize) + } + return true, fmt.Errorf("output bigger than max block size (%d)", maxBlockSize) + + default: + return true, fmt.Errorf("sequenceDecs_decode returned erronous code %d", errCode) + } + + s.seqSize += ctx.litRemain + if s.seqSize > maxBlockSize { + return true, fmt.Errorf("output bigger than max block size (%d)", maxBlockSize) + + } + err := br.close() + if err != nil { + printf("Closing sequences: %v, %+v\n", err, *br) + return true, err + } + + s.literals = s.literals[ctx.litPosition:] + t := ctx.outPosition + s.out = s.out[:t] + + // Add final literals + s.out = append(s.out, s.literals...) 
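`decodeSyncSimple` picks one of four assembly entry points: BMI2 versus baseline, and a `safe` variant whose copies never run past the output buffer. A sketch of that dispatch shape only; the function values and the `hasBMI2`/`useSafe` flags below are stand-ins, not the package's symbols:

```go
package main

import "fmt"

type decodeFn func() int

// pickDecoder mirrors the two-axis selection done before calling into asm.
func pickDecoder(hasBMI2, useSafe bool, safeBMI2, bmi2, safeAMD64, amd64 decodeFn) decodeFn {
	if hasBMI2 {
		if useSafe {
			return safeBMI2
		}
		return bmi2
	}
	if useSafe {
		return safeAMD64
	}
	return amd64
}

func main() {
	name := func(s string) decodeFn { return func() int { fmt.Println("calling", s); return 0 } }
	fn := pickDecoder(true, true,
		name("decodeSync_safe_bmi2"), name("decodeSync_bmi2"),
		name("decodeSync_safe_amd64"), name("decodeSync_amd64"))
	fn() // calling decodeSync_safe_bmi2
}
```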
+ if debugDecoder { + t += len(s.literals) + if t != len(s.out) { + panic(fmt.Errorf("length mismatch, want %d, got %d", len(s.out), t)) + } + } + + return true, nil +} + +// -------------------------------------------------------------------------------- + +type decodeAsmContext struct { + llTable []decSymbol + mlTable []decSymbol + ofTable []decSymbol + llState uint64 + mlState uint64 + ofState uint64 + iteration int + seqs []seqVals + litRemain int +} + +const noError = 0 + +// error reported when mo == 0 && ml > 0 +const errorMatchLenOfsMismatch = 1 + +// error reported when ml > maxMatchLen +const errorMatchLenTooBig = 2 + +// error reported when mo > available history or mo > s.windowSize +const errorMatchOffTooBig = 3 + +// error reported when the sum of literal lengths exeeceds the literal buffer size +const errorNotEnoughLiterals = 4 + +// error reported when capacity of `out` is too small +const errorNotEnoughSpace = 5 + +// sequenceDecs_decode implements the main loop of sequenceDecs in x86 asm. +// +// Please refer to seqdec_generic.go for the reference implementation. +// +//go:noescape +func sequenceDecs_decode_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int + +// sequenceDecs_decode implements the main loop of sequenceDecs in x86 asm. +// +// Please refer to seqdec_generic.go for the reference implementation. +// +//go:noescape +func sequenceDecs_decode_56_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int + +// sequenceDecs_decode implements the main loop of sequenceDecs in x86 asm with BMI2 extensions. +// +//go:noescape +func sequenceDecs_decode_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int + +// sequenceDecs_decode implements the main loop of sequenceDecs in x86 asm with BMI2 extensions. +// +//go:noescape +func sequenceDecs_decode_56_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int + +// decode sequences from the stream without the provided history. 
+func (s *sequenceDecs) decode(seqs []seqVals) error { + br := s.br + + maxBlockSize := maxCompressedBlockSize + if s.windowSize < maxBlockSize { + maxBlockSize = s.windowSize + } + + ctx := decodeAsmContext{ + llTable: s.litLengths.fse.dt[:maxTablesize], + mlTable: s.matchLengths.fse.dt[:maxTablesize], + ofTable: s.offsets.fse.dt[:maxTablesize], + llState: uint64(s.litLengths.state.state), + mlState: uint64(s.matchLengths.state.state), + ofState: uint64(s.offsets.state.state), + seqs: seqs, + iteration: len(seqs) - 1, + litRemain: len(s.literals), + } + + s.seqSize = 0 + lte56bits := s.maxBits+s.offsets.fse.actualTableLog+s.matchLengths.fse.actualTableLog+s.litLengths.fse.actualTableLog <= 56 + var errCode int + if cpuinfo.HasBMI2() { + if lte56bits { + errCode = sequenceDecs_decode_56_bmi2(s, br, &ctx) + } else { + errCode = sequenceDecs_decode_bmi2(s, br, &ctx) + } + } else { + if lte56bits { + errCode = sequenceDecs_decode_56_amd64(s, br, &ctx) + } else { + errCode = sequenceDecs_decode_amd64(s, br, &ctx) + } + } + if errCode != 0 { + i := len(seqs) - ctx.iteration - 1 + switch errCode { + case errorMatchLenOfsMismatch: + ml := ctx.seqs[i].ml + return fmt.Errorf("zero matchoff and matchlen (%d) > 0", ml) + + case errorMatchLenTooBig: + ml := ctx.seqs[i].ml + return fmt.Errorf("match len (%d) bigger than max allowed length", ml) + + case errorNotEnoughLiterals: + ll := ctx.seqs[i].ll + return fmt.Errorf("unexpected literal count, want %d bytes, but only %d is available", ll, ctx.litRemain+ll) + } + + return fmt.Errorf("sequenceDecs_decode_amd64 returned erronous code %d", errCode) + } + + if ctx.litRemain < 0 { + return fmt.Errorf("literal count is too big: total available %d, total requested %d", + len(s.literals), len(s.literals)-ctx.litRemain) + } + + s.seqSize += ctx.litRemain + if s.seqSize > maxBlockSize { + return fmt.Errorf("output bigger than max block size (%d)", maxBlockSize) + } + err := br.close() + if err != nil { + printf("Closing sequences: %v, %+v\n", err, *br) + } + return err +} + +// -------------------------------------------------------------------------------- + +type executeAsmContext struct { + seqs []seqVals + seqIndex int + out []byte + history []byte + literals []byte + outPosition int + litPosition int + windowSize int +} + +// sequenceDecs_executeSimple_amd64 implements the main loop of sequenceDecs.executeSimple in x86 asm. +// +// Returns false if a match offset is too big. +// +// Please refer to seqdec_generic.go for the reference implementation. +// +//go:noescape +func sequenceDecs_executeSimple_amd64(ctx *executeAsmContext) bool + +// Same as above, but with safe memcopies +// +//go:noescape +func sequenceDecs_executeSimple_safe_amd64(ctx *executeAsmContext) bool + +// executeSimple handles cases when dictionary is not used. +func (s *sequenceDecs) executeSimple(seqs []seqVals, hist []byte) error { + // Ensure we have enough output size... + if len(s.out)+s.seqSize+compressedBlockOverAlloc > cap(s.out) { + addBytes := s.seqSize + len(s.out) + compressedBlockOverAlloc + s.out = append(s.out, make([]byte, addBytes)...) 
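`decode` selects the `_56` assembly variants when a single bit-buffer refill is guaranteed to cover a whole sequence: the maximum extra value bits plus the three table logs must fit in 56 bits. A sketch of that check (hypothetical helper; the field names follow the diff above):

```go
package main

import "fmt"

// canUse56BitPath reproduces the lte56bits condition: if every sequence needs
// at most 56 bits for its extra values and the three state updates, one refill
// per sequence is enough and the _56 asm variants can be used.
func canUse56BitPath(maxBits, ofLog, mlLog, llLog uint8) bool {
	return int(maxBits)+int(ofLog)+int(mlLog)+int(llLog) <= 56
}

func main() {
	fmt.Println(canUse56BitPath(30, 8, 9, 9)) // 56 bits total: fast path allowed
	fmt.Println(canUse56BitPath(40, 9, 9, 9)) // 67 bits: falls back to the general variants
}
```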
+ s.out = s.out[:len(s.out)-addBytes] + } + + if debugDecoder { + printf("Execute %d seqs with literals: %d into %d bytes\n", len(seqs), len(s.literals), s.seqSize) + } + + var t = len(s.out) + out := s.out[:t+s.seqSize] + + ctx := executeAsmContext{ + seqs: seqs, + seqIndex: 0, + out: out, + history: hist, + outPosition: t, + litPosition: 0, + literals: s.literals, + windowSize: s.windowSize, + } + var ok bool + if cap(s.literals) < len(s.literals)+compressedBlockOverAlloc { + ok = sequenceDecs_executeSimple_safe_amd64(&ctx) + } else { + ok = sequenceDecs_executeSimple_amd64(&ctx) + } + if !ok { + return fmt.Errorf("match offset (%d) bigger than current history (%d)", + seqs[ctx.seqIndex].mo, ctx.outPosition+len(hist)) + } + s.literals = s.literals[ctx.litPosition:] + t = ctx.outPosition + + // Add final literals + copy(out[t:], s.literals) + if debugDecoder { + t += len(s.literals) + if t != len(out) { + panic(fmt.Errorf("length mismatch, want %d, got %d, ss: %d", len(out), t, s.seqSize)) + } + } + s.out = out + + return nil +} diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s new file mode 100644 index 000000000000..52e5703c26c4 --- /dev/null +++ b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s @@ -0,0 +1,4099 @@ +// Code generated by command: go run gen.go -out ../seqdec_amd64.s -pkg=zstd. DO NOT EDIT. + +//go:build !appengine && !noasm && gc && !noasm + +// func sequenceDecs_decode_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int +// Requires: CMOV +TEXT ·sequenceDecs_decode_amd64(SB), $8-32 + MOVQ br+8(FP), AX + MOVQ 32(AX), DX + MOVBQZX 40(AX), BX + MOVQ 24(AX), SI + MOVQ (AX), AX + ADDQ SI, AX + MOVQ AX, (SP) + MOVQ ctx+16(FP), AX + MOVQ 72(AX), DI + MOVQ 80(AX), R8 + MOVQ 88(AX), R9 + MOVQ 104(AX), R10 + MOVQ s+0(FP), AX + MOVQ 144(AX), R11 + MOVQ 152(AX), R12 + MOVQ 160(AX), R13 + +sequenceDecs_decode_amd64_main_loop: + MOVQ (SP), R14 + + // Fill bitreader to have enough for the offset and match length. 
+ CMPQ SI, $0x08 + JL sequenceDecs_decode_amd64_fill_byte_by_byte + MOVQ BX, AX + SHRQ $0x03, AX + SUBQ AX, R14 + MOVQ (R14), DX + SUBQ AX, SI + ANDQ $0x07, BX + JMP sequenceDecs_decode_amd64_fill_end + +sequenceDecs_decode_amd64_fill_byte_by_byte: + CMPQ SI, $0x00 + JLE sequenceDecs_decode_amd64_fill_end + CMPQ BX, $0x07 + JLE sequenceDecs_decode_amd64_fill_end + SHLQ $0x08, DX + SUBQ $0x01, R14 + SUBQ $0x01, SI + SUBQ $0x08, BX + MOVBQZX (R14), AX + ORQ AX, DX + JMP sequenceDecs_decode_amd64_fill_byte_by_byte + +sequenceDecs_decode_amd64_fill_end: + // Update offset + MOVQ R9, AX + MOVQ BX, CX + MOVQ DX, R15 + SHLQ CL, R15 + MOVB AH, CL + SHRQ $0x20, AX + TESTQ CX, CX + JZ sequenceDecs_decode_amd64_of_update_zero + ADDQ CX, BX + CMPQ BX, $0x40 + JA sequenceDecs_decode_amd64_of_update_zero + CMPQ CX, $0x40 + JAE sequenceDecs_decode_amd64_of_update_zero + NEGQ CX + SHRQ CL, R15 + ADDQ R15, AX + +sequenceDecs_decode_amd64_of_update_zero: + MOVQ AX, 16(R10) + + // Update match length + MOVQ R8, AX + MOVQ BX, CX + MOVQ DX, R15 + SHLQ CL, R15 + MOVB AH, CL + SHRQ $0x20, AX + TESTQ CX, CX + JZ sequenceDecs_decode_amd64_ml_update_zero + ADDQ CX, BX + CMPQ BX, $0x40 + JA sequenceDecs_decode_amd64_ml_update_zero + CMPQ CX, $0x40 + JAE sequenceDecs_decode_amd64_ml_update_zero + NEGQ CX + SHRQ CL, R15 + ADDQ R15, AX + +sequenceDecs_decode_amd64_ml_update_zero: + MOVQ AX, 8(R10) + + // Fill bitreader to have enough for the remaining + CMPQ SI, $0x08 + JL sequenceDecs_decode_amd64_fill_2_byte_by_byte + MOVQ BX, AX + SHRQ $0x03, AX + SUBQ AX, R14 + MOVQ (R14), DX + SUBQ AX, SI + ANDQ $0x07, BX + JMP sequenceDecs_decode_amd64_fill_2_end + +sequenceDecs_decode_amd64_fill_2_byte_by_byte: + CMPQ SI, $0x00 + JLE sequenceDecs_decode_amd64_fill_2_end + CMPQ BX, $0x07 + JLE sequenceDecs_decode_amd64_fill_2_end + SHLQ $0x08, DX + SUBQ $0x01, R14 + SUBQ $0x01, SI + SUBQ $0x08, BX + MOVBQZX (R14), AX + ORQ AX, DX + JMP sequenceDecs_decode_amd64_fill_2_byte_by_byte + +sequenceDecs_decode_amd64_fill_2_end: + // Update literal length + MOVQ DI, AX + MOVQ BX, CX + MOVQ DX, R15 + SHLQ CL, R15 + MOVB AH, CL + SHRQ $0x20, AX + TESTQ CX, CX + JZ sequenceDecs_decode_amd64_ll_update_zero + ADDQ CX, BX + CMPQ BX, $0x40 + JA sequenceDecs_decode_amd64_ll_update_zero + CMPQ CX, $0x40 + JAE sequenceDecs_decode_amd64_ll_update_zero + NEGQ CX + SHRQ CL, R15 + ADDQ R15, AX + +sequenceDecs_decode_amd64_ll_update_zero: + MOVQ AX, (R10) + + // Fill bitreader for state updates + MOVQ R14, (SP) + MOVQ R9, AX + SHRQ $0x08, AX + MOVBQZX AL, AX + MOVQ ctx+16(FP), CX + CMPQ 96(CX), $0x00 + JZ sequenceDecs_decode_amd64_skip_update + + // Update Literal Length State + MOVBQZX DI, R14 + SHRQ $0x10, DI + MOVWQZX DI, DI + LEAQ (BX)(R14*1), CX + MOVQ DX, R15 + MOVQ CX, BX + ROLQ CL, R15 + MOVL $0x00000001, BP + MOVB R14, CL + SHLL CL, BP + DECL BP + ANDQ BP, R15 + ADDQ R15, DI + + // Load ctx.llTable + MOVQ ctx+16(FP), CX + MOVQ (CX), CX + MOVQ (CX)(DI*8), DI + + // Update Match Length State + MOVBQZX R8, R14 + SHRQ $0x10, R8 + MOVWQZX R8, R8 + LEAQ (BX)(R14*1), CX + MOVQ DX, R15 + MOVQ CX, BX + ROLQ CL, R15 + MOVL $0x00000001, BP + MOVB R14, CL + SHLL CL, BP + DECL BP + ANDQ BP, R15 + ADDQ R15, R8 + + // Load ctx.mlTable + MOVQ ctx+16(FP), CX + MOVQ 24(CX), CX + MOVQ (CX)(R8*8), R8 + + // Update Offset State + MOVBQZX R9, R14 + SHRQ $0x10, R9 + MOVWQZX R9, R9 + LEAQ (BX)(R14*1), CX + MOVQ DX, R15 + MOVQ CX, BX + ROLQ CL, R15 + MOVL $0x00000001, BP + MOVB R14, CL + SHLL CL, BP + DECL BP + ANDQ BP, R15 + ADDQ R15, R9 + + // Load ctx.ofTable + 
MOVQ ctx+16(FP), CX + MOVQ 48(CX), CX + MOVQ (CX)(R9*8), R9 + +sequenceDecs_decode_amd64_skip_update: + // Adjust offset + MOVQ 16(R10), CX + CMPQ AX, $0x01 + JBE sequenceDecs_decode_amd64_adjust_offsetB_1_or_0 + MOVQ R12, R13 + MOVQ R11, R12 + MOVQ CX, R11 + JMP sequenceDecs_decode_amd64_after_adjust + +sequenceDecs_decode_amd64_adjust_offsetB_1_or_0: + CMPQ (R10), $0x00000000 + JNE sequenceDecs_decode_amd64_adjust_offset_maybezero + INCQ CX + JMP sequenceDecs_decode_amd64_adjust_offset_nonzero + +sequenceDecs_decode_amd64_adjust_offset_maybezero: + TESTQ CX, CX + JNZ sequenceDecs_decode_amd64_adjust_offset_nonzero + MOVQ R11, CX + JMP sequenceDecs_decode_amd64_after_adjust + +sequenceDecs_decode_amd64_adjust_offset_nonzero: + CMPQ CX, $0x01 + JB sequenceDecs_decode_amd64_adjust_zero + JEQ sequenceDecs_decode_amd64_adjust_one + CMPQ CX, $0x02 + JA sequenceDecs_decode_amd64_adjust_three + JMP sequenceDecs_decode_amd64_adjust_two + +sequenceDecs_decode_amd64_adjust_zero: + MOVQ R11, AX + JMP sequenceDecs_decode_amd64_adjust_test_temp_valid + +sequenceDecs_decode_amd64_adjust_one: + MOVQ R12, AX + JMP sequenceDecs_decode_amd64_adjust_test_temp_valid + +sequenceDecs_decode_amd64_adjust_two: + MOVQ R13, AX + JMP sequenceDecs_decode_amd64_adjust_test_temp_valid + +sequenceDecs_decode_amd64_adjust_three: + LEAQ -1(R11), AX + +sequenceDecs_decode_amd64_adjust_test_temp_valid: + TESTQ AX, AX + JNZ sequenceDecs_decode_amd64_adjust_temp_valid + MOVQ $0x00000001, AX + +sequenceDecs_decode_amd64_adjust_temp_valid: + CMPQ CX, $0x01 + CMOVQNE R12, R13 + MOVQ R11, R12 + MOVQ AX, R11 + MOVQ AX, CX + +sequenceDecs_decode_amd64_after_adjust: + MOVQ CX, 16(R10) + + // Check values + MOVQ 8(R10), AX + MOVQ (R10), R14 + LEAQ (AX)(R14*1), R15 + MOVQ s+0(FP), BP + ADDQ R15, 256(BP) + MOVQ ctx+16(FP), R15 + SUBQ R14, 128(R15) + JS error_not_enough_literals + CMPQ AX, $0x00020002 + JA sequenceDecs_decode_amd64_error_match_len_too_big + TESTQ CX, CX + JNZ sequenceDecs_decode_amd64_match_len_ofs_ok + TESTQ AX, AX + JNZ sequenceDecs_decode_amd64_error_match_len_ofs_mismatch + +sequenceDecs_decode_amd64_match_len_ofs_ok: + ADDQ $0x18, R10 + MOVQ ctx+16(FP), AX + DECQ 96(AX) + JNS sequenceDecs_decode_amd64_main_loop + MOVQ s+0(FP), AX + MOVQ R11, 144(AX) + MOVQ R12, 152(AX) + MOVQ R13, 160(AX) + MOVQ br+8(FP), AX + MOVQ DX, 32(AX) + MOVB BL, 40(AX) + MOVQ SI, 24(AX) + + // Return success + MOVQ $0x00000000, ret+24(FP) + RET + + // Return with match length error +sequenceDecs_decode_amd64_error_match_len_ofs_mismatch: + MOVQ $0x00000001, ret+24(FP) + RET + + // Return with match too long error +sequenceDecs_decode_amd64_error_match_len_too_big: + MOVQ $0x00000002, ret+24(FP) + RET + + // Return with match offset too long error + MOVQ $0x00000003, ret+24(FP) + RET + + // Return with not enough literals error +error_not_enough_literals: + MOVQ $0x00000004, ret+24(FP) + RET + + // Return with not enough output space error + MOVQ $0x00000005, ret+24(FP) + RET + +// func sequenceDecs_decode_56_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int +// Requires: CMOV +TEXT ·sequenceDecs_decode_56_amd64(SB), $8-32 + MOVQ br+8(FP), AX + MOVQ 32(AX), DX + MOVBQZX 40(AX), BX + MOVQ 24(AX), SI + MOVQ (AX), AX + ADDQ SI, AX + MOVQ AX, (SP) + MOVQ ctx+16(FP), AX + MOVQ 72(AX), DI + MOVQ 80(AX), R8 + MOVQ 88(AX), R9 + MOVQ 104(AX), R10 + MOVQ s+0(FP), AX + MOVQ 144(AX), R11 + MOVQ 152(AX), R12 + MOVQ 160(AX), R13 + +sequenceDecs_decode_56_amd64_main_loop: + MOVQ (SP), R14 + + // Fill bitreader to have enough for the offset 
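The `Adjust offset` block above is the assembly form of zstd's repeat-offset rules: an offset read with more than one extra bit is a literal offset, smaller values select one of three remembered offsets, and a zero literal length shifts their meaning by one. A Go sketch of the same decision tree, mirroring the inlined adjustment shown in the removed `nextFast` (the `adjustRepeatOffset` name is hypothetical):

```go
package main

import "fmt"

// adjustRepeatOffset applies the repeat-offset rules to the three remembered
// offsets in prev and returns the offset to use for the current sequence.
func adjustRepeatOffset(prev *[3]int, offset, litLen int, offsetB uint8) int {
	if offsetB > 1 {
		// A real offset; it becomes the new most-recent repeat offset.
		prev[2], prev[1], prev[0] = prev[1], prev[0], offset
		return offset
	}
	if litLen == 0 {
		// With no literals the repeat offsets are shifted by one.
		offset++
	}
	if offset == 0 {
		return prev[0] // most-recent repeat offset, history unchanged
	}
	var temp int
	if offset == 3 {
		temp = prev[0] - 1 // "most-recent repeat offset minus one byte"
	} else {
		temp = prev[offset]
	}
	if temp == 0 {
		temp = 1 // corrupted input; force a usable offset
	}
	if offset != 1 {
		prev[2] = prev[1]
	}
	prev[1] = prev[0]
	prev[0] = temp
	return temp
}

func main() {
	prev := [3]int{1, 4, 8}
	fmt.Println(adjustRepeatOffset(&prev, 1, 7, 1), prev)   // 4 [4 1 8]
	fmt.Println(adjustRepeatOffset(&prev, 500, 3, 5), prev) // 500 [500 4 1]
}
```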
and match length. + CMPQ SI, $0x08 + JL sequenceDecs_decode_56_amd64_fill_byte_by_byte + MOVQ BX, AX + SHRQ $0x03, AX + SUBQ AX, R14 + MOVQ (R14), DX + SUBQ AX, SI + ANDQ $0x07, BX + JMP sequenceDecs_decode_56_amd64_fill_end + +sequenceDecs_decode_56_amd64_fill_byte_by_byte: + CMPQ SI, $0x00 + JLE sequenceDecs_decode_56_amd64_fill_end + CMPQ BX, $0x07 + JLE sequenceDecs_decode_56_amd64_fill_end + SHLQ $0x08, DX + SUBQ $0x01, R14 + SUBQ $0x01, SI + SUBQ $0x08, BX + MOVBQZX (R14), AX + ORQ AX, DX + JMP sequenceDecs_decode_56_amd64_fill_byte_by_byte + +sequenceDecs_decode_56_amd64_fill_end: + // Update offset + MOVQ R9, AX + MOVQ BX, CX + MOVQ DX, R15 + SHLQ CL, R15 + MOVB AH, CL + SHRQ $0x20, AX + TESTQ CX, CX + JZ sequenceDecs_decode_56_amd64_of_update_zero + ADDQ CX, BX + CMPQ BX, $0x40 + JA sequenceDecs_decode_56_amd64_of_update_zero + CMPQ CX, $0x40 + JAE sequenceDecs_decode_56_amd64_of_update_zero + NEGQ CX + SHRQ CL, R15 + ADDQ R15, AX + +sequenceDecs_decode_56_amd64_of_update_zero: + MOVQ AX, 16(R10) + + // Update match length + MOVQ R8, AX + MOVQ BX, CX + MOVQ DX, R15 + SHLQ CL, R15 + MOVB AH, CL + SHRQ $0x20, AX + TESTQ CX, CX + JZ sequenceDecs_decode_56_amd64_ml_update_zero + ADDQ CX, BX + CMPQ BX, $0x40 + JA sequenceDecs_decode_56_amd64_ml_update_zero + CMPQ CX, $0x40 + JAE sequenceDecs_decode_56_amd64_ml_update_zero + NEGQ CX + SHRQ CL, R15 + ADDQ R15, AX + +sequenceDecs_decode_56_amd64_ml_update_zero: + MOVQ AX, 8(R10) + + // Update literal length + MOVQ DI, AX + MOVQ BX, CX + MOVQ DX, R15 + SHLQ CL, R15 + MOVB AH, CL + SHRQ $0x20, AX + TESTQ CX, CX + JZ sequenceDecs_decode_56_amd64_ll_update_zero + ADDQ CX, BX + CMPQ BX, $0x40 + JA sequenceDecs_decode_56_amd64_ll_update_zero + CMPQ CX, $0x40 + JAE sequenceDecs_decode_56_amd64_ll_update_zero + NEGQ CX + SHRQ CL, R15 + ADDQ R15, AX + +sequenceDecs_decode_56_amd64_ll_update_zero: + MOVQ AX, (R10) + + // Fill bitreader for state updates + MOVQ R14, (SP) + MOVQ R9, AX + SHRQ $0x08, AX + MOVBQZX AL, AX + MOVQ ctx+16(FP), CX + CMPQ 96(CX), $0x00 + JZ sequenceDecs_decode_56_amd64_skip_update + + // Update Literal Length State + MOVBQZX DI, R14 + SHRQ $0x10, DI + MOVWQZX DI, DI + LEAQ (BX)(R14*1), CX + MOVQ DX, R15 + MOVQ CX, BX + ROLQ CL, R15 + MOVL $0x00000001, BP + MOVB R14, CL + SHLL CL, BP + DECL BP + ANDQ BP, R15 + ADDQ R15, DI + + // Load ctx.llTable + MOVQ ctx+16(FP), CX + MOVQ (CX), CX + MOVQ (CX)(DI*8), DI + + // Update Match Length State + MOVBQZX R8, R14 + SHRQ $0x10, R8 + MOVWQZX R8, R8 + LEAQ (BX)(R14*1), CX + MOVQ DX, R15 + MOVQ CX, BX + ROLQ CL, R15 + MOVL $0x00000001, BP + MOVB R14, CL + SHLL CL, BP + DECL BP + ANDQ BP, R15 + ADDQ R15, R8 + + // Load ctx.mlTable + MOVQ ctx+16(FP), CX + MOVQ 24(CX), CX + MOVQ (CX)(R8*8), R8 + + // Update Offset State + MOVBQZX R9, R14 + SHRQ $0x10, R9 + MOVWQZX R9, R9 + LEAQ (BX)(R14*1), CX + MOVQ DX, R15 + MOVQ CX, BX + ROLQ CL, R15 + MOVL $0x00000001, BP + MOVB R14, CL + SHLL CL, BP + DECL BP + ANDQ BP, R15 + ADDQ R15, R9 + + // Load ctx.ofTable + MOVQ ctx+16(FP), CX + MOVQ 48(CX), CX + MOVQ (CX)(R9*8), R9 + +sequenceDecs_decode_56_amd64_skip_update: + // Adjust offset + MOVQ 16(R10), CX + CMPQ AX, $0x01 + JBE sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0 + MOVQ R12, R13 + MOVQ R11, R12 + MOVQ CX, R11 + JMP sequenceDecs_decode_56_amd64_after_adjust + +sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0: + CMPQ (R10), $0x00000000 + JNE sequenceDecs_decode_56_amd64_adjust_offset_maybezero + INCQ CX + JMP sequenceDecs_decode_56_amd64_adjust_offset_nonzero + 
+sequenceDecs_decode_56_amd64_adjust_offset_maybezero: + TESTQ CX, CX + JNZ sequenceDecs_decode_56_amd64_adjust_offset_nonzero + MOVQ R11, CX + JMP sequenceDecs_decode_56_amd64_after_adjust + +sequenceDecs_decode_56_amd64_adjust_offset_nonzero: + CMPQ CX, $0x01 + JB sequenceDecs_decode_56_amd64_adjust_zero + JEQ sequenceDecs_decode_56_amd64_adjust_one + CMPQ CX, $0x02 + JA sequenceDecs_decode_56_amd64_adjust_three + JMP sequenceDecs_decode_56_amd64_adjust_two + +sequenceDecs_decode_56_amd64_adjust_zero: + MOVQ R11, AX + JMP sequenceDecs_decode_56_amd64_adjust_test_temp_valid + +sequenceDecs_decode_56_amd64_adjust_one: + MOVQ R12, AX + JMP sequenceDecs_decode_56_amd64_adjust_test_temp_valid + +sequenceDecs_decode_56_amd64_adjust_two: + MOVQ R13, AX + JMP sequenceDecs_decode_56_amd64_adjust_test_temp_valid + +sequenceDecs_decode_56_amd64_adjust_three: + LEAQ -1(R11), AX + +sequenceDecs_decode_56_amd64_adjust_test_temp_valid: + TESTQ AX, AX + JNZ sequenceDecs_decode_56_amd64_adjust_temp_valid + MOVQ $0x00000001, AX + +sequenceDecs_decode_56_amd64_adjust_temp_valid: + CMPQ CX, $0x01 + CMOVQNE R12, R13 + MOVQ R11, R12 + MOVQ AX, R11 + MOVQ AX, CX + +sequenceDecs_decode_56_amd64_after_adjust: + MOVQ CX, 16(R10) + + // Check values + MOVQ 8(R10), AX + MOVQ (R10), R14 + LEAQ (AX)(R14*1), R15 + MOVQ s+0(FP), BP + ADDQ R15, 256(BP) + MOVQ ctx+16(FP), R15 + SUBQ R14, 128(R15) + JS error_not_enough_literals + CMPQ AX, $0x00020002 + JA sequenceDecs_decode_56_amd64_error_match_len_too_big + TESTQ CX, CX + JNZ sequenceDecs_decode_56_amd64_match_len_ofs_ok + TESTQ AX, AX + JNZ sequenceDecs_decode_56_amd64_error_match_len_ofs_mismatch + +sequenceDecs_decode_56_amd64_match_len_ofs_ok: + ADDQ $0x18, R10 + MOVQ ctx+16(FP), AX + DECQ 96(AX) + JNS sequenceDecs_decode_56_amd64_main_loop + MOVQ s+0(FP), AX + MOVQ R11, 144(AX) + MOVQ R12, 152(AX) + MOVQ R13, 160(AX) + MOVQ br+8(FP), AX + MOVQ DX, 32(AX) + MOVB BL, 40(AX) + MOVQ SI, 24(AX) + + // Return success + MOVQ $0x00000000, ret+24(FP) + RET + + // Return with match length error +sequenceDecs_decode_56_amd64_error_match_len_ofs_mismatch: + MOVQ $0x00000001, ret+24(FP) + RET + + // Return with match too long error +sequenceDecs_decode_56_amd64_error_match_len_too_big: + MOVQ $0x00000002, ret+24(FP) + RET + + // Return with match offset too long error + MOVQ $0x00000003, ret+24(FP) + RET + + // Return with not enough literals error +error_not_enough_literals: + MOVQ $0x00000004, ret+24(FP) + RET + + // Return with not enough output space error + MOVQ $0x00000005, ret+24(FP) + RET + +// func sequenceDecs_decode_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int +// Requires: BMI, BMI2, CMOV +TEXT ·sequenceDecs_decode_bmi2(SB), $8-32 + MOVQ br+8(FP), CX + MOVQ 32(CX), AX + MOVBQZX 40(CX), DX + MOVQ 24(CX), BX + MOVQ (CX), CX + ADDQ BX, CX + MOVQ CX, (SP) + MOVQ ctx+16(FP), CX + MOVQ 72(CX), SI + MOVQ 80(CX), DI + MOVQ 88(CX), R8 + MOVQ 104(CX), R9 + MOVQ s+0(FP), CX + MOVQ 144(CX), R10 + MOVQ 152(CX), R11 + MOVQ 160(CX), R12 + +sequenceDecs_decode_bmi2_main_loop: + MOVQ (SP), R13 + + // Fill bitreader to have enough for the offset and match length. 
+ CMPQ BX, $0x08 + JL sequenceDecs_decode_bmi2_fill_byte_by_byte + MOVQ DX, CX + SHRQ $0x03, CX + SUBQ CX, R13 + MOVQ (R13), AX + SUBQ CX, BX + ANDQ $0x07, DX + JMP sequenceDecs_decode_bmi2_fill_end + +sequenceDecs_decode_bmi2_fill_byte_by_byte: + CMPQ BX, $0x00 + JLE sequenceDecs_decode_bmi2_fill_end + CMPQ DX, $0x07 + JLE sequenceDecs_decode_bmi2_fill_end + SHLQ $0x08, AX + SUBQ $0x01, R13 + SUBQ $0x01, BX + SUBQ $0x08, DX + MOVBQZX (R13), CX + ORQ CX, AX + JMP sequenceDecs_decode_bmi2_fill_byte_by_byte + +sequenceDecs_decode_bmi2_fill_end: + // Update offset + MOVQ $0x00000808, CX + BEXTRQ CX, R8, R14 + MOVQ AX, R15 + LEAQ (DX)(R14*1), CX + ROLQ CL, R15 + BZHIQ R14, R15, R15 + MOVQ CX, DX + MOVQ R8, CX + SHRQ $0x20, CX + ADDQ R15, CX + MOVQ CX, 16(R9) + + // Update match length + MOVQ $0x00000808, CX + BEXTRQ CX, DI, R14 + MOVQ AX, R15 + LEAQ (DX)(R14*1), CX + ROLQ CL, R15 + BZHIQ R14, R15, R15 + MOVQ CX, DX + MOVQ DI, CX + SHRQ $0x20, CX + ADDQ R15, CX + MOVQ CX, 8(R9) + + // Fill bitreader to have enough for the remaining + CMPQ BX, $0x08 + JL sequenceDecs_decode_bmi2_fill_2_byte_by_byte + MOVQ DX, CX + SHRQ $0x03, CX + SUBQ CX, R13 + MOVQ (R13), AX + SUBQ CX, BX + ANDQ $0x07, DX + JMP sequenceDecs_decode_bmi2_fill_2_end + +sequenceDecs_decode_bmi2_fill_2_byte_by_byte: + CMPQ BX, $0x00 + JLE sequenceDecs_decode_bmi2_fill_2_end + CMPQ DX, $0x07 + JLE sequenceDecs_decode_bmi2_fill_2_end + SHLQ $0x08, AX + SUBQ $0x01, R13 + SUBQ $0x01, BX + SUBQ $0x08, DX + MOVBQZX (R13), CX + ORQ CX, AX + JMP sequenceDecs_decode_bmi2_fill_2_byte_by_byte + +sequenceDecs_decode_bmi2_fill_2_end: + // Update literal length + MOVQ $0x00000808, CX + BEXTRQ CX, SI, R14 + MOVQ AX, R15 + LEAQ (DX)(R14*1), CX + ROLQ CL, R15 + BZHIQ R14, R15, R15 + MOVQ CX, DX + MOVQ SI, CX + SHRQ $0x20, CX + ADDQ R15, CX + MOVQ CX, (R9) + + // Fill bitreader for state updates + MOVQ R13, (SP) + MOVQ $0x00000808, CX + BEXTRQ CX, R8, R13 + MOVQ ctx+16(FP), CX + CMPQ 96(CX), $0x00 + JZ sequenceDecs_decode_bmi2_skip_update + LEAQ (SI)(DI*1), R14 + ADDQ R8, R14 + MOVBQZX R14, R14 + LEAQ (DX)(R14*1), CX + MOVQ AX, R15 + MOVQ CX, DX + ROLQ CL, R15 + BZHIQ R14, R15, R15 + + // Update Offset State + BZHIQ R8, R15, CX + SHRXQ R8, R15, R15 + MOVQ $0x00001010, R14 + BEXTRQ R14, R8, R8 + ADDQ CX, R8 + + // Load ctx.ofTable + MOVQ ctx+16(FP), CX + MOVQ 48(CX), CX + MOVQ (CX)(R8*8), R8 + + // Update Match Length State + BZHIQ DI, R15, CX + SHRXQ DI, R15, R15 + MOVQ $0x00001010, R14 + BEXTRQ R14, DI, DI + ADDQ CX, DI + + // Load ctx.mlTable + MOVQ ctx+16(FP), CX + MOVQ 24(CX), CX + MOVQ (CX)(DI*8), DI + + // Update Literal Length State + BZHIQ SI, R15, CX + MOVQ $0x00001010, R14 + BEXTRQ R14, SI, SI + ADDQ CX, SI + + // Load ctx.llTable + MOVQ ctx+16(FP), CX + MOVQ (CX), CX + MOVQ (CX)(SI*8), SI + +sequenceDecs_decode_bmi2_skip_update: + // Adjust offset + MOVQ 16(R9), CX + CMPQ R13, $0x01 + JBE sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0 + MOVQ R11, R12 + MOVQ R10, R11 + MOVQ CX, R10 + JMP sequenceDecs_decode_bmi2_after_adjust + +sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0: + CMPQ (R9), $0x00000000 + JNE sequenceDecs_decode_bmi2_adjust_offset_maybezero + INCQ CX + JMP sequenceDecs_decode_bmi2_adjust_offset_nonzero + +sequenceDecs_decode_bmi2_adjust_offset_maybezero: + TESTQ CX, CX + JNZ sequenceDecs_decode_bmi2_adjust_offset_nonzero + MOVQ R10, CX + JMP sequenceDecs_decode_bmi2_after_adjust + +sequenceDecs_decode_bmi2_adjust_offset_nonzero: + CMPQ CX, $0x01 + JB sequenceDecs_decode_bmi2_adjust_zero + JEQ 
sequenceDecs_decode_bmi2_adjust_one + CMPQ CX, $0x02 + JA sequenceDecs_decode_bmi2_adjust_three + JMP sequenceDecs_decode_bmi2_adjust_two + +sequenceDecs_decode_bmi2_adjust_zero: + MOVQ R10, R13 + JMP sequenceDecs_decode_bmi2_adjust_test_temp_valid + +sequenceDecs_decode_bmi2_adjust_one: + MOVQ R11, R13 + JMP sequenceDecs_decode_bmi2_adjust_test_temp_valid + +sequenceDecs_decode_bmi2_adjust_two: + MOVQ R12, R13 + JMP sequenceDecs_decode_bmi2_adjust_test_temp_valid + +sequenceDecs_decode_bmi2_adjust_three: + LEAQ -1(R10), R13 + +sequenceDecs_decode_bmi2_adjust_test_temp_valid: + TESTQ R13, R13 + JNZ sequenceDecs_decode_bmi2_adjust_temp_valid + MOVQ $0x00000001, R13 + +sequenceDecs_decode_bmi2_adjust_temp_valid: + CMPQ CX, $0x01 + CMOVQNE R11, R12 + MOVQ R10, R11 + MOVQ R13, R10 + MOVQ R13, CX + +sequenceDecs_decode_bmi2_after_adjust: + MOVQ CX, 16(R9) + + // Check values + MOVQ 8(R9), R13 + MOVQ (R9), R14 + LEAQ (R13)(R14*1), R15 + MOVQ s+0(FP), BP + ADDQ R15, 256(BP) + MOVQ ctx+16(FP), R15 + SUBQ R14, 128(R15) + JS error_not_enough_literals + CMPQ R13, $0x00020002 + JA sequenceDecs_decode_bmi2_error_match_len_too_big + TESTQ CX, CX + JNZ sequenceDecs_decode_bmi2_match_len_ofs_ok + TESTQ R13, R13 + JNZ sequenceDecs_decode_bmi2_error_match_len_ofs_mismatch + +sequenceDecs_decode_bmi2_match_len_ofs_ok: + ADDQ $0x18, R9 + MOVQ ctx+16(FP), CX + DECQ 96(CX) + JNS sequenceDecs_decode_bmi2_main_loop + MOVQ s+0(FP), CX + MOVQ R10, 144(CX) + MOVQ R11, 152(CX) + MOVQ R12, 160(CX) + MOVQ br+8(FP), CX + MOVQ AX, 32(CX) + MOVB DL, 40(CX) + MOVQ BX, 24(CX) + + // Return success + MOVQ $0x00000000, ret+24(FP) + RET + + // Return with match length error +sequenceDecs_decode_bmi2_error_match_len_ofs_mismatch: + MOVQ $0x00000001, ret+24(FP) + RET + + // Return with match too long error +sequenceDecs_decode_bmi2_error_match_len_too_big: + MOVQ $0x00000002, ret+24(FP) + RET + + // Return with match offset too long error + MOVQ $0x00000003, ret+24(FP) + RET + + // Return with not enough literals error +error_not_enough_literals: + MOVQ $0x00000004, ret+24(FP) + RET + + // Return with not enough output space error + MOVQ $0x00000005, ret+24(FP) + RET + +// func sequenceDecs_decode_56_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int +// Requires: BMI, BMI2, CMOV +TEXT ·sequenceDecs_decode_56_bmi2(SB), $8-32 + MOVQ br+8(FP), CX + MOVQ 32(CX), AX + MOVBQZX 40(CX), DX + MOVQ 24(CX), BX + MOVQ (CX), CX + ADDQ BX, CX + MOVQ CX, (SP) + MOVQ ctx+16(FP), CX + MOVQ 72(CX), SI + MOVQ 80(CX), DI + MOVQ 88(CX), R8 + MOVQ 104(CX), R9 + MOVQ s+0(FP), CX + MOVQ 144(CX), R10 + MOVQ 152(CX), R11 + MOVQ 160(CX), R12 + +sequenceDecs_decode_56_bmi2_main_loop: + MOVQ (SP), R13 + + // Fill bitreader to have enough for the offset and match length. 
+ CMPQ BX, $0x08 + JL sequenceDecs_decode_56_bmi2_fill_byte_by_byte + MOVQ DX, CX + SHRQ $0x03, CX + SUBQ CX, R13 + MOVQ (R13), AX + SUBQ CX, BX + ANDQ $0x07, DX + JMP sequenceDecs_decode_56_bmi2_fill_end + +sequenceDecs_decode_56_bmi2_fill_byte_by_byte: + CMPQ BX, $0x00 + JLE sequenceDecs_decode_56_bmi2_fill_end + CMPQ DX, $0x07 + JLE sequenceDecs_decode_56_bmi2_fill_end + SHLQ $0x08, AX + SUBQ $0x01, R13 + SUBQ $0x01, BX + SUBQ $0x08, DX + MOVBQZX (R13), CX + ORQ CX, AX + JMP sequenceDecs_decode_56_bmi2_fill_byte_by_byte + +sequenceDecs_decode_56_bmi2_fill_end: + // Update offset + MOVQ $0x00000808, CX + BEXTRQ CX, R8, R14 + MOVQ AX, R15 + LEAQ (DX)(R14*1), CX + ROLQ CL, R15 + BZHIQ R14, R15, R15 + MOVQ CX, DX + MOVQ R8, CX + SHRQ $0x20, CX + ADDQ R15, CX + MOVQ CX, 16(R9) + + // Update match length + MOVQ $0x00000808, CX + BEXTRQ CX, DI, R14 + MOVQ AX, R15 + LEAQ (DX)(R14*1), CX + ROLQ CL, R15 + BZHIQ R14, R15, R15 + MOVQ CX, DX + MOVQ DI, CX + SHRQ $0x20, CX + ADDQ R15, CX + MOVQ CX, 8(R9) + + // Update literal length + MOVQ $0x00000808, CX + BEXTRQ CX, SI, R14 + MOVQ AX, R15 + LEAQ (DX)(R14*1), CX + ROLQ CL, R15 + BZHIQ R14, R15, R15 + MOVQ CX, DX + MOVQ SI, CX + SHRQ $0x20, CX + ADDQ R15, CX + MOVQ CX, (R9) + + // Fill bitreader for state updates + MOVQ R13, (SP) + MOVQ $0x00000808, CX + BEXTRQ CX, R8, R13 + MOVQ ctx+16(FP), CX + CMPQ 96(CX), $0x00 + JZ sequenceDecs_decode_56_bmi2_skip_update + LEAQ (SI)(DI*1), R14 + ADDQ R8, R14 + MOVBQZX R14, R14 + LEAQ (DX)(R14*1), CX + MOVQ AX, R15 + MOVQ CX, DX + ROLQ CL, R15 + BZHIQ R14, R15, R15 + + // Update Offset State + BZHIQ R8, R15, CX + SHRXQ R8, R15, R15 + MOVQ $0x00001010, R14 + BEXTRQ R14, R8, R8 + ADDQ CX, R8 + + // Load ctx.ofTable + MOVQ ctx+16(FP), CX + MOVQ 48(CX), CX + MOVQ (CX)(R8*8), R8 + + // Update Match Length State + BZHIQ DI, R15, CX + SHRXQ DI, R15, R15 + MOVQ $0x00001010, R14 + BEXTRQ R14, DI, DI + ADDQ CX, DI + + // Load ctx.mlTable + MOVQ ctx+16(FP), CX + MOVQ 24(CX), CX + MOVQ (CX)(DI*8), DI + + // Update Literal Length State + BZHIQ SI, R15, CX + MOVQ $0x00001010, R14 + BEXTRQ R14, SI, SI + ADDQ CX, SI + + // Load ctx.llTable + MOVQ ctx+16(FP), CX + MOVQ (CX), CX + MOVQ (CX)(SI*8), SI + +sequenceDecs_decode_56_bmi2_skip_update: + // Adjust offset + MOVQ 16(R9), CX + CMPQ R13, $0x01 + JBE sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0 + MOVQ R11, R12 + MOVQ R10, R11 + MOVQ CX, R10 + JMP sequenceDecs_decode_56_bmi2_after_adjust + +sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0: + CMPQ (R9), $0x00000000 + JNE sequenceDecs_decode_56_bmi2_adjust_offset_maybezero + INCQ CX + JMP sequenceDecs_decode_56_bmi2_adjust_offset_nonzero + +sequenceDecs_decode_56_bmi2_adjust_offset_maybezero: + TESTQ CX, CX + JNZ sequenceDecs_decode_56_bmi2_adjust_offset_nonzero + MOVQ R10, CX + JMP sequenceDecs_decode_56_bmi2_after_adjust + +sequenceDecs_decode_56_bmi2_adjust_offset_nonzero: + CMPQ CX, $0x01 + JB sequenceDecs_decode_56_bmi2_adjust_zero + JEQ sequenceDecs_decode_56_bmi2_adjust_one + CMPQ CX, $0x02 + JA sequenceDecs_decode_56_bmi2_adjust_three + JMP sequenceDecs_decode_56_bmi2_adjust_two + +sequenceDecs_decode_56_bmi2_adjust_zero: + MOVQ R10, R13 + JMP sequenceDecs_decode_56_bmi2_adjust_test_temp_valid + +sequenceDecs_decode_56_bmi2_adjust_one: + MOVQ R11, R13 + JMP sequenceDecs_decode_56_bmi2_adjust_test_temp_valid + +sequenceDecs_decode_56_bmi2_adjust_two: + MOVQ R12, R13 + JMP sequenceDecs_decode_56_bmi2_adjust_test_temp_valid + +sequenceDecs_decode_56_bmi2_adjust_three: + LEAQ -1(R10), R13 + 
+sequenceDecs_decode_56_bmi2_adjust_test_temp_valid: + TESTQ R13, R13 + JNZ sequenceDecs_decode_56_bmi2_adjust_temp_valid + MOVQ $0x00000001, R13 + +sequenceDecs_decode_56_bmi2_adjust_temp_valid: + CMPQ CX, $0x01 + CMOVQNE R11, R12 + MOVQ R10, R11 + MOVQ R13, R10 + MOVQ R13, CX + +sequenceDecs_decode_56_bmi2_after_adjust: + MOVQ CX, 16(R9) + + // Check values + MOVQ 8(R9), R13 + MOVQ (R9), R14 + LEAQ (R13)(R14*1), R15 + MOVQ s+0(FP), BP + ADDQ R15, 256(BP) + MOVQ ctx+16(FP), R15 + SUBQ R14, 128(R15) + JS error_not_enough_literals + CMPQ R13, $0x00020002 + JA sequenceDecs_decode_56_bmi2_error_match_len_too_big + TESTQ CX, CX + JNZ sequenceDecs_decode_56_bmi2_match_len_ofs_ok + TESTQ R13, R13 + JNZ sequenceDecs_decode_56_bmi2_error_match_len_ofs_mismatch + +sequenceDecs_decode_56_bmi2_match_len_ofs_ok: + ADDQ $0x18, R9 + MOVQ ctx+16(FP), CX + DECQ 96(CX) + JNS sequenceDecs_decode_56_bmi2_main_loop + MOVQ s+0(FP), CX + MOVQ R10, 144(CX) + MOVQ R11, 152(CX) + MOVQ R12, 160(CX) + MOVQ br+8(FP), CX + MOVQ AX, 32(CX) + MOVB DL, 40(CX) + MOVQ BX, 24(CX) + + // Return success + MOVQ $0x00000000, ret+24(FP) + RET + + // Return with match length error +sequenceDecs_decode_56_bmi2_error_match_len_ofs_mismatch: + MOVQ $0x00000001, ret+24(FP) + RET + + // Return with match too long error +sequenceDecs_decode_56_bmi2_error_match_len_too_big: + MOVQ $0x00000002, ret+24(FP) + RET + + // Return with match offset too long error + MOVQ $0x00000003, ret+24(FP) + RET + + // Return with not enough literals error +error_not_enough_literals: + MOVQ $0x00000004, ret+24(FP) + RET + + // Return with not enough output space error + MOVQ $0x00000005, ret+24(FP) + RET + +// func sequenceDecs_executeSimple_amd64(ctx *executeAsmContext) bool +// Requires: SSE +TEXT ·sequenceDecs_executeSimple_amd64(SB), $8-9 + MOVQ ctx+0(FP), R10 + MOVQ 8(R10), CX + TESTQ CX, CX + JZ empty_seqs + MOVQ (R10), AX + MOVQ 24(R10), DX + MOVQ 32(R10), BX + MOVQ 80(R10), SI + MOVQ 104(R10), DI + MOVQ 120(R10), R8 + MOVQ 56(R10), R9 + MOVQ 64(R10), R10 + ADDQ R10, R9 + + // seqsBase += 24 * seqIndex + LEAQ (DX)(DX*2), R11 + SHLQ $0x03, R11 + ADDQ R11, AX + + // outBase += outPosition + ADDQ DI, BX + +main_loop: + MOVQ (AX), R11 + MOVQ 16(AX), R12 + MOVQ 8(AX), R13 + + // Copy literals + TESTQ R11, R11 + JZ check_offset + XORQ R14, R14 + +copy_1: + MOVUPS (SI)(R14*1), X0 + MOVUPS X0, (BX)(R14*1) + ADDQ $0x10, R14 + CMPQ R14, R11 + JB copy_1 + ADDQ R11, SI + ADDQ R11, BX + ADDQ R11, DI + + // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize) +check_offset: + LEAQ (DI)(R10*1), R11 + CMPQ R12, R11 + JG error_match_off_too_big + CMPQ R12, R8 + JG error_match_off_too_big + + // Copy match from history + MOVQ R12, R11 + SUBQ DI, R11 + JLS copy_match + MOVQ R9, R14 + SUBQ R11, R14 + CMPQ R13, R11 + JG copy_all_from_history + MOVQ R13, R11 + SUBQ $0x10, R11 + JB copy_4_small + +copy_4_loop: + MOVUPS (R14), X0 + MOVUPS X0, (BX) + ADDQ $0x10, R14 + ADDQ $0x10, BX + SUBQ $0x10, R11 + JAE copy_4_loop + LEAQ 16(R14)(R11*1), R14 + LEAQ 16(BX)(R11*1), BX + MOVUPS -16(R14), X0 + MOVUPS X0, -16(BX) + JMP copy_4_end + +copy_4_small: + CMPQ R13, $0x03 + JE copy_4_move_3 + CMPQ R13, $0x08 + JB copy_4_move_4through7 + JMP copy_4_move_8through16 + +copy_4_move_3: + MOVW (R14), R11 + MOVB 2(R14), R12 + MOVW R11, (BX) + MOVB R12, 2(BX) + ADDQ R13, R14 + ADDQ R13, BX + JMP copy_4_end + +copy_4_move_4through7: + MOVL (R14), R11 + MOVL -4(R14)(R13*1), R12 + MOVL R11, (BX) + MOVL R12, -4(BX)(R13*1) + ADDQ R13, R14 + ADDQ R13, BX + JMP copy_4_end + 
+copy_4_move_8through16: + MOVQ (R14), R11 + MOVQ -8(R14)(R13*1), R12 + MOVQ R11, (BX) + MOVQ R12, -8(BX)(R13*1) + ADDQ R13, R14 + ADDQ R13, BX + +copy_4_end: + ADDQ R13, DI + ADDQ $0x18, AX + INCQ DX + CMPQ DX, CX + JB main_loop + JMP loop_finished + +copy_all_from_history: + MOVQ R11, R15 + SUBQ $0x10, R15 + JB copy_5_small + +copy_5_loop: + MOVUPS (R14), X0 + MOVUPS X0, (BX) + ADDQ $0x10, R14 + ADDQ $0x10, BX + SUBQ $0x10, R15 + JAE copy_5_loop + LEAQ 16(R14)(R15*1), R14 + LEAQ 16(BX)(R15*1), BX + MOVUPS -16(R14), X0 + MOVUPS X0, -16(BX) + JMP copy_5_end + +copy_5_small: + CMPQ R11, $0x03 + JE copy_5_move_3 + JB copy_5_move_1or2 + CMPQ R11, $0x08 + JB copy_5_move_4through7 + JMP copy_5_move_8through16 + +copy_5_move_1or2: + MOVB (R14), R15 + MOVB -1(R14)(R11*1), BP + MOVB R15, (BX) + MOVB BP, -1(BX)(R11*1) + ADDQ R11, R14 + ADDQ R11, BX + JMP copy_5_end + +copy_5_move_3: + MOVW (R14), R15 + MOVB 2(R14), BP + MOVW R15, (BX) + MOVB BP, 2(BX) + ADDQ R11, R14 + ADDQ R11, BX + JMP copy_5_end + +copy_5_move_4through7: + MOVL (R14), R15 + MOVL -4(R14)(R11*1), BP + MOVL R15, (BX) + MOVL BP, -4(BX)(R11*1) + ADDQ R11, R14 + ADDQ R11, BX + JMP copy_5_end + +copy_5_move_8through16: + MOVQ (R14), R15 + MOVQ -8(R14)(R11*1), BP + MOVQ R15, (BX) + MOVQ BP, -8(BX)(R11*1) + ADDQ R11, R14 + ADDQ R11, BX + +copy_5_end: + ADDQ R11, DI + SUBQ R11, R13 + + // Copy match from the current buffer +copy_match: + MOVQ BX, R11 + SUBQ R12, R11 + + // ml <= mo + CMPQ R13, R12 + JA copy_overlapping_match + + // Copy non-overlapping match + ADDQ R13, DI + MOVQ BX, R12 + ADDQ R13, BX + +copy_2: + MOVUPS (R11), X0 + MOVUPS X0, (R12) + ADDQ $0x10, R11 + ADDQ $0x10, R12 + SUBQ $0x10, R13 + JHI copy_2 + JMP handle_loop + + // Copy overlapping match +copy_overlapping_match: + ADDQ R13, DI + +copy_slow_3: + MOVB (R11), R12 + MOVB R12, (BX) + INCQ R11 + INCQ BX + DECQ R13 + JNZ copy_slow_3 + +handle_loop: + ADDQ $0x18, AX + INCQ DX + CMPQ DX, CX + JB main_loop + +loop_finished: + // Return value + MOVB $0x01, ret+8(FP) + + // Update the context + MOVQ ctx+0(FP), AX + MOVQ DX, 24(AX) + MOVQ DI, 104(AX) + MOVQ 80(AX), CX + SUBQ CX, SI + MOVQ SI, 112(AX) + RET + +error_match_off_too_big: + // Return value + MOVB $0x00, ret+8(FP) + + // Update the context + MOVQ ctx+0(FP), AX + MOVQ DX, 24(AX) + MOVQ DI, 104(AX) + MOVQ 80(AX), CX + SUBQ CX, SI + MOVQ SI, 112(AX) + RET + +empty_seqs: + // Return value + MOVB $0x01, ret+8(FP) + RET + +// func sequenceDecs_executeSimple_safe_amd64(ctx *executeAsmContext) bool +// Requires: SSE +TEXT ·sequenceDecs_executeSimple_safe_amd64(SB), $8-9 + MOVQ ctx+0(FP), R10 + MOVQ 8(R10), CX + TESTQ CX, CX + JZ empty_seqs + MOVQ (R10), AX + MOVQ 24(R10), DX + MOVQ 32(R10), BX + MOVQ 80(R10), SI + MOVQ 104(R10), DI + MOVQ 120(R10), R8 + MOVQ 56(R10), R9 + MOVQ 64(R10), R10 + ADDQ R10, R9 + + // seqsBase += 24 * seqIndex + LEAQ (DX)(DX*2), R11 + SHLQ $0x03, R11 + ADDQ R11, AX + + // outBase += outPosition + ADDQ DI, BX + +main_loop: + MOVQ (AX), R11 + MOVQ 16(AX), R12 + MOVQ 8(AX), R13 + + // Copy literals + TESTQ R11, R11 + JZ check_offset + MOVQ R11, R14 + SUBQ $0x10, R14 + JB copy_1_small + +copy_1_loop: + MOVUPS (SI), X0 + MOVUPS X0, (BX) + ADDQ $0x10, SI + ADDQ $0x10, BX + SUBQ $0x10, R14 + JAE copy_1_loop + LEAQ 16(SI)(R14*1), SI + LEAQ 16(BX)(R14*1), BX + MOVUPS -16(SI), X0 + MOVUPS X0, -16(BX) + JMP copy_1_end + +copy_1_small: + CMPQ R11, $0x03 + JE copy_1_move_3 + JB copy_1_move_1or2 + CMPQ R11, $0x08 + JB copy_1_move_4through7 + JMP copy_1_move_8through16 + +copy_1_move_1or2: + MOVB (SI), R14 + 
MOVB -1(SI)(R11*1), R15 + MOVB R14, (BX) + MOVB R15, -1(BX)(R11*1) + ADDQ R11, SI + ADDQ R11, BX + JMP copy_1_end + +copy_1_move_3: + MOVW (SI), R14 + MOVB 2(SI), R15 + MOVW R14, (BX) + MOVB R15, 2(BX) + ADDQ R11, SI + ADDQ R11, BX + JMP copy_1_end + +copy_1_move_4through7: + MOVL (SI), R14 + MOVL -4(SI)(R11*1), R15 + MOVL R14, (BX) + MOVL R15, -4(BX)(R11*1) + ADDQ R11, SI + ADDQ R11, BX + JMP copy_1_end + +copy_1_move_8through16: + MOVQ (SI), R14 + MOVQ -8(SI)(R11*1), R15 + MOVQ R14, (BX) + MOVQ R15, -8(BX)(R11*1) + ADDQ R11, SI + ADDQ R11, BX + +copy_1_end: + ADDQ R11, DI + + // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize) +check_offset: + LEAQ (DI)(R10*1), R11 + CMPQ R12, R11 + JG error_match_off_too_big + CMPQ R12, R8 + JG error_match_off_too_big + + // Copy match from history + MOVQ R12, R11 + SUBQ DI, R11 + JLS copy_match + MOVQ R9, R14 + SUBQ R11, R14 + CMPQ R13, R11 + JG copy_all_from_history + MOVQ R13, R11 + SUBQ $0x10, R11 + JB copy_4_small + +copy_4_loop: + MOVUPS (R14), X0 + MOVUPS X0, (BX) + ADDQ $0x10, R14 + ADDQ $0x10, BX + SUBQ $0x10, R11 + JAE copy_4_loop + LEAQ 16(R14)(R11*1), R14 + LEAQ 16(BX)(R11*1), BX + MOVUPS -16(R14), X0 + MOVUPS X0, -16(BX) + JMP copy_4_end + +copy_4_small: + CMPQ R13, $0x03 + JE copy_4_move_3 + CMPQ R13, $0x08 + JB copy_4_move_4through7 + JMP copy_4_move_8through16 + +copy_4_move_3: + MOVW (R14), R11 + MOVB 2(R14), R12 + MOVW R11, (BX) + MOVB R12, 2(BX) + ADDQ R13, R14 + ADDQ R13, BX + JMP copy_4_end + +copy_4_move_4through7: + MOVL (R14), R11 + MOVL -4(R14)(R13*1), R12 + MOVL R11, (BX) + MOVL R12, -4(BX)(R13*1) + ADDQ R13, R14 + ADDQ R13, BX + JMP copy_4_end + +copy_4_move_8through16: + MOVQ (R14), R11 + MOVQ -8(R14)(R13*1), R12 + MOVQ R11, (BX) + MOVQ R12, -8(BX)(R13*1) + ADDQ R13, R14 + ADDQ R13, BX + +copy_4_end: + ADDQ R13, DI + ADDQ $0x18, AX + INCQ DX + CMPQ DX, CX + JB main_loop + JMP loop_finished + +copy_all_from_history: + MOVQ R11, R15 + SUBQ $0x10, R15 + JB copy_5_small + +copy_5_loop: + MOVUPS (R14), X0 + MOVUPS X0, (BX) + ADDQ $0x10, R14 + ADDQ $0x10, BX + SUBQ $0x10, R15 + JAE copy_5_loop + LEAQ 16(R14)(R15*1), R14 + LEAQ 16(BX)(R15*1), BX + MOVUPS -16(R14), X0 + MOVUPS X0, -16(BX) + JMP copy_5_end + +copy_5_small: + CMPQ R11, $0x03 + JE copy_5_move_3 + JB copy_5_move_1or2 + CMPQ R11, $0x08 + JB copy_5_move_4through7 + JMP copy_5_move_8through16 + +copy_5_move_1or2: + MOVB (R14), R15 + MOVB -1(R14)(R11*1), BP + MOVB R15, (BX) + MOVB BP, -1(BX)(R11*1) + ADDQ R11, R14 + ADDQ R11, BX + JMP copy_5_end + +copy_5_move_3: + MOVW (R14), R15 + MOVB 2(R14), BP + MOVW R15, (BX) + MOVB BP, 2(BX) + ADDQ R11, R14 + ADDQ R11, BX + JMP copy_5_end + +copy_5_move_4through7: + MOVL (R14), R15 + MOVL -4(R14)(R11*1), BP + MOVL R15, (BX) + MOVL BP, -4(BX)(R11*1) + ADDQ R11, R14 + ADDQ R11, BX + JMP copy_5_end + +copy_5_move_8through16: + MOVQ (R14), R15 + MOVQ -8(R14)(R11*1), BP + MOVQ R15, (BX) + MOVQ BP, -8(BX)(R11*1) + ADDQ R11, R14 + ADDQ R11, BX + +copy_5_end: + ADDQ R11, DI + SUBQ R11, R13 + + // Copy match from the current buffer +copy_match: + MOVQ BX, R11 + SUBQ R12, R11 + + // ml <= mo + CMPQ R13, R12 + JA copy_overlapping_match + + // Copy non-overlapping match + ADDQ R13, DI + MOVQ R13, R12 + SUBQ $0x10, R12 + JB copy_2_small + +copy_2_loop: + MOVUPS (R11), X0 + MOVUPS X0, (BX) + ADDQ $0x10, R11 + ADDQ $0x10, BX + SUBQ $0x10, R12 + JAE copy_2_loop + LEAQ 16(R11)(R12*1), R11 + LEAQ 16(BX)(R12*1), BX + MOVUPS -16(R11), X0 + MOVUPS X0, -16(BX) + JMP copy_2_end + +copy_2_small: + CMPQ R13, $0x03 + JE copy_2_move_3 + JB 
copy_2_move_1or2 + CMPQ R13, $0x08 + JB copy_2_move_4through7 + JMP copy_2_move_8through16 + +copy_2_move_1or2: + MOVB (R11), R12 + MOVB -1(R11)(R13*1), R14 + MOVB R12, (BX) + MOVB R14, -1(BX)(R13*1) + ADDQ R13, R11 + ADDQ R13, BX + JMP copy_2_end + +copy_2_move_3: + MOVW (R11), R12 + MOVB 2(R11), R14 + MOVW R12, (BX) + MOVB R14, 2(BX) + ADDQ R13, R11 + ADDQ R13, BX + JMP copy_2_end + +copy_2_move_4through7: + MOVL (R11), R12 + MOVL -4(R11)(R13*1), R14 + MOVL R12, (BX) + MOVL R14, -4(BX)(R13*1) + ADDQ R13, R11 + ADDQ R13, BX + JMP copy_2_end + +copy_2_move_8through16: + MOVQ (R11), R12 + MOVQ -8(R11)(R13*1), R14 + MOVQ R12, (BX) + MOVQ R14, -8(BX)(R13*1) + ADDQ R13, R11 + ADDQ R13, BX + +copy_2_end: + JMP handle_loop + + // Copy overlapping match +copy_overlapping_match: + ADDQ R13, DI + +copy_slow_3: + MOVB (R11), R12 + MOVB R12, (BX) + INCQ R11 + INCQ BX + DECQ R13 + JNZ copy_slow_3 + +handle_loop: + ADDQ $0x18, AX + INCQ DX + CMPQ DX, CX + JB main_loop + +loop_finished: + // Return value + MOVB $0x01, ret+8(FP) + + // Update the context + MOVQ ctx+0(FP), AX + MOVQ DX, 24(AX) + MOVQ DI, 104(AX) + MOVQ 80(AX), CX + SUBQ CX, SI + MOVQ SI, 112(AX) + RET + +error_match_off_too_big: + // Return value + MOVB $0x00, ret+8(FP) + + // Update the context + MOVQ ctx+0(FP), AX + MOVQ DX, 24(AX) + MOVQ DI, 104(AX) + MOVQ 80(AX), CX + SUBQ CX, SI + MOVQ SI, 112(AX) + RET + +empty_seqs: + // Return value + MOVB $0x01, ret+8(FP) + RET + +// func sequenceDecs_decodeSync_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int +// Requires: CMOV, SSE +TEXT ·sequenceDecs_decodeSync_amd64(SB), $64-32 + MOVQ br+8(FP), AX + MOVQ 32(AX), DX + MOVBQZX 40(AX), BX + MOVQ 24(AX), SI + MOVQ (AX), AX + ADDQ SI, AX + MOVQ AX, (SP) + MOVQ ctx+16(FP), AX + MOVQ 72(AX), DI + MOVQ 80(AX), R8 + MOVQ 88(AX), R9 + XORQ CX, CX + MOVQ CX, 8(SP) + MOVQ CX, 16(SP) + MOVQ CX, 24(SP) + MOVQ 112(AX), R10 + MOVQ 128(AX), CX + MOVQ CX, 32(SP) + MOVQ 144(AX), R11 + MOVQ 136(AX), R12 + MOVQ 200(AX), CX + MOVQ CX, 56(SP) + MOVQ 176(AX), CX + MOVQ CX, 48(SP) + MOVQ 184(AX), AX + MOVQ AX, 40(SP) + MOVQ 40(SP), AX + ADDQ AX, 48(SP) + + // Calculate poiter to s.out[cap(s.out)] (a past-end pointer) + ADDQ R10, 32(SP) + + // outBase += outPosition + ADDQ R12, R10 + +sequenceDecs_decodeSync_amd64_main_loop: + MOVQ (SP), R13 + + // Fill bitreader to have enough for the offset and match length. 
+ CMPQ SI, $0x08 + JL sequenceDecs_decodeSync_amd64_fill_byte_by_byte + MOVQ BX, AX + SHRQ $0x03, AX + SUBQ AX, R13 + MOVQ (R13), DX + SUBQ AX, SI + ANDQ $0x07, BX + JMP sequenceDecs_decodeSync_amd64_fill_end + +sequenceDecs_decodeSync_amd64_fill_byte_by_byte: + CMPQ SI, $0x00 + JLE sequenceDecs_decodeSync_amd64_fill_end + CMPQ BX, $0x07 + JLE sequenceDecs_decodeSync_amd64_fill_end + SHLQ $0x08, DX + SUBQ $0x01, R13 + SUBQ $0x01, SI + SUBQ $0x08, BX + MOVBQZX (R13), AX + ORQ AX, DX + JMP sequenceDecs_decodeSync_amd64_fill_byte_by_byte + +sequenceDecs_decodeSync_amd64_fill_end: + // Update offset + MOVQ R9, AX + MOVQ BX, CX + MOVQ DX, R14 + SHLQ CL, R14 + MOVB AH, CL + SHRQ $0x20, AX + TESTQ CX, CX + JZ sequenceDecs_decodeSync_amd64_of_update_zero + ADDQ CX, BX + CMPQ BX, $0x40 + JA sequenceDecs_decodeSync_amd64_of_update_zero + CMPQ CX, $0x40 + JAE sequenceDecs_decodeSync_amd64_of_update_zero + NEGQ CX + SHRQ CL, R14 + ADDQ R14, AX + +sequenceDecs_decodeSync_amd64_of_update_zero: + MOVQ AX, 8(SP) + + // Update match length + MOVQ R8, AX + MOVQ BX, CX + MOVQ DX, R14 + SHLQ CL, R14 + MOVB AH, CL + SHRQ $0x20, AX + TESTQ CX, CX + JZ sequenceDecs_decodeSync_amd64_ml_update_zero + ADDQ CX, BX + CMPQ BX, $0x40 + JA sequenceDecs_decodeSync_amd64_ml_update_zero + CMPQ CX, $0x40 + JAE sequenceDecs_decodeSync_amd64_ml_update_zero + NEGQ CX + SHRQ CL, R14 + ADDQ R14, AX + +sequenceDecs_decodeSync_amd64_ml_update_zero: + MOVQ AX, 16(SP) + + // Fill bitreader to have enough for the remaining + CMPQ SI, $0x08 + JL sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte + MOVQ BX, AX + SHRQ $0x03, AX + SUBQ AX, R13 + MOVQ (R13), DX + SUBQ AX, SI + ANDQ $0x07, BX + JMP sequenceDecs_decodeSync_amd64_fill_2_end + +sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte: + CMPQ SI, $0x00 + JLE sequenceDecs_decodeSync_amd64_fill_2_end + CMPQ BX, $0x07 + JLE sequenceDecs_decodeSync_amd64_fill_2_end + SHLQ $0x08, DX + SUBQ $0x01, R13 + SUBQ $0x01, SI + SUBQ $0x08, BX + MOVBQZX (R13), AX + ORQ AX, DX + JMP sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte + +sequenceDecs_decodeSync_amd64_fill_2_end: + // Update literal length + MOVQ DI, AX + MOVQ BX, CX + MOVQ DX, R14 + SHLQ CL, R14 + MOVB AH, CL + SHRQ $0x20, AX + TESTQ CX, CX + JZ sequenceDecs_decodeSync_amd64_ll_update_zero + ADDQ CX, BX + CMPQ BX, $0x40 + JA sequenceDecs_decodeSync_amd64_ll_update_zero + CMPQ CX, $0x40 + JAE sequenceDecs_decodeSync_amd64_ll_update_zero + NEGQ CX + SHRQ CL, R14 + ADDQ R14, AX + +sequenceDecs_decodeSync_amd64_ll_update_zero: + MOVQ AX, 24(SP) + + // Fill bitreader for state updates + MOVQ R13, (SP) + MOVQ R9, AX + SHRQ $0x08, AX + MOVBQZX AL, AX + MOVQ ctx+16(FP), CX + CMPQ 96(CX), $0x00 + JZ sequenceDecs_decodeSync_amd64_skip_update + + // Update Literal Length State + MOVBQZX DI, R13 + SHRQ $0x10, DI + MOVWQZX DI, DI + LEAQ (BX)(R13*1), CX + MOVQ DX, R14 + MOVQ CX, BX + ROLQ CL, R14 + MOVL $0x00000001, R15 + MOVB R13, CL + SHLL CL, R15 + DECL R15 + ANDQ R15, R14 + ADDQ R14, DI + + // Load ctx.llTable + MOVQ ctx+16(FP), CX + MOVQ (CX), CX + MOVQ (CX)(DI*8), DI + + // Update Match Length State + MOVBQZX R8, R13 + SHRQ $0x10, R8 + MOVWQZX R8, R8 + LEAQ (BX)(R13*1), CX + MOVQ DX, R14 + MOVQ CX, BX + ROLQ CL, R14 + MOVL $0x00000001, R15 + MOVB R13, CL + SHLL CL, R15 + DECL R15 + ANDQ R15, R14 + ADDQ R14, R8 + + // Load ctx.mlTable + MOVQ ctx+16(FP), CX + MOVQ 24(CX), CX + MOVQ (CX)(R8*8), R8 + + // Update Offset State + MOVBQZX R9, R13 + SHRQ $0x10, R9 + MOVWQZX R9, R9 + LEAQ (BX)(R13*1), CX + MOVQ DX, R14 + MOVQ CX, BX + ROLQ CL, R14 + MOVL 
$0x00000001, R15 + MOVB R13, CL + SHLL CL, R15 + DECL R15 + ANDQ R15, R14 + ADDQ R14, R9 + + // Load ctx.ofTable + MOVQ ctx+16(FP), CX + MOVQ 48(CX), CX + MOVQ (CX)(R9*8), R9 + +sequenceDecs_decodeSync_amd64_skip_update: + // Adjust offset + MOVQ s+0(FP), CX + MOVQ 8(SP), R13 + CMPQ AX, $0x01 + JBE sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0 + MOVUPS 144(CX), X0 + MOVQ R13, 144(CX) + MOVUPS X0, 152(CX) + JMP sequenceDecs_decodeSync_amd64_after_adjust + +sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0: + CMPQ 24(SP), $0x00000000 + JNE sequenceDecs_decodeSync_amd64_adjust_offset_maybezero + INCQ R13 + JMP sequenceDecs_decodeSync_amd64_adjust_offset_nonzero + +sequenceDecs_decodeSync_amd64_adjust_offset_maybezero: + TESTQ R13, R13 + JNZ sequenceDecs_decodeSync_amd64_adjust_offset_nonzero + MOVQ 144(CX), R13 + JMP sequenceDecs_decodeSync_amd64_after_adjust + +sequenceDecs_decodeSync_amd64_adjust_offset_nonzero: + MOVQ R13, AX + XORQ R14, R14 + MOVQ $-1, R15 + CMPQ R13, $0x03 + CMOVQEQ R14, AX + CMOVQEQ R15, R14 + ADDQ 144(CX)(AX*8), R14 + JNZ sequenceDecs_decodeSync_amd64_adjust_temp_valid + MOVQ $0x00000001, R14 + +sequenceDecs_decodeSync_amd64_adjust_temp_valid: + CMPQ R13, $0x01 + JZ sequenceDecs_decodeSync_amd64_adjust_skip + MOVQ 152(CX), AX + MOVQ AX, 160(CX) + +sequenceDecs_decodeSync_amd64_adjust_skip: + MOVQ 144(CX), AX + MOVQ AX, 152(CX) + MOVQ R14, 144(CX) + MOVQ R14, R13 + +sequenceDecs_decodeSync_amd64_after_adjust: + MOVQ R13, 8(SP) + + // Check values + MOVQ 16(SP), AX + MOVQ 24(SP), CX + LEAQ (AX)(CX*1), R14 + MOVQ s+0(FP), R15 + ADDQ R14, 256(R15) + MOVQ ctx+16(FP), R14 + SUBQ CX, 104(R14) + JS error_not_enough_literals + CMPQ AX, $0x00020002 + JA sequenceDecs_decodeSync_amd64_error_match_len_too_big + TESTQ R13, R13 + JNZ sequenceDecs_decodeSync_amd64_match_len_ofs_ok + TESTQ AX, AX + JNZ sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch + +sequenceDecs_decodeSync_amd64_match_len_ofs_ok: + MOVQ 24(SP), AX + MOVQ 8(SP), CX + MOVQ 16(SP), R13 + + // Check if we have enough space in s.out + LEAQ (AX)(R13*1), R14 + ADDQ R10, R14 + CMPQ R14, 32(SP) + JA error_not_enough_space + + // Copy literals + TESTQ AX, AX + JZ check_offset + XORQ R14, R14 + +copy_1: + MOVUPS (R11)(R14*1), X0 + MOVUPS X0, (R10)(R14*1) + ADDQ $0x10, R14 + CMPQ R14, AX + JB copy_1 + ADDQ AX, R11 + ADDQ AX, R10 + ADDQ AX, R12 + + // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize) +check_offset: + MOVQ R12, AX + ADDQ 40(SP), AX + CMPQ CX, AX + JG error_match_off_too_big + CMPQ CX, 56(SP) + JG error_match_off_too_big + + // Copy match from history + MOVQ CX, AX + SUBQ R12, AX + JLS copy_match + MOVQ 48(SP), R14 + SUBQ AX, R14 + CMPQ R13, AX + JG copy_all_from_history + MOVQ R13, AX + SUBQ $0x10, AX + JB copy_4_small + +copy_4_loop: + MOVUPS (R14), X0 + MOVUPS X0, (R10) + ADDQ $0x10, R14 + ADDQ $0x10, R10 + SUBQ $0x10, AX + JAE copy_4_loop + LEAQ 16(R14)(AX*1), R14 + LEAQ 16(R10)(AX*1), R10 + MOVUPS -16(R14), X0 + MOVUPS X0, -16(R10) + JMP copy_4_end + +copy_4_small: + CMPQ R13, $0x03 + JE copy_4_move_3 + CMPQ R13, $0x08 + JB copy_4_move_4through7 + JMP copy_4_move_8through16 + +copy_4_move_3: + MOVW (R14), AX + MOVB 2(R14), CL + MOVW AX, (R10) + MOVB CL, 2(R10) + ADDQ R13, R14 + ADDQ R13, R10 + JMP copy_4_end + +copy_4_move_4through7: + MOVL (R14), AX + MOVL -4(R14)(R13*1), CX + MOVL AX, (R10) + MOVL CX, -4(R10)(R13*1) + ADDQ R13, R14 + ADDQ R13, R10 + JMP copy_4_end + +copy_4_move_8through16: + MOVQ (R14), AX + MOVQ -8(R14)(R13*1), CX + MOVQ AX, (R10) + MOVQ CX, 
-8(R10)(R13*1) + ADDQ R13, R14 + ADDQ R13, R10 + +copy_4_end: + ADDQ R13, R12 + JMP handle_loop + JMP loop_finished + +copy_all_from_history: + MOVQ AX, R15 + SUBQ $0x10, R15 + JB copy_5_small + +copy_5_loop: + MOVUPS (R14), X0 + MOVUPS X0, (R10) + ADDQ $0x10, R14 + ADDQ $0x10, R10 + SUBQ $0x10, R15 + JAE copy_5_loop + LEAQ 16(R14)(R15*1), R14 + LEAQ 16(R10)(R15*1), R10 + MOVUPS -16(R14), X0 + MOVUPS X0, -16(R10) + JMP copy_5_end + +copy_5_small: + CMPQ AX, $0x03 + JE copy_5_move_3 + JB copy_5_move_1or2 + CMPQ AX, $0x08 + JB copy_5_move_4through7 + JMP copy_5_move_8through16 + +copy_5_move_1or2: + MOVB (R14), R15 + MOVB -1(R14)(AX*1), BP + MOVB R15, (R10) + MOVB BP, -1(R10)(AX*1) + ADDQ AX, R14 + ADDQ AX, R10 + JMP copy_5_end + +copy_5_move_3: + MOVW (R14), R15 + MOVB 2(R14), BP + MOVW R15, (R10) + MOVB BP, 2(R10) + ADDQ AX, R14 + ADDQ AX, R10 + JMP copy_5_end + +copy_5_move_4through7: + MOVL (R14), R15 + MOVL -4(R14)(AX*1), BP + MOVL R15, (R10) + MOVL BP, -4(R10)(AX*1) + ADDQ AX, R14 + ADDQ AX, R10 + JMP copy_5_end + +copy_5_move_8through16: + MOVQ (R14), R15 + MOVQ -8(R14)(AX*1), BP + MOVQ R15, (R10) + MOVQ BP, -8(R10)(AX*1) + ADDQ AX, R14 + ADDQ AX, R10 + +copy_5_end: + ADDQ AX, R12 + SUBQ AX, R13 + + // Copy match from the current buffer +copy_match: + MOVQ R10, AX + SUBQ CX, AX + + // ml <= mo + CMPQ R13, CX + JA copy_overlapping_match + + // Copy non-overlapping match + ADDQ R13, R12 + MOVQ R10, CX + ADDQ R13, R10 + +copy_2: + MOVUPS (AX), X0 + MOVUPS X0, (CX) + ADDQ $0x10, AX + ADDQ $0x10, CX + SUBQ $0x10, R13 + JHI copy_2 + JMP handle_loop + + // Copy overlapping match +copy_overlapping_match: + ADDQ R13, R12 + +copy_slow_3: + MOVB (AX), CL + MOVB CL, (R10) + INCQ AX + INCQ R10 + DECQ R13 + JNZ copy_slow_3 + +handle_loop: + MOVQ ctx+16(FP), AX + DECQ 96(AX) + JNS sequenceDecs_decodeSync_amd64_main_loop + +loop_finished: + MOVQ br+8(FP), AX + MOVQ DX, 32(AX) + MOVB BL, 40(AX) + MOVQ SI, 24(AX) + + // Update the context + MOVQ ctx+16(FP), AX + MOVQ R12, 136(AX) + MOVQ 144(AX), CX + SUBQ CX, R11 + MOVQ R11, 168(AX) + + // Return success + MOVQ $0x00000000, ret+24(FP) + RET + + // Return with match length error +sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch: + MOVQ 16(SP), AX + MOVQ ctx+16(FP), CX + MOVQ AX, 216(CX) + MOVQ $0x00000001, ret+24(FP) + RET + + // Return with match too long error +sequenceDecs_decodeSync_amd64_error_match_len_too_big: + MOVQ ctx+16(FP), AX + MOVQ 16(SP), CX + MOVQ CX, 216(AX) + MOVQ $0x00000002, ret+24(FP) + RET + + // Return with match offset too long error +error_match_off_too_big: + MOVQ ctx+16(FP), AX + MOVQ 8(SP), CX + MOVQ CX, 224(AX) + MOVQ R12, 136(AX) + MOVQ $0x00000003, ret+24(FP) + RET + + // Return with not enough literals error +error_not_enough_literals: + MOVQ ctx+16(FP), AX + MOVQ 24(SP), CX + MOVQ CX, 208(AX) + MOVQ $0x00000004, ret+24(FP) + RET + + // Return with not enough output space error +error_not_enough_space: + MOVQ ctx+16(FP), AX + MOVQ 24(SP), CX + MOVQ CX, 208(AX) + MOVQ 16(SP), CX + MOVQ CX, 216(AX) + MOVQ R12, 136(AX) + MOVQ $0x00000005, ret+24(FP) + RET + +// func sequenceDecs_decodeSync_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int +// Requires: BMI, BMI2, CMOV, SSE +TEXT ·sequenceDecs_decodeSync_bmi2(SB), $64-32 + MOVQ br+8(FP), CX + MOVQ 32(CX), AX + MOVBQZX 40(CX), DX + MOVQ 24(CX), BX + MOVQ (CX), CX + ADDQ BX, CX + MOVQ CX, (SP) + MOVQ ctx+16(FP), CX + MOVQ 72(CX), SI + MOVQ 80(CX), DI + MOVQ 88(CX), R8 + XORQ R9, R9 + MOVQ R9, 8(SP) + MOVQ R9, 16(SP) + MOVQ R9, 24(SP) + MOVQ 112(CX), 
R9 + MOVQ 128(CX), R10 + MOVQ R10, 32(SP) + MOVQ 144(CX), R10 + MOVQ 136(CX), R11 + MOVQ 200(CX), R12 + MOVQ R12, 56(SP) + MOVQ 176(CX), R12 + MOVQ R12, 48(SP) + MOVQ 184(CX), CX + MOVQ CX, 40(SP) + MOVQ 40(SP), CX + ADDQ CX, 48(SP) + + // Calculate poiter to s.out[cap(s.out)] (a past-end pointer) + ADDQ R9, 32(SP) + + // outBase += outPosition + ADDQ R11, R9 + +sequenceDecs_decodeSync_bmi2_main_loop: + MOVQ (SP), R12 + + // Fill bitreader to have enough for the offset and match length. + CMPQ BX, $0x08 + JL sequenceDecs_decodeSync_bmi2_fill_byte_by_byte + MOVQ DX, CX + SHRQ $0x03, CX + SUBQ CX, R12 + MOVQ (R12), AX + SUBQ CX, BX + ANDQ $0x07, DX + JMP sequenceDecs_decodeSync_bmi2_fill_end + +sequenceDecs_decodeSync_bmi2_fill_byte_by_byte: + CMPQ BX, $0x00 + JLE sequenceDecs_decodeSync_bmi2_fill_end + CMPQ DX, $0x07 + JLE sequenceDecs_decodeSync_bmi2_fill_end + SHLQ $0x08, AX + SUBQ $0x01, R12 + SUBQ $0x01, BX + SUBQ $0x08, DX + MOVBQZX (R12), CX + ORQ CX, AX + JMP sequenceDecs_decodeSync_bmi2_fill_byte_by_byte + +sequenceDecs_decodeSync_bmi2_fill_end: + // Update offset + MOVQ $0x00000808, CX + BEXTRQ CX, R8, R13 + MOVQ AX, R14 + LEAQ (DX)(R13*1), CX + ROLQ CL, R14 + BZHIQ R13, R14, R14 + MOVQ CX, DX + MOVQ R8, CX + SHRQ $0x20, CX + ADDQ R14, CX + MOVQ CX, 8(SP) + + // Update match length + MOVQ $0x00000808, CX + BEXTRQ CX, DI, R13 + MOVQ AX, R14 + LEAQ (DX)(R13*1), CX + ROLQ CL, R14 + BZHIQ R13, R14, R14 + MOVQ CX, DX + MOVQ DI, CX + SHRQ $0x20, CX + ADDQ R14, CX + MOVQ CX, 16(SP) + + // Fill bitreader to have enough for the remaining + CMPQ BX, $0x08 + JL sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte + MOVQ DX, CX + SHRQ $0x03, CX + SUBQ CX, R12 + MOVQ (R12), AX + SUBQ CX, BX + ANDQ $0x07, DX + JMP sequenceDecs_decodeSync_bmi2_fill_2_end + +sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte: + CMPQ BX, $0x00 + JLE sequenceDecs_decodeSync_bmi2_fill_2_end + CMPQ DX, $0x07 + JLE sequenceDecs_decodeSync_bmi2_fill_2_end + SHLQ $0x08, AX + SUBQ $0x01, R12 + SUBQ $0x01, BX + SUBQ $0x08, DX + MOVBQZX (R12), CX + ORQ CX, AX + JMP sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte + +sequenceDecs_decodeSync_bmi2_fill_2_end: + // Update literal length + MOVQ $0x00000808, CX + BEXTRQ CX, SI, R13 + MOVQ AX, R14 + LEAQ (DX)(R13*1), CX + ROLQ CL, R14 + BZHIQ R13, R14, R14 + MOVQ CX, DX + MOVQ SI, CX + SHRQ $0x20, CX + ADDQ R14, CX + MOVQ CX, 24(SP) + + // Fill bitreader for state updates + MOVQ R12, (SP) + MOVQ $0x00000808, CX + BEXTRQ CX, R8, R12 + MOVQ ctx+16(FP), CX + CMPQ 96(CX), $0x00 + JZ sequenceDecs_decodeSync_bmi2_skip_update + LEAQ (SI)(DI*1), R13 + ADDQ R8, R13 + MOVBQZX R13, R13 + LEAQ (DX)(R13*1), CX + MOVQ AX, R14 + MOVQ CX, DX + ROLQ CL, R14 + BZHIQ R13, R14, R14 + + // Update Offset State + BZHIQ R8, R14, CX + SHRXQ R8, R14, R14 + MOVQ $0x00001010, R13 + BEXTRQ R13, R8, R8 + ADDQ CX, R8 + + // Load ctx.ofTable + MOVQ ctx+16(FP), CX + MOVQ 48(CX), CX + MOVQ (CX)(R8*8), R8 + + // Update Match Length State + BZHIQ DI, R14, CX + SHRXQ DI, R14, R14 + MOVQ $0x00001010, R13 + BEXTRQ R13, DI, DI + ADDQ CX, DI + + // Load ctx.mlTable + MOVQ ctx+16(FP), CX + MOVQ 24(CX), CX + MOVQ (CX)(DI*8), DI + + // Update Literal Length State + BZHIQ SI, R14, CX + MOVQ $0x00001010, R13 + BEXTRQ R13, SI, SI + ADDQ CX, SI + + // Load ctx.llTable + MOVQ ctx+16(FP), CX + MOVQ (CX), CX + MOVQ (CX)(SI*8), SI + +sequenceDecs_decodeSync_bmi2_skip_update: + // Adjust offset + MOVQ s+0(FP), CX + MOVQ 8(SP), R13 + CMPQ R12, $0x01 + JBE sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0 + MOVUPS 144(CX), X0 + MOVQ R13, 
144(CX) + MOVUPS X0, 152(CX) + JMP sequenceDecs_decodeSync_bmi2_after_adjust + +sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0: + CMPQ 24(SP), $0x00000000 + JNE sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero + INCQ R13 + JMP sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero + +sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero: + TESTQ R13, R13 + JNZ sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero + MOVQ 144(CX), R13 + JMP sequenceDecs_decodeSync_bmi2_after_adjust + +sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero: + MOVQ R13, R12 + XORQ R14, R14 + MOVQ $-1, R15 + CMPQ R13, $0x03 + CMOVQEQ R14, R12 + CMOVQEQ R15, R14 + ADDQ 144(CX)(R12*8), R14 + JNZ sequenceDecs_decodeSync_bmi2_adjust_temp_valid + MOVQ $0x00000001, R14 + +sequenceDecs_decodeSync_bmi2_adjust_temp_valid: + CMPQ R13, $0x01 + JZ sequenceDecs_decodeSync_bmi2_adjust_skip + MOVQ 152(CX), R12 + MOVQ R12, 160(CX) + +sequenceDecs_decodeSync_bmi2_adjust_skip: + MOVQ 144(CX), R12 + MOVQ R12, 152(CX) + MOVQ R14, 144(CX) + MOVQ R14, R13 + +sequenceDecs_decodeSync_bmi2_after_adjust: + MOVQ R13, 8(SP) + + // Check values + MOVQ 16(SP), CX + MOVQ 24(SP), R12 + LEAQ (CX)(R12*1), R14 + MOVQ s+0(FP), R15 + ADDQ R14, 256(R15) + MOVQ ctx+16(FP), R14 + SUBQ R12, 104(R14) + JS error_not_enough_literals + CMPQ CX, $0x00020002 + JA sequenceDecs_decodeSync_bmi2_error_match_len_too_big + TESTQ R13, R13 + JNZ sequenceDecs_decodeSync_bmi2_match_len_ofs_ok + TESTQ CX, CX + JNZ sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch + +sequenceDecs_decodeSync_bmi2_match_len_ofs_ok: + MOVQ 24(SP), CX + MOVQ 8(SP), R12 + MOVQ 16(SP), R13 + + // Check if we have enough space in s.out + LEAQ (CX)(R13*1), R14 + ADDQ R9, R14 + CMPQ R14, 32(SP) + JA error_not_enough_space + + // Copy literals + TESTQ CX, CX + JZ check_offset + XORQ R14, R14 + +copy_1: + MOVUPS (R10)(R14*1), X0 + MOVUPS X0, (R9)(R14*1) + ADDQ $0x10, R14 + CMPQ R14, CX + JB copy_1 + ADDQ CX, R10 + ADDQ CX, R9 + ADDQ CX, R11 + + // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize) +check_offset: + MOVQ R11, CX + ADDQ 40(SP), CX + CMPQ R12, CX + JG error_match_off_too_big + CMPQ R12, 56(SP) + JG error_match_off_too_big + + // Copy match from history + MOVQ R12, CX + SUBQ R11, CX + JLS copy_match + MOVQ 48(SP), R14 + SUBQ CX, R14 + CMPQ R13, CX + JG copy_all_from_history + MOVQ R13, CX + SUBQ $0x10, CX + JB copy_4_small + +copy_4_loop: + MOVUPS (R14), X0 + MOVUPS X0, (R9) + ADDQ $0x10, R14 + ADDQ $0x10, R9 + SUBQ $0x10, CX + JAE copy_4_loop + LEAQ 16(R14)(CX*1), R14 + LEAQ 16(R9)(CX*1), R9 + MOVUPS -16(R14), X0 + MOVUPS X0, -16(R9) + JMP copy_4_end + +copy_4_small: + CMPQ R13, $0x03 + JE copy_4_move_3 + CMPQ R13, $0x08 + JB copy_4_move_4through7 + JMP copy_4_move_8through16 + +copy_4_move_3: + MOVW (R14), CX + MOVB 2(R14), R12 + MOVW CX, (R9) + MOVB R12, 2(R9) + ADDQ R13, R14 + ADDQ R13, R9 + JMP copy_4_end + +copy_4_move_4through7: + MOVL (R14), CX + MOVL -4(R14)(R13*1), R12 + MOVL CX, (R9) + MOVL R12, -4(R9)(R13*1) + ADDQ R13, R14 + ADDQ R13, R9 + JMP copy_4_end + +copy_4_move_8through16: + MOVQ (R14), CX + MOVQ -8(R14)(R13*1), R12 + MOVQ CX, (R9) + MOVQ R12, -8(R9)(R13*1) + ADDQ R13, R14 + ADDQ R13, R9 + +copy_4_end: + ADDQ R13, R11 + JMP handle_loop + JMP loop_finished + +copy_all_from_history: + MOVQ CX, R15 + SUBQ $0x10, R15 + JB copy_5_small + +copy_5_loop: + MOVUPS (R14), X0 + MOVUPS X0, (R9) + ADDQ $0x10, R14 + ADDQ $0x10, R9 + SUBQ $0x10, R15 + JAE copy_5_loop + LEAQ 16(R14)(R15*1), R14 + LEAQ 16(R9)(R15*1), R9 + MOVUPS -16(R14), X0 + MOVUPS X0, -16(R9) 
+ JMP copy_5_end + +copy_5_small: + CMPQ CX, $0x03 + JE copy_5_move_3 + JB copy_5_move_1or2 + CMPQ CX, $0x08 + JB copy_5_move_4through7 + JMP copy_5_move_8through16 + +copy_5_move_1or2: + MOVB (R14), R15 + MOVB -1(R14)(CX*1), BP + MOVB R15, (R9) + MOVB BP, -1(R9)(CX*1) + ADDQ CX, R14 + ADDQ CX, R9 + JMP copy_5_end + +copy_5_move_3: + MOVW (R14), R15 + MOVB 2(R14), BP + MOVW R15, (R9) + MOVB BP, 2(R9) + ADDQ CX, R14 + ADDQ CX, R9 + JMP copy_5_end + +copy_5_move_4through7: + MOVL (R14), R15 + MOVL -4(R14)(CX*1), BP + MOVL R15, (R9) + MOVL BP, -4(R9)(CX*1) + ADDQ CX, R14 + ADDQ CX, R9 + JMP copy_5_end + +copy_5_move_8through16: + MOVQ (R14), R15 + MOVQ -8(R14)(CX*1), BP + MOVQ R15, (R9) + MOVQ BP, -8(R9)(CX*1) + ADDQ CX, R14 + ADDQ CX, R9 + +copy_5_end: + ADDQ CX, R11 + SUBQ CX, R13 + + // Copy match from the current buffer +copy_match: + MOVQ R9, CX + SUBQ R12, CX + + // ml <= mo + CMPQ R13, R12 + JA copy_overlapping_match + + // Copy non-overlapping match + ADDQ R13, R11 + MOVQ R9, R12 + ADDQ R13, R9 + +copy_2: + MOVUPS (CX), X0 + MOVUPS X0, (R12) + ADDQ $0x10, CX + ADDQ $0x10, R12 + SUBQ $0x10, R13 + JHI copy_2 + JMP handle_loop + + // Copy overlapping match +copy_overlapping_match: + ADDQ R13, R11 + +copy_slow_3: + MOVB (CX), R12 + MOVB R12, (R9) + INCQ CX + INCQ R9 + DECQ R13 + JNZ copy_slow_3 + +handle_loop: + MOVQ ctx+16(FP), CX + DECQ 96(CX) + JNS sequenceDecs_decodeSync_bmi2_main_loop + +loop_finished: + MOVQ br+8(FP), CX + MOVQ AX, 32(CX) + MOVB DL, 40(CX) + MOVQ BX, 24(CX) + + // Update the context + MOVQ ctx+16(FP), AX + MOVQ R11, 136(AX) + MOVQ 144(AX), CX + SUBQ CX, R10 + MOVQ R10, 168(AX) + + // Return success + MOVQ $0x00000000, ret+24(FP) + RET + + // Return with match length error +sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch: + MOVQ 16(SP), AX + MOVQ ctx+16(FP), CX + MOVQ AX, 216(CX) + MOVQ $0x00000001, ret+24(FP) + RET + + // Return with match too long error +sequenceDecs_decodeSync_bmi2_error_match_len_too_big: + MOVQ ctx+16(FP), AX + MOVQ 16(SP), CX + MOVQ CX, 216(AX) + MOVQ $0x00000002, ret+24(FP) + RET + + // Return with match offset too long error +error_match_off_too_big: + MOVQ ctx+16(FP), AX + MOVQ 8(SP), CX + MOVQ CX, 224(AX) + MOVQ R11, 136(AX) + MOVQ $0x00000003, ret+24(FP) + RET + + // Return with not enough literals error +error_not_enough_literals: + MOVQ ctx+16(FP), AX + MOVQ 24(SP), CX + MOVQ CX, 208(AX) + MOVQ $0x00000004, ret+24(FP) + RET + + // Return with not enough output space error +error_not_enough_space: + MOVQ ctx+16(FP), AX + MOVQ 24(SP), CX + MOVQ CX, 208(AX) + MOVQ 16(SP), CX + MOVQ CX, 216(AX) + MOVQ R11, 136(AX) + MOVQ $0x00000005, ret+24(FP) + RET + +// func sequenceDecs_decodeSync_safe_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int +// Requires: CMOV, SSE +TEXT ·sequenceDecs_decodeSync_safe_amd64(SB), $64-32 + MOVQ br+8(FP), AX + MOVQ 32(AX), DX + MOVBQZX 40(AX), BX + MOVQ 24(AX), SI + MOVQ (AX), AX + ADDQ SI, AX + MOVQ AX, (SP) + MOVQ ctx+16(FP), AX + MOVQ 72(AX), DI + MOVQ 80(AX), R8 + MOVQ 88(AX), R9 + XORQ CX, CX + MOVQ CX, 8(SP) + MOVQ CX, 16(SP) + MOVQ CX, 24(SP) + MOVQ 112(AX), R10 + MOVQ 128(AX), CX + MOVQ CX, 32(SP) + MOVQ 144(AX), R11 + MOVQ 136(AX), R12 + MOVQ 200(AX), CX + MOVQ CX, 56(SP) + MOVQ 176(AX), CX + MOVQ CX, 48(SP) + MOVQ 184(AX), AX + MOVQ AX, 40(SP) + MOVQ 40(SP), AX + ADDQ AX, 48(SP) + + // Calculate poiter to s.out[cap(s.out)] (a past-end pointer) + ADDQ R10, 32(SP) + + // outBase += outPosition + ADDQ R12, R10 + +sequenceDecs_decodeSync_safe_amd64_main_loop: + MOVQ (SP), R13 + + 
// Fill bitreader to have enough for the offset and match length. + CMPQ SI, $0x08 + JL sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte + MOVQ BX, AX + SHRQ $0x03, AX + SUBQ AX, R13 + MOVQ (R13), DX + SUBQ AX, SI + ANDQ $0x07, BX + JMP sequenceDecs_decodeSync_safe_amd64_fill_end + +sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte: + CMPQ SI, $0x00 + JLE sequenceDecs_decodeSync_safe_amd64_fill_end + CMPQ BX, $0x07 + JLE sequenceDecs_decodeSync_safe_amd64_fill_end + SHLQ $0x08, DX + SUBQ $0x01, R13 + SUBQ $0x01, SI + SUBQ $0x08, BX + MOVBQZX (R13), AX + ORQ AX, DX + JMP sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte + +sequenceDecs_decodeSync_safe_amd64_fill_end: + // Update offset + MOVQ R9, AX + MOVQ BX, CX + MOVQ DX, R14 + SHLQ CL, R14 + MOVB AH, CL + SHRQ $0x20, AX + TESTQ CX, CX + JZ sequenceDecs_decodeSync_safe_amd64_of_update_zero + ADDQ CX, BX + CMPQ BX, $0x40 + JA sequenceDecs_decodeSync_safe_amd64_of_update_zero + CMPQ CX, $0x40 + JAE sequenceDecs_decodeSync_safe_amd64_of_update_zero + NEGQ CX + SHRQ CL, R14 + ADDQ R14, AX + +sequenceDecs_decodeSync_safe_amd64_of_update_zero: + MOVQ AX, 8(SP) + + // Update match length + MOVQ R8, AX + MOVQ BX, CX + MOVQ DX, R14 + SHLQ CL, R14 + MOVB AH, CL + SHRQ $0x20, AX + TESTQ CX, CX + JZ sequenceDecs_decodeSync_safe_amd64_ml_update_zero + ADDQ CX, BX + CMPQ BX, $0x40 + JA sequenceDecs_decodeSync_safe_amd64_ml_update_zero + CMPQ CX, $0x40 + JAE sequenceDecs_decodeSync_safe_amd64_ml_update_zero + NEGQ CX + SHRQ CL, R14 + ADDQ R14, AX + +sequenceDecs_decodeSync_safe_amd64_ml_update_zero: + MOVQ AX, 16(SP) + + // Fill bitreader to have enough for the remaining + CMPQ SI, $0x08 + JL sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte + MOVQ BX, AX + SHRQ $0x03, AX + SUBQ AX, R13 + MOVQ (R13), DX + SUBQ AX, SI + ANDQ $0x07, BX + JMP sequenceDecs_decodeSync_safe_amd64_fill_2_end + +sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte: + CMPQ SI, $0x00 + JLE sequenceDecs_decodeSync_safe_amd64_fill_2_end + CMPQ BX, $0x07 + JLE sequenceDecs_decodeSync_safe_amd64_fill_2_end + SHLQ $0x08, DX + SUBQ $0x01, R13 + SUBQ $0x01, SI + SUBQ $0x08, BX + MOVBQZX (R13), AX + ORQ AX, DX + JMP sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte + +sequenceDecs_decodeSync_safe_amd64_fill_2_end: + // Update literal length + MOVQ DI, AX + MOVQ BX, CX + MOVQ DX, R14 + SHLQ CL, R14 + MOVB AH, CL + SHRQ $0x20, AX + TESTQ CX, CX + JZ sequenceDecs_decodeSync_safe_amd64_ll_update_zero + ADDQ CX, BX + CMPQ BX, $0x40 + JA sequenceDecs_decodeSync_safe_amd64_ll_update_zero + CMPQ CX, $0x40 + JAE sequenceDecs_decodeSync_safe_amd64_ll_update_zero + NEGQ CX + SHRQ CL, R14 + ADDQ R14, AX + +sequenceDecs_decodeSync_safe_amd64_ll_update_zero: + MOVQ AX, 24(SP) + + // Fill bitreader for state updates + MOVQ R13, (SP) + MOVQ R9, AX + SHRQ $0x08, AX + MOVBQZX AL, AX + MOVQ ctx+16(FP), CX + CMPQ 96(CX), $0x00 + JZ sequenceDecs_decodeSync_safe_amd64_skip_update + + // Update Literal Length State + MOVBQZX DI, R13 + SHRQ $0x10, DI + MOVWQZX DI, DI + LEAQ (BX)(R13*1), CX + MOVQ DX, R14 + MOVQ CX, BX + ROLQ CL, R14 + MOVL $0x00000001, R15 + MOVB R13, CL + SHLL CL, R15 + DECL R15 + ANDQ R15, R14 + ADDQ R14, DI + + // Load ctx.llTable + MOVQ ctx+16(FP), CX + MOVQ (CX), CX + MOVQ (CX)(DI*8), DI + + // Update Match Length State + MOVBQZX R8, R13 + SHRQ $0x10, R8 + MOVWQZX R8, R8 + LEAQ (BX)(R13*1), CX + MOVQ DX, R14 + MOVQ CX, BX + ROLQ CL, R14 + MOVL $0x00000001, R15 + MOVB R13, CL + SHLL CL, R15 + DECL R15 + ANDQ R15, R14 + ADDQ R14, R8 + + // Load ctx.mlTable + MOVQ 
ctx+16(FP), CX + MOVQ 24(CX), CX + MOVQ (CX)(R8*8), R8 + + // Update Offset State + MOVBQZX R9, R13 + SHRQ $0x10, R9 + MOVWQZX R9, R9 + LEAQ (BX)(R13*1), CX + MOVQ DX, R14 + MOVQ CX, BX + ROLQ CL, R14 + MOVL $0x00000001, R15 + MOVB R13, CL + SHLL CL, R15 + DECL R15 + ANDQ R15, R14 + ADDQ R14, R9 + + // Load ctx.ofTable + MOVQ ctx+16(FP), CX + MOVQ 48(CX), CX + MOVQ (CX)(R9*8), R9 + +sequenceDecs_decodeSync_safe_amd64_skip_update: + // Adjust offset + MOVQ s+0(FP), CX + MOVQ 8(SP), R13 + CMPQ AX, $0x01 + JBE sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0 + MOVUPS 144(CX), X0 + MOVQ R13, 144(CX) + MOVUPS X0, 152(CX) + JMP sequenceDecs_decodeSync_safe_amd64_after_adjust + +sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0: + CMPQ 24(SP), $0x00000000 + JNE sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero + INCQ R13 + JMP sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero + +sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero: + TESTQ R13, R13 + JNZ sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero + MOVQ 144(CX), R13 + JMP sequenceDecs_decodeSync_safe_amd64_after_adjust + +sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero: + MOVQ R13, AX + XORQ R14, R14 + MOVQ $-1, R15 + CMPQ R13, $0x03 + CMOVQEQ R14, AX + CMOVQEQ R15, R14 + ADDQ 144(CX)(AX*8), R14 + JNZ sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid + MOVQ $0x00000001, R14 + +sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid: + CMPQ R13, $0x01 + JZ sequenceDecs_decodeSync_safe_amd64_adjust_skip + MOVQ 152(CX), AX + MOVQ AX, 160(CX) + +sequenceDecs_decodeSync_safe_amd64_adjust_skip: + MOVQ 144(CX), AX + MOVQ AX, 152(CX) + MOVQ R14, 144(CX) + MOVQ R14, R13 + +sequenceDecs_decodeSync_safe_amd64_after_adjust: + MOVQ R13, 8(SP) + + // Check values + MOVQ 16(SP), AX + MOVQ 24(SP), CX + LEAQ (AX)(CX*1), R14 + MOVQ s+0(FP), R15 + ADDQ R14, 256(R15) + MOVQ ctx+16(FP), R14 + SUBQ CX, 104(R14) + JS error_not_enough_literals + CMPQ AX, $0x00020002 + JA sequenceDecs_decodeSync_safe_amd64_error_match_len_too_big + TESTQ R13, R13 + JNZ sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok + TESTQ AX, AX + JNZ sequenceDecs_decodeSync_safe_amd64_error_match_len_ofs_mismatch + +sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok: + MOVQ 24(SP), AX + MOVQ 8(SP), CX + MOVQ 16(SP), R13 + + // Check if we have enough space in s.out + LEAQ (AX)(R13*1), R14 + ADDQ R10, R14 + CMPQ R14, 32(SP) + JA error_not_enough_space + + // Copy literals + TESTQ AX, AX + JZ check_offset + MOVQ AX, R14 + SUBQ $0x10, R14 + JB copy_1_small + +copy_1_loop: + MOVUPS (R11), X0 + MOVUPS X0, (R10) + ADDQ $0x10, R11 + ADDQ $0x10, R10 + SUBQ $0x10, R14 + JAE copy_1_loop + LEAQ 16(R11)(R14*1), R11 + LEAQ 16(R10)(R14*1), R10 + MOVUPS -16(R11), X0 + MOVUPS X0, -16(R10) + JMP copy_1_end + +copy_1_small: + CMPQ AX, $0x03 + JE copy_1_move_3 + JB copy_1_move_1or2 + CMPQ AX, $0x08 + JB copy_1_move_4through7 + JMP copy_1_move_8through16 + +copy_1_move_1or2: + MOVB (R11), R14 + MOVB -1(R11)(AX*1), R15 + MOVB R14, (R10) + MOVB R15, -1(R10)(AX*1) + ADDQ AX, R11 + ADDQ AX, R10 + JMP copy_1_end + +copy_1_move_3: + MOVW (R11), R14 + MOVB 2(R11), R15 + MOVW R14, (R10) + MOVB R15, 2(R10) + ADDQ AX, R11 + ADDQ AX, R10 + JMP copy_1_end + +copy_1_move_4through7: + MOVL (R11), R14 + MOVL -4(R11)(AX*1), R15 + MOVL R14, (R10) + MOVL R15, -4(R10)(AX*1) + ADDQ AX, R11 + ADDQ AX, R10 + JMP copy_1_end + +copy_1_move_8through16: + MOVQ (R11), R14 + MOVQ -8(R11)(AX*1), R15 + MOVQ R14, (R10) + MOVQ R15, -8(R10)(AX*1) + ADDQ AX, R11 + ADDQ AX, R10 + 
+copy_1_end: + ADDQ AX, R12 + + // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize) +check_offset: + MOVQ R12, AX + ADDQ 40(SP), AX + CMPQ CX, AX + JG error_match_off_too_big + CMPQ CX, 56(SP) + JG error_match_off_too_big + + // Copy match from history + MOVQ CX, AX + SUBQ R12, AX + JLS copy_match + MOVQ 48(SP), R14 + SUBQ AX, R14 + CMPQ R13, AX + JG copy_all_from_history + MOVQ R13, AX + SUBQ $0x10, AX + JB copy_4_small + +copy_4_loop: + MOVUPS (R14), X0 + MOVUPS X0, (R10) + ADDQ $0x10, R14 + ADDQ $0x10, R10 + SUBQ $0x10, AX + JAE copy_4_loop + LEAQ 16(R14)(AX*1), R14 + LEAQ 16(R10)(AX*1), R10 + MOVUPS -16(R14), X0 + MOVUPS X0, -16(R10) + JMP copy_4_end + +copy_4_small: + CMPQ R13, $0x03 + JE copy_4_move_3 + CMPQ R13, $0x08 + JB copy_4_move_4through7 + JMP copy_4_move_8through16 + +copy_4_move_3: + MOVW (R14), AX + MOVB 2(R14), CL + MOVW AX, (R10) + MOVB CL, 2(R10) + ADDQ R13, R14 + ADDQ R13, R10 + JMP copy_4_end + +copy_4_move_4through7: + MOVL (R14), AX + MOVL -4(R14)(R13*1), CX + MOVL AX, (R10) + MOVL CX, -4(R10)(R13*1) + ADDQ R13, R14 + ADDQ R13, R10 + JMP copy_4_end + +copy_4_move_8through16: + MOVQ (R14), AX + MOVQ -8(R14)(R13*1), CX + MOVQ AX, (R10) + MOVQ CX, -8(R10)(R13*1) + ADDQ R13, R14 + ADDQ R13, R10 + +copy_4_end: + ADDQ R13, R12 + JMP handle_loop + JMP loop_finished + +copy_all_from_history: + MOVQ AX, R15 + SUBQ $0x10, R15 + JB copy_5_small + +copy_5_loop: + MOVUPS (R14), X0 + MOVUPS X0, (R10) + ADDQ $0x10, R14 + ADDQ $0x10, R10 + SUBQ $0x10, R15 + JAE copy_5_loop + LEAQ 16(R14)(R15*1), R14 + LEAQ 16(R10)(R15*1), R10 + MOVUPS -16(R14), X0 + MOVUPS X0, -16(R10) + JMP copy_5_end + +copy_5_small: + CMPQ AX, $0x03 + JE copy_5_move_3 + JB copy_5_move_1or2 + CMPQ AX, $0x08 + JB copy_5_move_4through7 + JMP copy_5_move_8through16 + +copy_5_move_1or2: + MOVB (R14), R15 + MOVB -1(R14)(AX*1), BP + MOVB R15, (R10) + MOVB BP, -1(R10)(AX*1) + ADDQ AX, R14 + ADDQ AX, R10 + JMP copy_5_end + +copy_5_move_3: + MOVW (R14), R15 + MOVB 2(R14), BP + MOVW R15, (R10) + MOVB BP, 2(R10) + ADDQ AX, R14 + ADDQ AX, R10 + JMP copy_5_end + +copy_5_move_4through7: + MOVL (R14), R15 + MOVL -4(R14)(AX*1), BP + MOVL R15, (R10) + MOVL BP, -4(R10)(AX*1) + ADDQ AX, R14 + ADDQ AX, R10 + JMP copy_5_end + +copy_5_move_8through16: + MOVQ (R14), R15 + MOVQ -8(R14)(AX*1), BP + MOVQ R15, (R10) + MOVQ BP, -8(R10)(AX*1) + ADDQ AX, R14 + ADDQ AX, R10 + +copy_5_end: + ADDQ AX, R12 + SUBQ AX, R13 + + // Copy match from the current buffer +copy_match: + MOVQ R10, AX + SUBQ CX, AX + + // ml <= mo + CMPQ R13, CX + JA copy_overlapping_match + + // Copy non-overlapping match + ADDQ R13, R12 + MOVQ R13, CX + SUBQ $0x10, CX + JB copy_2_small + +copy_2_loop: + MOVUPS (AX), X0 + MOVUPS X0, (R10) + ADDQ $0x10, AX + ADDQ $0x10, R10 + SUBQ $0x10, CX + JAE copy_2_loop + LEAQ 16(AX)(CX*1), AX + LEAQ 16(R10)(CX*1), R10 + MOVUPS -16(AX), X0 + MOVUPS X0, -16(R10) + JMP copy_2_end + +copy_2_small: + CMPQ R13, $0x03 + JE copy_2_move_3 + JB copy_2_move_1or2 + CMPQ R13, $0x08 + JB copy_2_move_4through7 + JMP copy_2_move_8through16 + +copy_2_move_1or2: + MOVB (AX), CL + MOVB -1(AX)(R13*1), R14 + MOVB CL, (R10) + MOVB R14, -1(R10)(R13*1) + ADDQ R13, AX + ADDQ R13, R10 + JMP copy_2_end + +copy_2_move_3: + MOVW (AX), CX + MOVB 2(AX), R14 + MOVW CX, (R10) + MOVB R14, 2(R10) + ADDQ R13, AX + ADDQ R13, R10 + JMP copy_2_end + +copy_2_move_4through7: + MOVL (AX), CX + MOVL -4(AX)(R13*1), R14 + MOVL CX, (R10) + MOVL R14, -4(R10)(R13*1) + ADDQ R13, AX + ADDQ R13, R10 + JMP copy_2_end + +copy_2_move_8through16: + MOVQ (AX), CX + MOVQ 
-8(AX)(R13*1), R14 + MOVQ CX, (R10) + MOVQ R14, -8(R10)(R13*1) + ADDQ R13, AX + ADDQ R13, R10 + +copy_2_end: + JMP handle_loop + + // Copy overlapping match +copy_overlapping_match: + ADDQ R13, R12 + +copy_slow_3: + MOVB (AX), CL + MOVB CL, (R10) + INCQ AX + INCQ R10 + DECQ R13 + JNZ copy_slow_3 + +handle_loop: + MOVQ ctx+16(FP), AX + DECQ 96(AX) + JNS sequenceDecs_decodeSync_safe_amd64_main_loop + +loop_finished: + MOVQ br+8(FP), AX + MOVQ DX, 32(AX) + MOVB BL, 40(AX) + MOVQ SI, 24(AX) + + // Update the context + MOVQ ctx+16(FP), AX + MOVQ R12, 136(AX) + MOVQ 144(AX), CX + SUBQ CX, R11 + MOVQ R11, 168(AX) + + // Return success + MOVQ $0x00000000, ret+24(FP) + RET + + // Return with match length error +sequenceDecs_decodeSync_safe_amd64_error_match_len_ofs_mismatch: + MOVQ 16(SP), AX + MOVQ ctx+16(FP), CX + MOVQ AX, 216(CX) + MOVQ $0x00000001, ret+24(FP) + RET + + // Return with match too long error +sequenceDecs_decodeSync_safe_amd64_error_match_len_too_big: + MOVQ ctx+16(FP), AX + MOVQ 16(SP), CX + MOVQ CX, 216(AX) + MOVQ $0x00000002, ret+24(FP) + RET + + // Return with match offset too long error +error_match_off_too_big: + MOVQ ctx+16(FP), AX + MOVQ 8(SP), CX + MOVQ CX, 224(AX) + MOVQ R12, 136(AX) + MOVQ $0x00000003, ret+24(FP) + RET + + // Return with not enough literals error +error_not_enough_literals: + MOVQ ctx+16(FP), AX + MOVQ 24(SP), CX + MOVQ CX, 208(AX) + MOVQ $0x00000004, ret+24(FP) + RET + + // Return with not enough output space error +error_not_enough_space: + MOVQ ctx+16(FP), AX + MOVQ 24(SP), CX + MOVQ CX, 208(AX) + MOVQ 16(SP), CX + MOVQ CX, 216(AX) + MOVQ R12, 136(AX) + MOVQ $0x00000005, ret+24(FP) + RET + +// func sequenceDecs_decodeSync_safe_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int +// Requires: BMI, BMI2, CMOV, SSE +TEXT ·sequenceDecs_decodeSync_safe_bmi2(SB), $64-32 + MOVQ br+8(FP), CX + MOVQ 32(CX), AX + MOVBQZX 40(CX), DX + MOVQ 24(CX), BX + MOVQ (CX), CX + ADDQ BX, CX + MOVQ CX, (SP) + MOVQ ctx+16(FP), CX + MOVQ 72(CX), SI + MOVQ 80(CX), DI + MOVQ 88(CX), R8 + XORQ R9, R9 + MOVQ R9, 8(SP) + MOVQ R9, 16(SP) + MOVQ R9, 24(SP) + MOVQ 112(CX), R9 + MOVQ 128(CX), R10 + MOVQ R10, 32(SP) + MOVQ 144(CX), R10 + MOVQ 136(CX), R11 + MOVQ 200(CX), R12 + MOVQ R12, 56(SP) + MOVQ 176(CX), R12 + MOVQ R12, 48(SP) + MOVQ 184(CX), CX + MOVQ CX, 40(SP) + MOVQ 40(SP), CX + ADDQ CX, 48(SP) + + // Calculate poiter to s.out[cap(s.out)] (a past-end pointer) + ADDQ R9, 32(SP) + + // outBase += outPosition + ADDQ R11, R9 + +sequenceDecs_decodeSync_safe_bmi2_main_loop: + MOVQ (SP), R12 + + // Fill bitreader to have enough for the offset and match length. 
+ CMPQ BX, $0x08 + JL sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte + MOVQ DX, CX + SHRQ $0x03, CX + SUBQ CX, R12 + MOVQ (R12), AX + SUBQ CX, BX + ANDQ $0x07, DX + JMP sequenceDecs_decodeSync_safe_bmi2_fill_end + +sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte: + CMPQ BX, $0x00 + JLE sequenceDecs_decodeSync_safe_bmi2_fill_end + CMPQ DX, $0x07 + JLE sequenceDecs_decodeSync_safe_bmi2_fill_end + SHLQ $0x08, AX + SUBQ $0x01, R12 + SUBQ $0x01, BX + SUBQ $0x08, DX + MOVBQZX (R12), CX + ORQ CX, AX + JMP sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte + +sequenceDecs_decodeSync_safe_bmi2_fill_end: + // Update offset + MOVQ $0x00000808, CX + BEXTRQ CX, R8, R13 + MOVQ AX, R14 + LEAQ (DX)(R13*1), CX + ROLQ CL, R14 + BZHIQ R13, R14, R14 + MOVQ CX, DX + MOVQ R8, CX + SHRQ $0x20, CX + ADDQ R14, CX + MOVQ CX, 8(SP) + + // Update match length + MOVQ $0x00000808, CX + BEXTRQ CX, DI, R13 + MOVQ AX, R14 + LEAQ (DX)(R13*1), CX + ROLQ CL, R14 + BZHIQ R13, R14, R14 + MOVQ CX, DX + MOVQ DI, CX + SHRQ $0x20, CX + ADDQ R14, CX + MOVQ CX, 16(SP) + + // Fill bitreader to have enough for the remaining + CMPQ BX, $0x08 + JL sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte + MOVQ DX, CX + SHRQ $0x03, CX + SUBQ CX, R12 + MOVQ (R12), AX + SUBQ CX, BX + ANDQ $0x07, DX + JMP sequenceDecs_decodeSync_safe_bmi2_fill_2_end + +sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte: + CMPQ BX, $0x00 + JLE sequenceDecs_decodeSync_safe_bmi2_fill_2_end + CMPQ DX, $0x07 + JLE sequenceDecs_decodeSync_safe_bmi2_fill_2_end + SHLQ $0x08, AX + SUBQ $0x01, R12 + SUBQ $0x01, BX + SUBQ $0x08, DX + MOVBQZX (R12), CX + ORQ CX, AX + JMP sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte + +sequenceDecs_decodeSync_safe_bmi2_fill_2_end: + // Update literal length + MOVQ $0x00000808, CX + BEXTRQ CX, SI, R13 + MOVQ AX, R14 + LEAQ (DX)(R13*1), CX + ROLQ CL, R14 + BZHIQ R13, R14, R14 + MOVQ CX, DX + MOVQ SI, CX + SHRQ $0x20, CX + ADDQ R14, CX + MOVQ CX, 24(SP) + + // Fill bitreader for state updates + MOVQ R12, (SP) + MOVQ $0x00000808, CX + BEXTRQ CX, R8, R12 + MOVQ ctx+16(FP), CX + CMPQ 96(CX), $0x00 + JZ sequenceDecs_decodeSync_safe_bmi2_skip_update + LEAQ (SI)(DI*1), R13 + ADDQ R8, R13 + MOVBQZX R13, R13 + LEAQ (DX)(R13*1), CX + MOVQ AX, R14 + MOVQ CX, DX + ROLQ CL, R14 + BZHIQ R13, R14, R14 + + // Update Offset State + BZHIQ R8, R14, CX + SHRXQ R8, R14, R14 + MOVQ $0x00001010, R13 + BEXTRQ R13, R8, R8 + ADDQ CX, R8 + + // Load ctx.ofTable + MOVQ ctx+16(FP), CX + MOVQ 48(CX), CX + MOVQ (CX)(R8*8), R8 + + // Update Match Length State + BZHIQ DI, R14, CX + SHRXQ DI, R14, R14 + MOVQ $0x00001010, R13 + BEXTRQ R13, DI, DI + ADDQ CX, DI + + // Load ctx.mlTable + MOVQ ctx+16(FP), CX + MOVQ 24(CX), CX + MOVQ (CX)(DI*8), DI + + // Update Literal Length State + BZHIQ SI, R14, CX + MOVQ $0x00001010, R13 + BEXTRQ R13, SI, SI + ADDQ CX, SI + + // Load ctx.llTable + MOVQ ctx+16(FP), CX + MOVQ (CX), CX + MOVQ (CX)(SI*8), SI + +sequenceDecs_decodeSync_safe_bmi2_skip_update: + // Adjust offset + MOVQ s+0(FP), CX + MOVQ 8(SP), R13 + CMPQ R12, $0x01 + JBE sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0 + MOVUPS 144(CX), X0 + MOVQ R13, 144(CX) + MOVUPS X0, 152(CX) + JMP sequenceDecs_decodeSync_safe_bmi2_after_adjust + +sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0: + CMPQ 24(SP), $0x00000000 + JNE sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero + INCQ R13 + JMP sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero + +sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero: + TESTQ R13, R13 + JNZ 
sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero + MOVQ 144(CX), R13 + JMP sequenceDecs_decodeSync_safe_bmi2_after_adjust + +sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero: + MOVQ R13, R12 + XORQ R14, R14 + MOVQ $-1, R15 + CMPQ R13, $0x03 + CMOVQEQ R14, R12 + CMOVQEQ R15, R14 + ADDQ 144(CX)(R12*8), R14 + JNZ sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid + MOVQ $0x00000001, R14 + +sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid: + CMPQ R13, $0x01 + JZ sequenceDecs_decodeSync_safe_bmi2_adjust_skip + MOVQ 152(CX), R12 + MOVQ R12, 160(CX) + +sequenceDecs_decodeSync_safe_bmi2_adjust_skip: + MOVQ 144(CX), R12 + MOVQ R12, 152(CX) + MOVQ R14, 144(CX) + MOVQ R14, R13 + +sequenceDecs_decodeSync_safe_bmi2_after_adjust: + MOVQ R13, 8(SP) + + // Check values + MOVQ 16(SP), CX + MOVQ 24(SP), R12 + LEAQ (CX)(R12*1), R14 + MOVQ s+0(FP), R15 + ADDQ R14, 256(R15) + MOVQ ctx+16(FP), R14 + SUBQ R12, 104(R14) + JS error_not_enough_literals + CMPQ CX, $0x00020002 + JA sequenceDecs_decodeSync_safe_bmi2_error_match_len_too_big + TESTQ R13, R13 + JNZ sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok + TESTQ CX, CX + JNZ sequenceDecs_decodeSync_safe_bmi2_error_match_len_ofs_mismatch + +sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok: + MOVQ 24(SP), CX + MOVQ 8(SP), R12 + MOVQ 16(SP), R13 + + // Check if we have enough space in s.out + LEAQ (CX)(R13*1), R14 + ADDQ R9, R14 + CMPQ R14, 32(SP) + JA error_not_enough_space + + // Copy literals + TESTQ CX, CX + JZ check_offset + MOVQ CX, R14 + SUBQ $0x10, R14 + JB copy_1_small + +copy_1_loop: + MOVUPS (R10), X0 + MOVUPS X0, (R9) + ADDQ $0x10, R10 + ADDQ $0x10, R9 + SUBQ $0x10, R14 + JAE copy_1_loop + LEAQ 16(R10)(R14*1), R10 + LEAQ 16(R9)(R14*1), R9 + MOVUPS -16(R10), X0 + MOVUPS X0, -16(R9) + JMP copy_1_end + +copy_1_small: + CMPQ CX, $0x03 + JE copy_1_move_3 + JB copy_1_move_1or2 + CMPQ CX, $0x08 + JB copy_1_move_4through7 + JMP copy_1_move_8through16 + +copy_1_move_1or2: + MOVB (R10), R14 + MOVB -1(R10)(CX*1), R15 + MOVB R14, (R9) + MOVB R15, -1(R9)(CX*1) + ADDQ CX, R10 + ADDQ CX, R9 + JMP copy_1_end + +copy_1_move_3: + MOVW (R10), R14 + MOVB 2(R10), R15 + MOVW R14, (R9) + MOVB R15, 2(R9) + ADDQ CX, R10 + ADDQ CX, R9 + JMP copy_1_end + +copy_1_move_4through7: + MOVL (R10), R14 + MOVL -4(R10)(CX*1), R15 + MOVL R14, (R9) + MOVL R15, -4(R9)(CX*1) + ADDQ CX, R10 + ADDQ CX, R9 + JMP copy_1_end + +copy_1_move_8through16: + MOVQ (R10), R14 + MOVQ -8(R10)(CX*1), R15 + MOVQ R14, (R9) + MOVQ R15, -8(R9)(CX*1) + ADDQ CX, R10 + ADDQ CX, R9 + +copy_1_end: + ADDQ CX, R11 + + // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize) +check_offset: + MOVQ R11, CX + ADDQ 40(SP), CX + CMPQ R12, CX + JG error_match_off_too_big + CMPQ R12, 56(SP) + JG error_match_off_too_big + + // Copy match from history + MOVQ R12, CX + SUBQ R11, CX + JLS copy_match + MOVQ 48(SP), R14 + SUBQ CX, R14 + CMPQ R13, CX + JG copy_all_from_history + MOVQ R13, CX + SUBQ $0x10, CX + JB copy_4_small + +copy_4_loop: + MOVUPS (R14), X0 + MOVUPS X0, (R9) + ADDQ $0x10, R14 + ADDQ $0x10, R9 + SUBQ $0x10, CX + JAE copy_4_loop + LEAQ 16(R14)(CX*1), R14 + LEAQ 16(R9)(CX*1), R9 + MOVUPS -16(R14), X0 + MOVUPS X0, -16(R9) + JMP copy_4_end + +copy_4_small: + CMPQ R13, $0x03 + JE copy_4_move_3 + CMPQ R13, $0x08 + JB copy_4_move_4through7 + JMP copy_4_move_8through16 + +copy_4_move_3: + MOVW (R14), CX + MOVB 2(R14), R12 + MOVW CX, (R9) + MOVB R12, 2(R9) + ADDQ R13, R14 + ADDQ R13, R9 + JMP copy_4_end + +copy_4_move_4through7: + MOVL (R14), CX + MOVL -4(R14)(R13*1), R12 + MOVL CX, (R9) + 
MOVL R12, -4(R9)(R13*1) + ADDQ R13, R14 + ADDQ R13, R9 + JMP copy_4_end + +copy_4_move_8through16: + MOVQ (R14), CX + MOVQ -8(R14)(R13*1), R12 + MOVQ CX, (R9) + MOVQ R12, -8(R9)(R13*1) + ADDQ R13, R14 + ADDQ R13, R9 + +copy_4_end: + ADDQ R13, R11 + JMP handle_loop + JMP loop_finished + +copy_all_from_history: + MOVQ CX, R15 + SUBQ $0x10, R15 + JB copy_5_small + +copy_5_loop: + MOVUPS (R14), X0 + MOVUPS X0, (R9) + ADDQ $0x10, R14 + ADDQ $0x10, R9 + SUBQ $0x10, R15 + JAE copy_5_loop + LEAQ 16(R14)(R15*1), R14 + LEAQ 16(R9)(R15*1), R9 + MOVUPS -16(R14), X0 + MOVUPS X0, -16(R9) + JMP copy_5_end + +copy_5_small: + CMPQ CX, $0x03 + JE copy_5_move_3 + JB copy_5_move_1or2 + CMPQ CX, $0x08 + JB copy_5_move_4through7 + JMP copy_5_move_8through16 + +copy_5_move_1or2: + MOVB (R14), R15 + MOVB -1(R14)(CX*1), BP + MOVB R15, (R9) + MOVB BP, -1(R9)(CX*1) + ADDQ CX, R14 + ADDQ CX, R9 + JMP copy_5_end + +copy_5_move_3: + MOVW (R14), R15 + MOVB 2(R14), BP + MOVW R15, (R9) + MOVB BP, 2(R9) + ADDQ CX, R14 + ADDQ CX, R9 + JMP copy_5_end + +copy_5_move_4through7: + MOVL (R14), R15 + MOVL -4(R14)(CX*1), BP + MOVL R15, (R9) + MOVL BP, -4(R9)(CX*1) + ADDQ CX, R14 + ADDQ CX, R9 + JMP copy_5_end + +copy_5_move_8through16: + MOVQ (R14), R15 + MOVQ -8(R14)(CX*1), BP + MOVQ R15, (R9) + MOVQ BP, -8(R9)(CX*1) + ADDQ CX, R14 + ADDQ CX, R9 + +copy_5_end: + ADDQ CX, R11 + SUBQ CX, R13 + + // Copy match from the current buffer +copy_match: + MOVQ R9, CX + SUBQ R12, CX + + // ml <= mo + CMPQ R13, R12 + JA copy_overlapping_match + + // Copy non-overlapping match + ADDQ R13, R11 + MOVQ R13, R12 + SUBQ $0x10, R12 + JB copy_2_small + +copy_2_loop: + MOVUPS (CX), X0 + MOVUPS X0, (R9) + ADDQ $0x10, CX + ADDQ $0x10, R9 + SUBQ $0x10, R12 + JAE copy_2_loop + LEAQ 16(CX)(R12*1), CX + LEAQ 16(R9)(R12*1), R9 + MOVUPS -16(CX), X0 + MOVUPS X0, -16(R9) + JMP copy_2_end + +copy_2_small: + CMPQ R13, $0x03 + JE copy_2_move_3 + JB copy_2_move_1or2 + CMPQ R13, $0x08 + JB copy_2_move_4through7 + JMP copy_2_move_8through16 + +copy_2_move_1or2: + MOVB (CX), R12 + MOVB -1(CX)(R13*1), R14 + MOVB R12, (R9) + MOVB R14, -1(R9)(R13*1) + ADDQ R13, CX + ADDQ R13, R9 + JMP copy_2_end + +copy_2_move_3: + MOVW (CX), R12 + MOVB 2(CX), R14 + MOVW R12, (R9) + MOVB R14, 2(R9) + ADDQ R13, CX + ADDQ R13, R9 + JMP copy_2_end + +copy_2_move_4through7: + MOVL (CX), R12 + MOVL -4(CX)(R13*1), R14 + MOVL R12, (R9) + MOVL R14, -4(R9)(R13*1) + ADDQ R13, CX + ADDQ R13, R9 + JMP copy_2_end + +copy_2_move_8through16: + MOVQ (CX), R12 + MOVQ -8(CX)(R13*1), R14 + MOVQ R12, (R9) + MOVQ R14, -8(R9)(R13*1) + ADDQ R13, CX + ADDQ R13, R9 + +copy_2_end: + JMP handle_loop + + // Copy overlapping match +copy_overlapping_match: + ADDQ R13, R11 + +copy_slow_3: + MOVB (CX), R12 + MOVB R12, (R9) + INCQ CX + INCQ R9 + DECQ R13 + JNZ copy_slow_3 + +handle_loop: + MOVQ ctx+16(FP), CX + DECQ 96(CX) + JNS sequenceDecs_decodeSync_safe_bmi2_main_loop + +loop_finished: + MOVQ br+8(FP), CX + MOVQ AX, 32(CX) + MOVB DL, 40(CX) + MOVQ BX, 24(CX) + + // Update the context + MOVQ ctx+16(FP), AX + MOVQ R11, 136(AX) + MOVQ 144(AX), CX + SUBQ CX, R10 + MOVQ R10, 168(AX) + + // Return success + MOVQ $0x00000000, ret+24(FP) + RET + + // Return with match length error +sequenceDecs_decodeSync_safe_bmi2_error_match_len_ofs_mismatch: + MOVQ 16(SP), AX + MOVQ ctx+16(FP), CX + MOVQ AX, 216(CX) + MOVQ $0x00000001, ret+24(FP) + RET + + // Return with match too long error +sequenceDecs_decodeSync_safe_bmi2_error_match_len_too_big: + MOVQ ctx+16(FP), AX + MOVQ 16(SP), CX + MOVQ CX, 216(AX) + MOVQ $0x00000002, 
ret+24(FP) + RET + + // Return with match offset too long error +error_match_off_too_big: + MOVQ ctx+16(FP), AX + MOVQ 8(SP), CX + MOVQ CX, 224(AX) + MOVQ R11, 136(AX) + MOVQ $0x00000003, ret+24(FP) + RET + + // Return with not enough literals error +error_not_enough_literals: + MOVQ ctx+16(FP), AX + MOVQ 24(SP), CX + MOVQ CX, 208(AX) + MOVQ $0x00000004, ret+24(FP) + RET + + // Return with not enough output space error +error_not_enough_space: + MOVQ ctx+16(FP), AX + MOVQ 24(SP), CX + MOVQ CX, 208(AX) + MOVQ 16(SP), CX + MOVQ CX, 216(AX) + MOVQ R11, 136(AX) + MOVQ $0x00000005, ret+24(FP) + RET diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec_generic.go b/vendor/github.com/klauspost/compress/zstd/seqdec_generic.go new file mode 100644 index 000000000000..ac2a80d29111 --- /dev/null +++ b/vendor/github.com/klauspost/compress/zstd/seqdec_generic.go @@ -0,0 +1,237 @@ +//go:build !amd64 || appengine || !gc || noasm +// +build !amd64 appengine !gc noasm + +package zstd + +import ( + "fmt" + "io" +) + +// decode sequences from the stream with the provided history but without dictionary. +func (s *sequenceDecs) decodeSyncSimple(hist []byte) (bool, error) { + return false, nil +} + +// decode sequences from the stream without the provided history. +func (s *sequenceDecs) decode(seqs []seqVals) error { + br := s.br + + // Grab full sizes tables, to avoid bounds checks. + llTable, mlTable, ofTable := s.litLengths.fse.dt[:maxTablesize], s.matchLengths.fse.dt[:maxTablesize], s.offsets.fse.dt[:maxTablesize] + llState, mlState, ofState := s.litLengths.state.state, s.matchLengths.state.state, s.offsets.state.state + s.seqSize = 0 + litRemain := len(s.literals) + + maxBlockSize := maxCompressedBlockSize + if s.windowSize < maxBlockSize { + maxBlockSize = s.windowSize + } + for i := range seqs { + var ll, mo, ml int + if br.off > 4+((maxOffsetBits+16+16)>>3) { + // inlined function: + // ll, mo, ml = s.nextFast(br, llState, mlState, ofState) + + // Final will not read from stream. + var llB, mlB, moB uint8 + ll, llB = llState.final() + ml, mlB = mlState.final() + mo, moB = ofState.final() + + // extra bits are stored in reverse order. + br.fillFast() + mo += br.getBits(moB) + if s.maxBits > 32 { + br.fillFast() + } + ml += br.getBits(mlB) + ll += br.getBits(llB) + + if moB > 1 { + s.prevOffset[2] = s.prevOffset[1] + s.prevOffset[1] = s.prevOffset[0] + s.prevOffset[0] = mo + } else { + // mo = s.adjustOffset(mo, ll, moB) + // Inlined for rather big speedup + if ll == 0 { + // There is an exception though, when current sequence's literals_length = 0. + // In this case, repeated offsets are shifted by one, so an offset_value of 1 means Repeated_Offset2, + // an offset_value of 2 means Repeated_Offset3, and an offset_value of 3 means Repeated_Offset1 - 1_byte. 
+ mo++ + } + + if mo == 0 { + mo = s.prevOffset[0] + } else { + var temp int + if mo == 3 { + temp = s.prevOffset[0] - 1 + } else { + temp = s.prevOffset[mo] + } + + if temp == 0 { + // 0 is not valid; input is corrupted; force offset to 1 + println("WARNING: temp was 0") + temp = 1 + } + + if mo != 1 { + s.prevOffset[2] = s.prevOffset[1] + } + s.prevOffset[1] = s.prevOffset[0] + s.prevOffset[0] = temp + mo = temp + } + } + br.fillFast() + } else { + if br.overread() { + if debugDecoder { + printf("reading sequence %d, exceeded available data\n", i) + } + return io.ErrUnexpectedEOF + } + ll, mo, ml = s.next(br, llState, mlState, ofState) + br.fill() + } + + if debugSequences { + println("Seq", i, "Litlen:", ll, "mo:", mo, "(abs) ml:", ml) + } + // Evaluate. + // We might be doing this async, so do it early. + if mo == 0 && ml > 0 { + return fmt.Errorf("zero matchoff and matchlen (%d) > 0", ml) + } + if ml > maxMatchLen { + return fmt.Errorf("match len (%d) bigger than max allowed length", ml) + } + s.seqSize += ll + ml + if s.seqSize > maxBlockSize { + return fmt.Errorf("output bigger than max block size (%d)", maxBlockSize) + } + litRemain -= ll + if litRemain < 0 { + return fmt.Errorf("unexpected literal count, want %d bytes, but only %d is available", ll, litRemain+ll) + } + seqs[i] = seqVals{ + ll: ll, + ml: ml, + mo: mo, + } + if i == len(seqs)-1 { + // This is the last sequence, so we shouldn't update state. + break + } + + // Manually inlined, ~ 5-20% faster + // Update all 3 states at once. Approx 20% faster. + nBits := llState.nbBits() + mlState.nbBits() + ofState.nbBits() + if nBits == 0 { + llState = llTable[llState.newState()&maxTableMask] + mlState = mlTable[mlState.newState()&maxTableMask] + ofState = ofTable[ofState.newState()&maxTableMask] + } else { + bits := br.get32BitsFast(nBits) + lowBits := uint16(bits >> ((ofState.nbBits() + mlState.nbBits()) & 31)) + llState = llTable[(llState.newState()+lowBits)&maxTableMask] + + lowBits = uint16(bits >> (ofState.nbBits() & 31)) + lowBits &= bitMask[mlState.nbBits()&15] + mlState = mlTable[(mlState.newState()+lowBits)&maxTableMask] + + lowBits = uint16(bits) & bitMask[ofState.nbBits()&15] + ofState = ofTable[(ofState.newState()+lowBits)&maxTableMask] + } + } + s.seqSize += litRemain + if s.seqSize > maxBlockSize { + return fmt.Errorf("output bigger than max block size (%d)", maxBlockSize) + } + err := br.close() + if err != nil { + printf("Closing sequences: %v, %+v\n", err, *br) + } + return err +} + +// executeSimple handles cases when a dictionary is not used. +func (s *sequenceDecs) executeSimple(seqs []seqVals, hist []byte) error { + // Ensure we have enough output size... + if len(s.out)+s.seqSize > cap(s.out) { + addBytes := s.seqSize + len(s.out) + s.out = append(s.out, make([]byte, addBytes)...) + s.out = s.out[:len(s.out)-addBytes] + } + + if debugDecoder { + printf("Execute %d seqs with literals: %d into %d bytes\n", len(seqs), len(s.literals), s.seqSize) + } + + var t = len(s.out) + out := s.out[:t+s.seqSize] + + for _, seq := range seqs { + // Add literals + copy(out[t:], s.literals[:seq.ll]) + t += seq.ll + s.literals = s.literals[seq.ll:] + + // Malformed input + if seq.mo > t+len(hist) || seq.mo > s.windowSize { + return fmt.Errorf("match offset (%d) bigger than current history (%d)", seq.mo, t+len(hist)) + } + + // Copy from history. + if v := seq.mo - t; v > 0 { + // v is the start position in history from end. + start := len(hist) - v + if seq.ml > v { + // Some goes into the current block. 
+ // Copy remainder of history + copy(out[t:], hist[start:]) + t += v + seq.ml -= v + } else { + copy(out[t:], hist[start:start+seq.ml]) + t += seq.ml + continue + } + } + + // We must be in the current buffer now + if seq.ml > 0 { + start := t - seq.mo + if seq.ml <= t-start { + // No overlap + copy(out[t:], out[start:start+seq.ml]) + t += seq.ml + } else { + // Overlapping copy + // Extend destination slice and copy one byte at the time. + src := out[start : start+seq.ml] + dst := out[t:] + dst = dst[:len(src)] + t += len(src) + // Destination is the space we just added. + for i := range src { + dst[i] = src[i] + } + } + } + } + // Add final literals + copy(out[t:], s.literals) + if debugDecoder { + t += len(s.literals) + if t != len(out) { + panic(fmt.Errorf("length mismatch, want %d, got %d, ss: %d", len(out), t, s.seqSize)) + } + } + s.out = out + + return nil +} diff --git a/vendor/github.com/klauspost/compress/zstd/zip.go b/vendor/github.com/klauspost/compress/zstd/zip.go index 967f29b3120e..29c15c8c4efe 100644 --- a/vendor/github.com/klauspost/compress/zstd/zip.go +++ b/vendor/github.com/klauspost/compress/zstd/zip.go @@ -18,36 +18,58 @@ const ZipMethodWinZip = 93 // See https://pkware.cachefly.net/webdocs/APPNOTE/APPNOTE-6.3.9.TXT const ZipMethodPKWare = 20 -var zipReaderPool sync.Pool +// zipReaderPool is the default reader pool. +var zipReaderPool = sync.Pool{New: func() interface{} { + z, err := NewReader(nil, WithDecoderLowmem(true), WithDecoderMaxWindow(128<<20), WithDecoderConcurrency(1)) + if err != nil { + panic(err) + } + return z +}} -// newZipReader cannot be used since we would leak goroutines... -func newZipReader(r io.Reader) io.ReadCloser { - dec, ok := zipReaderPool.Get().(*Decoder) - if ok { - dec.Reset(r) - } else { - d, err := NewReader(r, WithDecoderConcurrency(1), WithDecoderLowmem(true)) - if err != nil { - panic(err) +// newZipReader creates a pooled zip decompressor. +func newZipReader(opts ...DOption) func(r io.Reader) io.ReadCloser { + pool := &zipReaderPool + if len(opts) > 0 { + opts = append([]DOption{WithDecoderLowmem(true), WithDecoderMaxWindow(128 << 20)}, opts...) + // Force concurrency 1 + opts = append(opts, WithDecoderConcurrency(1)) + // Create our own pool + pool = &sync.Pool{} + } + return func(r io.Reader) io.ReadCloser { + dec, ok := pool.Get().(*Decoder) + if ok { + dec.Reset(r) + } else { + d, err := NewReader(r, opts...) + if err != nil { + panic(err) + } + dec = d } - dec = d + return &pooledZipReader{dec: dec, pool: pool} } - return &pooledZipReader{dec: dec} } type pooledZipReader struct { - mu sync.Mutex // guards Close and Read - dec *Decoder + mu sync.Mutex // guards Close and Read + pool *sync.Pool + dec *Decoder } func (r *pooledZipReader) Read(p []byte) (n int, err error) { r.mu.Lock() defer r.mu.Unlock() if r.dec == nil { - return 0, errors.New("Read after Close") + return 0, errors.New("read after close or EOF") } dec, err := r.dec.Read(p) - + if err == io.EOF { + r.dec.Reset(nil) + r.pool.Put(r.dec) + r.dec = nil + } return dec, err } @@ -57,7 +79,7 @@ func (r *pooledZipReader) Close() error { var err error if r.dec != nil { err = r.dec.Reset(nil) - zipReaderPool.Put(r.dec) + r.pool.Put(r.dec) r.dec = nil } return err @@ -111,12 +133,9 @@ func ZipCompressor(opts ...EOption) func(w io.Writer) (io.WriteCloser, error) { // ZipDecompressor returns a decompressor that can be registered with zip libraries. // See ZipCompressor for example. 
-func ZipDecompressor() func(r io.Reader) io.ReadCloser { - return func(r io.Reader) io.ReadCloser { - d, err := NewReader(r, WithDecoderConcurrency(1), WithDecoderLowmem(true)) - if err != nil { - panic(err) - } - return d.IOReadCloser() - } +// Options can be specified. WithDecoderConcurrency(1) is forced, +// and by default a 128MB maximum decompression window is specified. +// The window size can be overridden if required. +func ZipDecompressor(opts ...DOption) func(r io.Reader) io.ReadCloser { + return newZipReader(opts...) } diff --git a/vendor/github.com/klauspost/compress/zstd/zstd.go b/vendor/github.com/klauspost/compress/zstd/zstd.go index ef1d49a009cc..3eb3f1c82661 100644 --- a/vendor/github.com/klauspost/compress/zstd/zstd.go +++ b/vendor/github.com/klauspost/compress/zstd/zstd.go @@ -39,6 +39,9 @@ const zstdMinMatch = 3 // Reset the buffer offset when reaching this. const bufferReset = math.MaxInt32 - MaxWindowSize +// fcsUnknown is used for unknown frame content size. +const fcsUnknown = math.MaxUint64 + var ( // ErrReservedBlockType is returned when a reserved block type is found. // Typically this indicates wrong or corrupted input. @@ -52,6 +55,10 @@ var ( // Typically returned on invalid input. ErrBlockTooSmall = errors.New("block too small") + // ErrUnexpectedBlockSize is returned when a block has unexpected size. + // Typically returned on invalid input. + ErrUnexpectedBlockSize = errors.New("unexpected block size") + // ErrMagicMismatch is returned when a "magic" number isn't what is expected. // Typically this indicates wrong or corrupted input. ErrMagicMismatch = errors.New("invalid input: magic number mismatch") @@ -75,6 +82,10 @@ var ( // This is only returned if SingleSegment is specified on the frame. ErrFrameSizeExceeded = errors.New("frame size exceeded") + // ErrFrameSizeMismatch is returned if the stated frame size does not match the expected size. + // This is only returned if SingleSegment is specified on the frame. + ErrFrameSizeMismatch = errors.New("frame size does not match size on stream") + // ErrCRCMismatch is returned if CRC mismatches. ErrCRCMismatch = errors.New("CRC check failed") @@ -99,17 +110,6 @@ func printf(format string, a ...interface{}) { } } -// matchLenFast does matching, but will not match the last up to 7 bytes. -func matchLenFast(a, b []byte) int { - endI := len(a) & (math.MaxInt32 - 7) - for i := 0; i < endI; i += 8 { - if diff := load64(a, i) ^ load64(b, i); diff != 0 { - return i + bits.TrailingZeros64(diff)>>3 - } - } - return endI -} - // matchLen returns the maximum length. // a must be the shortest of the two. // The function also returns whether all bytes matched. diff --git a/vendor/modules.txt b/vendor/modules.txt index 857d89bf4a76..c0d166425461 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -713,13 +713,14 @@ github.com/json-iterator/go # github.com/julienschmidt/httprouter v1.3.0 ## explicit; go 1.7 github.com/julienschmidt/httprouter -# github.com/klauspost/compress v1.14.1 -## explicit; go 1.15 +# github.com/klauspost/compress v1.15.11 +## explicit; go 1.17 github.com/klauspost/compress github.com/klauspost/compress/flate github.com/klauspost/compress/fse github.com/klauspost/compress/gzip github.com/klauspost/compress/huff0 +github.com/klauspost/compress/internal/cpuinfo github.com/klauspost/compress/internal/snapref github.com/klauspost/compress/s2 github.com/klauspost/compress/zstd
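
Editorial note (not part of the patch): the zip.go hunk above changes ZipDecompressor to accept decoder options, forcing WithDecoderConcurrency(1), defaulting to a 128MB decompression window, and pooling decoders so they are reused after EOF or Close. Below is a minimal usage sketch under stated assumptions: ZipDecompressor, WithDecoderMaxWindow and ZipMethodWinZip come from the vendored package as shown in the diff, while the archive name and the 64MB window override are made up for illustration.

package main

import (
	"archive/zip"
	"io"
	"log"
	"os"

	"github.com/klauspost/compress/zstd"
)

func main() {
	// Open an existing zip archive; "archive.zip" is a placeholder name.
	zr, err := zip.OpenReader("archive.zip")
	if err != nil {
		log.Fatal(err)
	}
	defer zr.Close()

	// Register a zstd decompressor for entries stored with the WinZip method ID.
	// ZipDecompressor now accepts decoder options; concurrency is always forced
	// to 1 by the package, and the 128MB default window can be overridden.
	zr.RegisterDecompressor(zstd.ZipMethodWinZip,
		zstd.ZipDecompressor(zstd.WithDecoderMaxWindow(64<<20)))

	for _, f := range zr.File {
		rc, err := f.Open()
		if err != nil {
			log.Fatal(err)
		}
		// Stream the decompressed entry; per the pooledZipReader change above,
		// the underlying decoder is returned to the pool on EOF or Close.
		if _, err := io.Copy(os.Stdout, rc); err != nil {
			log.Fatal(err)
		}
		rc.Close()
	}
}

Passing no options uses the shared low-memory pool from the diff; passing any option creates a dedicated pool with the defaults re-applied, which is why the override above only affects readers produced by that particular ZipDecompressor value.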