From cf04d61198af7cf50198a3c5a8ed6d1f35fa3523 Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Wed, 15 Jun 2022 15:56:33 +0200 Subject: [PATCH 1/7] s2: Add repeat checks to better mode Improves compression ~0.1% at ~10% speed cost. Seems too little to keep. More benchmarking needed. --- s2/_generate/gen.go | 39 +- s2/encode_better.go | 29 +- s2/encodeblock_amd64.go | 1 - s2/encodeblock_amd64.s | 1663 ++++++++++++++++++++++++++++++++++++--- 4 files changed, 1594 insertions(+), 138 deletions(-) diff --git a/s2/_generate/gen.go b/s2/_generate/gen.go index ffb0ce4bc5..c1e374a185 100644 --- a/s2/_generate/gen.go +++ b/s2/_generate/gen.go @@ -998,10 +998,22 @@ func (o options) genEncodeBetterBlockAsm(name string, lTableBits, skipLog, lHash MOVL(s, sTab.Idx(hash1, 4)) } + if !o.snappy { + // If we have at least 8 bytes match, choose that first. + CMPQ(Mem{Base: src, Index: candidate, Scale: 1}, cv.As64()) + JEQ(LabelRef("candidate_match_" + name)) + } + // En/disable repeat matching. - if false { + if !o.snappy { + { + CMPL(repeatL, U8(0)) + JEQ(LabelRef("no_repeat_found_" + name)) + } // Check repeat at offset checkRep const checkRep = 1 + const wantRepeatBytes = 6 + const repeatMask = ((1 << (wantRepeatBytes * 8)) - 1) << (8 * checkRep) { // rep = s - repeat rep := GP32() @@ -1010,10 +1022,13 @@ func (o options) genEncodeBetterBlockAsm(name string, lTableBits, skipLog, lHash // if uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) { left, right := GP64(), GP64() - MOVL(Mem{Base: src, Index: rep, Disp: checkRep, Scale: 1}, right.As32()) + MOVQ(Mem{Base: src, Index: rep, Disp: 0, Scale: 1}, right.As64()) MOVQ(cv, left) - SHRQ(U8(checkRep*8), left) - CMPL(left.As32(), right.As32()) + tmp := GP64() + MOVQ(U64(repeatMask), tmp) + ANDQ(tmp, left) + ANDQ(tmp, right) + CMPQ(left.As64(), right.As64()) // BAIL, no repeat. JNE(LabelRef("no_repeat_found_" + name)) } @@ -1057,7 +1072,7 @@ func (o options) genEncodeBetterBlockAsm(name string, lTableBits, skipLog, lHash // Extend forward { // s += 4 + checkRep - ADDL(U8(4+checkRep), s) + ADDL(U8(wantRepeatBytes+checkRep), s) if true { // candidate := s - repeat + 4 + checkRep @@ -1097,18 +1112,8 @@ func (o options) genEncodeBetterBlockAsm(name string, lTableBits, skipLog, lHash offsetVal := GP32() MOVL(repeatL, offsetVal) - if !o.snappy { - // if nextEmit == 0 {do copy instead...} - TESTL(nextEmit, nextEmit) - JZ(LabelRef("repeat_as_copy_" + name)) - - // Emit as repeat... - o.emitRepeat("match_repeat_"+name, length, offsetVal, nil, dst, LabelRef("repeat_end_emit_"+name), false) - - // Emit as copy instead... - Label("repeat_as_copy_" + name) - } - o.emitCopy("repeat_as_copy_"+name, length, offsetVal, nil, dst, LabelRef("repeat_end_emit_"+name)) + // Emit as repeat... + o.emitRepeat("match_repeat_"+name, length, offsetVal, nil, dst, LabelRef("repeat_end_emit_"+name), false) Label("repeat_end_emit_" + name) // Store new dst and nextEmit diff --git a/s2/encode_better.go b/s2/encode_better.go index 943215b8ae..48e5d1d3c9 100644 --- a/s2/encode_better.go +++ b/s2/encode_better.go @@ -97,9 +97,20 @@ func encodeBlockBetterGo(dst, src []byte) (d int) { lTable[hashL] = uint32(s) sTable[hashS] = uint32(s) + valLong := load64(src, candidateL) + // If we have at least 8 bytes match, choose that first. + if cv == valLong { + break + } + // Check repeat at offset checkRep. const checkRep = 1 - if false && uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) { + // Minimum length of a repeat. Tested with various values. + // While 4-5 offers improvements in some, 6 reduces + // regressions significantly. + const wantRepeatBytes = 6 + const repeatMask = ((1 << (wantRepeatBytes * 8)) - 1) << (8 * checkRep) + if repeat > 0 && cv&repeatMask == load64(src, s-repeat)&repeatMask { base := s + checkRep // Extend back for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; { @@ -109,8 +120,8 @@ func encodeBlockBetterGo(dst, src []byte) (d int) { d += emitLiteral(dst[d:], src[nextEmit:base]) // Extend forward - candidate := s - repeat + 4 + checkRep - s += 4 + checkRep + candidate := s - repeat + wantRepeatBytes + checkRep + s += wantRepeatBytes + checkRep for s < len(src) { if len(src)-s < 8 { if src[s] == src[candidate] { @@ -127,13 +138,8 @@ func encodeBlockBetterGo(dst, src []byte) (d int) { s += 8 candidate += 8 } - if nextEmit > 0 { - // same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset. - d += emitRepeat(dst[d:], repeat, s-base) - } else { - // First match, cannot be repeat. - d += emitCopy(dst[d:], repeat, s-base) - } + // same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset. + d += emitRepeat(dst[d:], repeat, s-base) nextEmit = s if s >= sLimit { goto emitRemainder @@ -143,7 +149,8 @@ func encodeBlockBetterGo(dst, src []byte) (d int) { continue } - if uint32(cv) == load32(src, candidateL) { + // If long matches at least 4 bytes, use that. + if uint32(cv) == uint32(valLong) { break } diff --git a/s2/encodeblock_amd64.go b/s2/encodeblock_amd64.go index 88f27c0990..d10f60ae2c 100644 --- a/s2/encodeblock_amd64.go +++ b/s2/encodeblock_amd64.go @@ -1,7 +1,6 @@ // Code generated by command: go run gen.go -out ../encodeblock_amd64.s -stubs ../encodeblock_amd64.go -pkg=s2. DO NOT EDIT. //go:build !appengine && !noasm && gc && !noasm -// +build !appengine,!noasm,gc,!noasm package s2 diff --git a/s2/encodeblock_amd64.s b/s2/encodeblock_amd64.s index 36915d9495..2ae26c1a03 100644 --- a/s2/encodeblock_amd64.s +++ b/s2/encodeblock_amd64.s @@ -1,7 +1,6 @@ // Code generated by command: go run gen.go -out ../encodeblock_amd64.s -stubs ../encodeblock_amd64.go -pkg=s2. DO NOT EDIT. //go:build !appengine && !noasm && gc && !noasm -// +build !appengine,!noasm,gc,!noasm #include "textflag.h" @@ -5805,12 +5804,331 @@ check_maxskip_cont_encodeBetterBlockAsm: MOVL 262168(SP)(R11*4), R8 MOVL CX, 24(SP)(R10*4) MOVL CX, 262168(SP)(R11*4) - CMPL (DX)(SI*1), DI + CMPQ (DX)(SI*1), DI JEQ candidate_match_encodeBetterBlockAsm - CMPL (DX)(R8*1), DI - JEQ candidateS_match_encodeBetterBlockAsm - MOVL 20(SP), CX - JMP search_loop_encodeBetterBlockAsm + CMPL 16(SP), $0x00 + JEQ no_repeat_found_encodeBetterBlockAsm + MOVL CX, R10 + SUBL 16(SP), R10 + MOVQ (DX)(R10*1), R11 + MOVQ DI, R10 + MOVQ $0x00ffffffffffff00, R12 + ANDQ R12, R10 + ANDQ R12, R11 + CMPQ R10, R11 + JNE no_repeat_found_encodeBetterBlockAsm + LEAL 1(CX), DI + MOVL 12(SP), SI + MOVL DI, R8 + SUBL 16(SP), R8 + JZ repeat_extend_back_end_encodeBetterBlockAsm + +repeat_extend_back_loop_encodeBetterBlockAsm: + CMPL DI, SI + JLE repeat_extend_back_end_encodeBetterBlockAsm + MOVB -1(DX)(R8*1), BL + MOVB -1(DX)(DI*1), R9 + CMPB BL, R9 + JNE repeat_extend_back_end_encodeBetterBlockAsm + LEAL -1(DI), DI + DECL R8 + JNZ repeat_extend_back_loop_encodeBetterBlockAsm + +repeat_extend_back_end_encodeBetterBlockAsm: + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_repeat_emit_encodeBetterBlockAsm + MOVL DI, R8 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R9 + SUBL SI, R8 + LEAL -1(R8), SI + CMPL SI, $0x3c + JLT one_byte_repeat_emit_encodeBetterBlockAsm + CMPL SI, $0x00000100 + JLT two_bytes_repeat_emit_encodeBetterBlockAsm + CMPL SI, $0x00010000 + JLT three_bytes_repeat_emit_encodeBetterBlockAsm + CMPL SI, $0x01000000 + JLT four_bytes_repeat_emit_encodeBetterBlockAsm + MOVB $0xfc, (AX) + MOVL SI, 1(AX) + ADDQ $0x05, AX + JMP memmove_long_repeat_emit_encodeBetterBlockAsm + +four_bytes_repeat_emit_encodeBetterBlockAsm: + MOVL SI, R10 + SHRL $0x10, R10 + MOVB $0xf8, (AX) + MOVW SI, 1(AX) + MOVB R10, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_repeat_emit_encodeBetterBlockAsm + +three_bytes_repeat_emit_encodeBetterBlockAsm: + MOVB $0xf4, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_repeat_emit_encodeBetterBlockAsm + +two_bytes_repeat_emit_encodeBetterBlockAsm: + MOVB $0xf0, (AX) + MOVB SI, 1(AX) + ADDQ $0x02, AX + CMPL SI, $0x40 + JL memmove_repeat_emit_encodeBetterBlockAsm + JMP memmove_long_repeat_emit_encodeBetterBlockAsm + +one_byte_repeat_emit_encodeBetterBlockAsm: + SHLB $0x02, SI + MOVB SI, (AX) + ADDQ $0x01, AX + +memmove_repeat_emit_encodeBetterBlockAsm: + LEAQ (AX)(R8*1), SI + + // genMemMoveShort + CMPQ R8, $0x04 + JLE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_4 + CMPQ R8, $0x08 + JB emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_4through7 + CMPQ R8, $0x10 + JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_33through64 + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_4: + MOVL (R9), R10 + MOVL R10, (AX) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_4through7: + MOVL (R9), R10 + MOVL -4(R9)(R8*1), R9 + MOVL R10, (AX) + MOVL R9, -4(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_8through16: + MOVQ (R9), R10 + MOVQ -8(R9)(R8*1), R9 + MOVQ R10, (AX) + MOVQ R9, -8(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_17through32: + MOVOU (R9), X0 + MOVOU -16(R9)(R8*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_33through64: + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + +memmove_end_copy_repeat_emit_encodeBetterBlockAsm: + MOVQ SI, AX + JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm + +memmove_long_repeat_emit_encodeBetterBlockAsm: + LEAQ (AX)(R8*1), SI + + // genMemMoveLong + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVQ R8, R11 + SHRQ $0x05, R11 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32 + LEAQ -32(R9)(R12*1), R10 + LEAQ -32(AX)(R12*1), R13 + +emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsmlarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsmlarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32: + MOVOU -32(R9)(R12*1), X4 + MOVOU -16(R9)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ R8, R12 + JAE emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + MOVQ SI, AX + +emit_literal_done_repeat_emit_encodeBetterBlockAsm: + ADDL $0x07, CX + MOVL CX, SI + SUBL 16(SP), SI + MOVQ src_len+32(FP), R8 + SUBL CX, R8 + LEAQ (DX)(CX*1), R9 + LEAQ (DX)(SI*1), SI + + // matchLen + XORL R11, R11 + CMPL R8, $0x08 + JL matchlen_match4_repeat_extend_encodeBetterBlockAsm + +matchlen_loopback_repeat_extend_encodeBetterBlockAsm: + MOVQ (R9)(R11*1), R10 + XORQ (SI)(R11*1), R10 + TESTQ R10, R10 + JZ matchlen_loop_repeat_extend_encodeBetterBlockAsm + +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP repeat_extend_forward_end_encodeBetterBlockAsm + +matchlen_loop_repeat_extend_encodeBetterBlockAsm: + LEAL -8(R8), R8 + LEAL 8(R11), R11 + CMPL R8, $0x08 + JGE matchlen_loopback_repeat_extend_encodeBetterBlockAsm + JZ repeat_extend_forward_end_encodeBetterBlockAsm + +matchlen_match4_repeat_extend_encodeBetterBlockAsm: + CMPL R8, $0x04 + JL matchlen_match2_repeat_extend_encodeBetterBlockAsm + MOVL (R9)(R11*1), R10 + CMPL (SI)(R11*1), R10 + JNE matchlen_match2_repeat_extend_encodeBetterBlockAsm + SUBL $0x04, R8 + LEAL 4(R11), R11 + +matchlen_match2_repeat_extend_encodeBetterBlockAsm: + CMPL R8, $0x02 + JL matchlen_match1_repeat_extend_encodeBetterBlockAsm + MOVW (R9)(R11*1), R10 + CMPW (SI)(R11*1), R10 + JNE matchlen_match1_repeat_extend_encodeBetterBlockAsm + SUBL $0x02, R8 + LEAL 2(R11), R11 + +matchlen_match1_repeat_extend_encodeBetterBlockAsm: + CMPL R8, $0x01 + JL repeat_extend_forward_end_encodeBetterBlockAsm + MOVB (R9)(R11*1), R10 + CMPB (SI)(R11*1), R10 + JNE repeat_extend_forward_end_encodeBetterBlockAsm + LEAL 1(R11), R11 + +repeat_extend_forward_end_encodeBetterBlockAsm: + ADDL R11, CX + MOVL CX, SI + SUBL DI, SI + MOVL 16(SP), DI + + // emitRepeat +emit_repeat_again_match_repeat_encodeBetterBlockAsm: + MOVL SI, R8 + LEAL -4(SI), SI + CMPL R8, $0x08 + JLE repeat_two_match_repeat_encodeBetterBlockAsm + CMPL R8, $0x0c + JGE cant_repeat_two_offset_match_repeat_encodeBetterBlockAsm + CMPL DI, $0x00000800 + JLT repeat_two_offset_match_repeat_encodeBetterBlockAsm + +cant_repeat_two_offset_match_repeat_encodeBetterBlockAsm: + CMPL SI, $0x00000104 + JLT repeat_three_match_repeat_encodeBetterBlockAsm + CMPL SI, $0x00010100 + JLT repeat_four_match_repeat_encodeBetterBlockAsm + CMPL SI, $0x0100ffff + JLT repeat_five_match_repeat_encodeBetterBlockAsm + LEAL -16842747(SI), SI + MOVW $0x001d, (AX) + MOVW $0xfffb, 2(AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + JMP emit_repeat_again_match_repeat_encodeBetterBlockAsm + +repeat_five_match_repeat_encodeBetterBlockAsm: + LEAL -65536(SI), SI + MOVL SI, DI + MOVW $0x001d, (AX) + MOVW SI, 2(AX) + SARL $0x10, DI + MOVB DI, 4(AX) + ADDQ $0x05, AX + JMP repeat_end_emit_encodeBetterBlockAsm + +repeat_four_match_repeat_encodeBetterBlockAsm: + LEAL -256(SI), SI + MOVW $0x0019, (AX) + MOVW SI, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBetterBlockAsm + +repeat_three_match_repeat_encodeBetterBlockAsm: + LEAL -4(SI), SI + MOVW $0x0015, (AX) + MOVB SI, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBetterBlockAsm + +repeat_two_match_repeat_encodeBetterBlockAsm: + SHLL $0x02, SI + ORL $0x01, SI + MOVW SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBetterBlockAsm + +repeat_two_offset_match_repeat_encodeBetterBlockAsm: + XORQ R8, R8 + LEAL 1(R8)(SI*4), SI + MOVB DI, 1(AX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) + ADDQ $0x02, AX + +repeat_end_emit_encodeBetterBlockAsm: + MOVL CX, 12(SP) + JMP search_loop_encodeBetterBlockAsm + +no_repeat_found_encodeBetterBlockAsm: + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeBetterBlockAsm + CMPL (DX)(R8*1), DI + JEQ candidateS_match_encodeBetterBlockAsm + MOVL 20(SP), CX + JMP search_loop_encodeBetterBlockAsm candidateS_match_encodeBetterBlockAsm: SHRQ $0x08, DI @@ -6877,29 +7195,329 @@ check_maxskip_cont_encodeBetterBlockAsm4MB: MOVL 262168(SP)(R11*4), R8 MOVL CX, 24(SP)(R10*4) MOVL CX, 262168(SP)(R11*4) - CMPL (DX)(SI*1), DI + CMPQ (DX)(SI*1), DI JEQ candidate_match_encodeBetterBlockAsm4MB - CMPL (DX)(R8*1), DI - JEQ candidateS_match_encodeBetterBlockAsm4MB - MOVL 20(SP), CX - JMP search_loop_encodeBetterBlockAsm4MB - -candidateS_match_encodeBetterBlockAsm4MB: - SHRQ $0x08, DI + CMPL 16(SP), $0x00 + JEQ no_repeat_found_encodeBetterBlockAsm4MB + MOVL CX, R10 + SUBL 16(SP), R10 + MOVQ (DX)(R10*1), R11 MOVQ DI, R10 - SHLQ $0x08, R10 - IMULQ R9, R10 - SHRQ $0x30, R10 - MOVL 24(SP)(R10*4), SI - INCL CX - MOVL CX, 24(SP)(R10*4) - CMPL (DX)(SI*1), DI - JEQ candidate_match_encodeBetterBlockAsm4MB - DECL CX - MOVL R8, SI + MOVQ $0x00ffffffffffff00, R12 + ANDQ R12, R10 + ANDQ R12, R11 + CMPQ R10, R11 + JNE no_repeat_found_encodeBetterBlockAsm4MB + LEAL 1(CX), DI + MOVL 12(SP), SI + MOVL DI, R8 + SUBL 16(SP), R8 + JZ repeat_extend_back_end_encodeBetterBlockAsm4MB -candidate_match_encodeBetterBlockAsm4MB: - MOVL 12(SP), DI +repeat_extend_back_loop_encodeBetterBlockAsm4MB: + CMPL DI, SI + JLE repeat_extend_back_end_encodeBetterBlockAsm4MB + MOVB -1(DX)(R8*1), BL + MOVB -1(DX)(DI*1), R9 + CMPB BL, R9 + JNE repeat_extend_back_end_encodeBetterBlockAsm4MB + LEAL -1(DI), DI + DECL R8 + JNZ repeat_extend_back_loop_encodeBetterBlockAsm4MB + +repeat_extend_back_end_encodeBetterBlockAsm4MB: + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_repeat_emit_encodeBetterBlockAsm4MB + MOVL DI, R8 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R9 + SUBL SI, R8 + LEAL -1(R8), SI + CMPL SI, $0x3c + JLT one_byte_repeat_emit_encodeBetterBlockAsm4MB + CMPL SI, $0x00000100 + JLT two_bytes_repeat_emit_encodeBetterBlockAsm4MB + CMPL SI, $0x00010000 + JLT three_bytes_repeat_emit_encodeBetterBlockAsm4MB + MOVL SI, R10 + SHRL $0x10, R10 + MOVB $0xf8, (AX) + MOVW SI, 1(AX) + MOVB R10, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_repeat_emit_encodeBetterBlockAsm4MB + +three_bytes_repeat_emit_encodeBetterBlockAsm4MB: + MOVB $0xf4, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_repeat_emit_encodeBetterBlockAsm4MB + +two_bytes_repeat_emit_encodeBetterBlockAsm4MB: + MOVB $0xf0, (AX) + MOVB SI, 1(AX) + ADDQ $0x02, AX + CMPL SI, $0x40 + JL memmove_repeat_emit_encodeBetterBlockAsm4MB + JMP memmove_long_repeat_emit_encodeBetterBlockAsm4MB + +one_byte_repeat_emit_encodeBetterBlockAsm4MB: + SHLB $0x02, SI + MOVB SI, (AX) + ADDQ $0x01, AX + +memmove_repeat_emit_encodeBetterBlockAsm4MB: + LEAQ (AX)(R8*1), SI + + // genMemMoveShort + CMPQ R8, $0x04 + JLE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm4MB_memmove_move_4 + CMPQ R8, $0x08 + JB emit_lit_memmove_repeat_emit_encodeBetterBlockAsm4MB_memmove_move_4through7 + CMPQ R8, $0x10 + JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm4MB_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm4MB_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeBetterBlockAsm4MB_memmove_move_33through64 + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm4MB_memmove_move_4: + MOVL (R9), R10 + MOVL R10, (AX) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm4MB + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm4MB_memmove_move_4through7: + MOVL (R9), R10 + MOVL -4(R9)(R8*1), R9 + MOVL R10, (AX) + MOVL R9, -4(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm4MB + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm4MB_memmove_move_8through16: + MOVQ (R9), R10 + MOVQ -8(R9)(R8*1), R9 + MOVQ R10, (AX) + MOVQ R9, -8(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm4MB + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm4MB_memmove_move_17through32: + MOVOU (R9), X0 + MOVOU -16(R9)(R8*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm4MB + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm4MB_memmove_move_33through64: + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + +memmove_end_copy_repeat_emit_encodeBetterBlockAsm4MB: + MOVQ SI, AX + JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm4MB + +memmove_long_repeat_emit_encodeBetterBlockAsm4MB: + LEAQ (AX)(R8*1), SI + + // genMemMoveLong + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVQ R8, R11 + SHRQ $0x05, R11 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 + LEAQ -32(R9)(R12*1), R10 + LEAQ -32(AX)(R12*1), R13 + +emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm4MBlarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm4MBlarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32: + MOVOU -32(R9)(R12*1), X4 + MOVOU -16(R9)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ R8, R12 + JAE emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + MOVQ SI, AX + +emit_literal_done_repeat_emit_encodeBetterBlockAsm4MB: + ADDL $0x07, CX + MOVL CX, SI + SUBL 16(SP), SI + MOVQ src_len+32(FP), R8 + SUBL CX, R8 + LEAQ (DX)(CX*1), R9 + LEAQ (DX)(SI*1), SI + + // matchLen + XORL R11, R11 + CMPL R8, $0x08 + JL matchlen_match4_repeat_extend_encodeBetterBlockAsm4MB + +matchlen_loopback_repeat_extend_encodeBetterBlockAsm4MB: + MOVQ (R9)(R11*1), R10 + XORQ (SI)(R11*1), R10 + TESTQ R10, R10 + JZ matchlen_loop_repeat_extend_encodeBetterBlockAsm4MB + +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP repeat_extend_forward_end_encodeBetterBlockAsm4MB + +matchlen_loop_repeat_extend_encodeBetterBlockAsm4MB: + LEAL -8(R8), R8 + LEAL 8(R11), R11 + CMPL R8, $0x08 + JGE matchlen_loopback_repeat_extend_encodeBetterBlockAsm4MB + JZ repeat_extend_forward_end_encodeBetterBlockAsm4MB + +matchlen_match4_repeat_extend_encodeBetterBlockAsm4MB: + CMPL R8, $0x04 + JL matchlen_match2_repeat_extend_encodeBetterBlockAsm4MB + MOVL (R9)(R11*1), R10 + CMPL (SI)(R11*1), R10 + JNE matchlen_match2_repeat_extend_encodeBetterBlockAsm4MB + SUBL $0x04, R8 + LEAL 4(R11), R11 + +matchlen_match2_repeat_extend_encodeBetterBlockAsm4MB: + CMPL R8, $0x02 + JL matchlen_match1_repeat_extend_encodeBetterBlockAsm4MB + MOVW (R9)(R11*1), R10 + CMPW (SI)(R11*1), R10 + JNE matchlen_match1_repeat_extend_encodeBetterBlockAsm4MB + SUBL $0x02, R8 + LEAL 2(R11), R11 + +matchlen_match1_repeat_extend_encodeBetterBlockAsm4MB: + CMPL R8, $0x01 + JL repeat_extend_forward_end_encodeBetterBlockAsm4MB + MOVB (R9)(R11*1), R10 + CMPB (SI)(R11*1), R10 + JNE repeat_extend_forward_end_encodeBetterBlockAsm4MB + LEAL 1(R11), R11 + +repeat_extend_forward_end_encodeBetterBlockAsm4MB: + ADDL R11, CX + MOVL CX, SI + SUBL DI, SI + MOVL 16(SP), DI + + // emitRepeat + MOVL SI, R8 + LEAL -4(SI), SI + CMPL R8, $0x08 + JLE repeat_two_match_repeat_encodeBetterBlockAsm4MB + CMPL R8, $0x0c + JGE cant_repeat_two_offset_match_repeat_encodeBetterBlockAsm4MB + CMPL DI, $0x00000800 + JLT repeat_two_offset_match_repeat_encodeBetterBlockAsm4MB + +cant_repeat_two_offset_match_repeat_encodeBetterBlockAsm4MB: + CMPL SI, $0x00000104 + JLT repeat_three_match_repeat_encodeBetterBlockAsm4MB + CMPL SI, $0x00010100 + JLT repeat_four_match_repeat_encodeBetterBlockAsm4MB + LEAL -65536(SI), SI + MOVL SI, DI + MOVW $0x001d, (AX) + MOVW SI, 2(AX) + SARL $0x10, DI + MOVB DI, 4(AX) + ADDQ $0x05, AX + JMP repeat_end_emit_encodeBetterBlockAsm4MB + +repeat_four_match_repeat_encodeBetterBlockAsm4MB: + LEAL -256(SI), SI + MOVW $0x0019, (AX) + MOVW SI, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBetterBlockAsm4MB + +repeat_three_match_repeat_encodeBetterBlockAsm4MB: + LEAL -4(SI), SI + MOVW $0x0015, (AX) + MOVB SI, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBetterBlockAsm4MB + +repeat_two_match_repeat_encodeBetterBlockAsm4MB: + SHLL $0x02, SI + ORL $0x01, SI + MOVW SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBetterBlockAsm4MB + +repeat_two_offset_match_repeat_encodeBetterBlockAsm4MB: + XORQ R8, R8 + LEAL 1(R8)(SI*4), SI + MOVB DI, 1(AX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) + ADDQ $0x02, AX + +repeat_end_emit_encodeBetterBlockAsm4MB: + MOVL CX, 12(SP) + JMP search_loop_encodeBetterBlockAsm4MB + +no_repeat_found_encodeBetterBlockAsm4MB: + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeBetterBlockAsm4MB + CMPL (DX)(R8*1), DI + JEQ candidateS_match_encodeBetterBlockAsm4MB + MOVL 20(SP), CX + JMP search_loop_encodeBetterBlockAsm4MB + +candidateS_match_encodeBetterBlockAsm4MB: + SHRQ $0x08, DI + MOVQ DI, R10 + SHLQ $0x08, R10 + IMULQ R9, R10 + SHRQ $0x30, R10 + MOVL 24(SP)(R10*4), SI + INCL CX + MOVL CX, 24(SP)(R10*4) + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeBetterBlockAsm4MB + DECL CX + MOVL R8, SI + +candidate_match_encodeBetterBlockAsm4MB: + MOVL 12(SP), DI TESTL SI, SI JZ match_extend_back_end_encodeBetterBlockAsm4MB @@ -7871,34 +8489,311 @@ search_loop_encodeBetterBlockAsm12B: MOVL 65560(SP)(R11*4), R8 MOVL CX, 24(SP)(R10*4) MOVL CX, 65560(SP)(R11*4) - CMPL (DX)(SI*1), DI + CMPQ (DX)(SI*1), DI JEQ candidate_match_encodeBetterBlockAsm12B - CMPL (DX)(R8*1), DI - JEQ candidateS_match_encodeBetterBlockAsm12B - MOVL 20(SP), CX - JMP search_loop_encodeBetterBlockAsm12B - -candidateS_match_encodeBetterBlockAsm12B: - SHRQ $0x08, DI + CMPL 16(SP), $0x00 + JEQ no_repeat_found_encodeBetterBlockAsm12B + MOVL CX, R10 + SUBL 16(SP), R10 + MOVQ (DX)(R10*1), R11 MOVQ DI, R10 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x32, R10 - MOVL 24(SP)(R10*4), SI - INCL CX - MOVL CX, 24(SP)(R10*4) - CMPL (DX)(SI*1), DI - JEQ candidate_match_encodeBetterBlockAsm12B - DECL CX - MOVL R8, SI - -candidate_match_encodeBetterBlockAsm12B: - MOVL 12(SP), DI - TESTL SI, SI - JZ match_extend_back_end_encodeBetterBlockAsm12B + MOVQ $0x00ffffffffffff00, R12 + ANDQ R12, R10 + ANDQ R12, R11 + CMPQ R10, R11 + JNE no_repeat_found_encodeBetterBlockAsm12B + LEAL 1(CX), DI + MOVL 12(SP), SI + MOVL DI, R8 + SUBL 16(SP), R8 + JZ repeat_extend_back_end_encodeBetterBlockAsm12B -match_extend_back_loop_encodeBetterBlockAsm12B: - CMPL CX, DI +repeat_extend_back_loop_encodeBetterBlockAsm12B: + CMPL DI, SI + JLE repeat_extend_back_end_encodeBetterBlockAsm12B + MOVB -1(DX)(R8*1), BL + MOVB -1(DX)(DI*1), R9 + CMPB BL, R9 + JNE repeat_extend_back_end_encodeBetterBlockAsm12B + LEAL -1(DI), DI + DECL R8 + JNZ repeat_extend_back_loop_encodeBetterBlockAsm12B + +repeat_extend_back_end_encodeBetterBlockAsm12B: + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_repeat_emit_encodeBetterBlockAsm12B + MOVL DI, R8 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R9 + SUBL SI, R8 + LEAL -1(R8), SI + CMPL SI, $0x3c + JLT one_byte_repeat_emit_encodeBetterBlockAsm12B + CMPL SI, $0x00000100 + JLT two_bytes_repeat_emit_encodeBetterBlockAsm12B + MOVB $0xf4, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_repeat_emit_encodeBetterBlockAsm12B + +two_bytes_repeat_emit_encodeBetterBlockAsm12B: + MOVB $0xf0, (AX) + MOVB SI, 1(AX) + ADDQ $0x02, AX + CMPL SI, $0x40 + JL memmove_repeat_emit_encodeBetterBlockAsm12B + JMP memmove_long_repeat_emit_encodeBetterBlockAsm12B + +one_byte_repeat_emit_encodeBetterBlockAsm12B: + SHLB $0x02, SI + MOVB SI, (AX) + ADDQ $0x01, AX + +memmove_repeat_emit_encodeBetterBlockAsm12B: + LEAQ (AX)(R8*1), SI + + // genMemMoveShort + CMPQ R8, $0x04 + JLE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm12B_memmove_move_4 + CMPQ R8, $0x08 + JB emit_lit_memmove_repeat_emit_encodeBetterBlockAsm12B_memmove_move_4through7 + CMPQ R8, $0x10 + JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm12B_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm12B_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeBetterBlockAsm12B_memmove_move_33through64 + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm12B_memmove_move_4: + MOVL (R9), R10 + MOVL R10, (AX) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm12B + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm12B_memmove_move_4through7: + MOVL (R9), R10 + MOVL -4(R9)(R8*1), R9 + MOVL R10, (AX) + MOVL R9, -4(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm12B + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm12B_memmove_move_8through16: + MOVQ (R9), R10 + MOVQ -8(R9)(R8*1), R9 + MOVQ R10, (AX) + MOVQ R9, -8(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm12B + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm12B_memmove_move_17through32: + MOVOU (R9), X0 + MOVOU -16(R9)(R8*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm12B + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm12B_memmove_move_33through64: + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + +memmove_end_copy_repeat_emit_encodeBetterBlockAsm12B: + MOVQ SI, AX + JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm12B + +memmove_long_repeat_emit_encodeBetterBlockAsm12B: + LEAQ (AX)(R8*1), SI + + // genMemMoveLong + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVQ R8, R11 + SHRQ $0x05, R11 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 + LEAQ -32(R9)(R12*1), R10 + LEAQ -32(AX)(R12*1), R13 + +emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm12Blarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm12Blarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32: + MOVOU -32(R9)(R12*1), X4 + MOVOU -16(R9)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ R8, R12 + JAE emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + MOVQ SI, AX + +emit_literal_done_repeat_emit_encodeBetterBlockAsm12B: + ADDL $0x07, CX + MOVL CX, SI + SUBL 16(SP), SI + MOVQ src_len+32(FP), R8 + SUBL CX, R8 + LEAQ (DX)(CX*1), R9 + LEAQ (DX)(SI*1), SI + + // matchLen + XORL R11, R11 + CMPL R8, $0x08 + JL matchlen_match4_repeat_extend_encodeBetterBlockAsm12B + +matchlen_loopback_repeat_extend_encodeBetterBlockAsm12B: + MOVQ (R9)(R11*1), R10 + XORQ (SI)(R11*1), R10 + TESTQ R10, R10 + JZ matchlen_loop_repeat_extend_encodeBetterBlockAsm12B + +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP repeat_extend_forward_end_encodeBetterBlockAsm12B + +matchlen_loop_repeat_extend_encodeBetterBlockAsm12B: + LEAL -8(R8), R8 + LEAL 8(R11), R11 + CMPL R8, $0x08 + JGE matchlen_loopback_repeat_extend_encodeBetterBlockAsm12B + JZ repeat_extend_forward_end_encodeBetterBlockAsm12B + +matchlen_match4_repeat_extend_encodeBetterBlockAsm12B: + CMPL R8, $0x04 + JL matchlen_match2_repeat_extend_encodeBetterBlockAsm12B + MOVL (R9)(R11*1), R10 + CMPL (SI)(R11*1), R10 + JNE matchlen_match2_repeat_extend_encodeBetterBlockAsm12B + SUBL $0x04, R8 + LEAL 4(R11), R11 + +matchlen_match2_repeat_extend_encodeBetterBlockAsm12B: + CMPL R8, $0x02 + JL matchlen_match1_repeat_extend_encodeBetterBlockAsm12B + MOVW (R9)(R11*1), R10 + CMPW (SI)(R11*1), R10 + JNE matchlen_match1_repeat_extend_encodeBetterBlockAsm12B + SUBL $0x02, R8 + LEAL 2(R11), R11 + +matchlen_match1_repeat_extend_encodeBetterBlockAsm12B: + CMPL R8, $0x01 + JL repeat_extend_forward_end_encodeBetterBlockAsm12B + MOVB (R9)(R11*1), R10 + CMPB (SI)(R11*1), R10 + JNE repeat_extend_forward_end_encodeBetterBlockAsm12B + LEAL 1(R11), R11 + +repeat_extend_forward_end_encodeBetterBlockAsm12B: + ADDL R11, CX + MOVL CX, SI + SUBL DI, SI + MOVL 16(SP), DI + + // emitRepeat + MOVL SI, R8 + LEAL -4(SI), SI + CMPL R8, $0x08 + JLE repeat_two_match_repeat_encodeBetterBlockAsm12B + CMPL R8, $0x0c + JGE cant_repeat_two_offset_match_repeat_encodeBetterBlockAsm12B + CMPL DI, $0x00000800 + JLT repeat_two_offset_match_repeat_encodeBetterBlockAsm12B + +cant_repeat_two_offset_match_repeat_encodeBetterBlockAsm12B: + CMPL SI, $0x00000104 + JLT repeat_three_match_repeat_encodeBetterBlockAsm12B + LEAL -256(SI), SI + MOVW $0x0019, (AX) + MOVW SI, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBetterBlockAsm12B + +repeat_three_match_repeat_encodeBetterBlockAsm12B: + LEAL -4(SI), SI + MOVW $0x0015, (AX) + MOVB SI, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBetterBlockAsm12B + +repeat_two_match_repeat_encodeBetterBlockAsm12B: + SHLL $0x02, SI + ORL $0x01, SI + MOVW SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBetterBlockAsm12B + +repeat_two_offset_match_repeat_encodeBetterBlockAsm12B: + XORQ R8, R8 + LEAL 1(R8)(SI*4), SI + MOVB DI, 1(AX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) + ADDQ $0x02, AX + +repeat_end_emit_encodeBetterBlockAsm12B: + MOVL CX, 12(SP) + JMP search_loop_encodeBetterBlockAsm12B + +no_repeat_found_encodeBetterBlockAsm12B: + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeBetterBlockAsm12B + CMPL (DX)(R8*1), DI + JEQ candidateS_match_encodeBetterBlockAsm12B + MOVL 20(SP), CX + JMP search_loop_encodeBetterBlockAsm12B + +candidateS_match_encodeBetterBlockAsm12B: + SHRQ $0x08, DI + MOVQ DI, R10 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x32, R10 + MOVL 24(SP)(R10*4), SI + INCL CX + MOVL CX, 24(SP)(R10*4) + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeBetterBlockAsm12B + DECL CX + MOVL R8, SI + +candidate_match_encodeBetterBlockAsm12B: + MOVL 12(SP), DI + TESTL SI, SI + JZ match_extend_back_end_encodeBetterBlockAsm12B + +match_extend_back_loop_encodeBetterBlockAsm12B: + CMPL CX, DI JLE match_extend_back_end_encodeBetterBlockAsm12B MOVB -1(DX)(SI*1), BL MOVB -1(DX)(CX*1), R8 @@ -8659,60 +9554,337 @@ TEXT ·encodeBetterBlockAsm10B(SB), $20504-56 LEAQ 24(SP), DX PXOR X0, X0 -zero_loop_encodeBetterBlockAsm10B: - MOVOU X0, (DX) - MOVOU X0, 16(DX) - MOVOU X0, 32(DX) - MOVOU X0, 48(DX) - MOVOU X0, 64(DX) - MOVOU X0, 80(DX) - MOVOU X0, 96(DX) - MOVOU X0, 112(DX) - ADDQ $0x80, DX - DECQ CX - JNZ zero_loop_encodeBetterBlockAsm10B - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), CX - LEAQ -6(CX), DX - LEAQ -8(CX), SI - MOVL SI, 8(SP) - SHRQ $0x05, CX - SUBL CX, DX - LEAQ (AX)(DX*1), DX - MOVQ DX, (SP) - MOVL $0x00000001, CX - MOVL $0x00000000, 16(SP) - MOVQ src_base+24(FP), DX +zero_loop_encodeBetterBlockAsm10B: + MOVOU X0, (DX) + MOVOU X0, 16(DX) + MOVOU X0, 32(DX) + MOVOU X0, 48(DX) + MOVOU X0, 64(DX) + MOVOU X0, 80(DX) + MOVOU X0, 96(DX) + MOVOU X0, 112(DX) + ADDQ $0x80, DX + DECQ CX + JNZ zero_loop_encodeBetterBlockAsm10B + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), CX + LEAQ -6(CX), DX + LEAQ -8(CX), SI + MOVL SI, 8(SP) + SHRQ $0x05, CX + SUBL CX, DX + LEAQ (AX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, CX + MOVL $0x00000000, 16(SP) + MOVQ src_base+24(FP), DX + +search_loop_encodeBetterBlockAsm10B: + MOVL CX, SI + SUBL 12(SP), SI + SHRL $0x05, SI + LEAL 1(CX)(SI*1), SI + CMPL SI, 8(SP) + JGE emit_remainder_encodeBetterBlockAsm10B + MOVQ (DX)(CX*1), DI + MOVL SI, 20(SP) + MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ $0x9e3779b1, SI + MOVQ DI, R10 + MOVQ DI, R11 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x34, R10 + SHLQ $0x20, R11 + IMULQ SI, R11 + SHRQ $0x36, R11 + MOVL 24(SP)(R10*4), SI + MOVL 16408(SP)(R11*4), R8 + MOVL CX, 24(SP)(R10*4) + MOVL CX, 16408(SP)(R11*4) + CMPQ (DX)(SI*1), DI + JEQ candidate_match_encodeBetterBlockAsm10B + CMPL 16(SP), $0x00 + JEQ no_repeat_found_encodeBetterBlockAsm10B + MOVL CX, R10 + SUBL 16(SP), R10 + MOVQ (DX)(R10*1), R11 + MOVQ DI, R10 + MOVQ $0x00ffffffffffff00, R12 + ANDQ R12, R10 + ANDQ R12, R11 + CMPQ R10, R11 + JNE no_repeat_found_encodeBetterBlockAsm10B + LEAL 1(CX), DI + MOVL 12(SP), SI + MOVL DI, R8 + SUBL 16(SP), R8 + JZ repeat_extend_back_end_encodeBetterBlockAsm10B + +repeat_extend_back_loop_encodeBetterBlockAsm10B: + CMPL DI, SI + JLE repeat_extend_back_end_encodeBetterBlockAsm10B + MOVB -1(DX)(R8*1), BL + MOVB -1(DX)(DI*1), R9 + CMPB BL, R9 + JNE repeat_extend_back_end_encodeBetterBlockAsm10B + LEAL -1(DI), DI + DECL R8 + JNZ repeat_extend_back_loop_encodeBetterBlockAsm10B + +repeat_extend_back_end_encodeBetterBlockAsm10B: + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_repeat_emit_encodeBetterBlockAsm10B + MOVL DI, R8 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R9 + SUBL SI, R8 + LEAL -1(R8), SI + CMPL SI, $0x3c + JLT one_byte_repeat_emit_encodeBetterBlockAsm10B + CMPL SI, $0x00000100 + JLT two_bytes_repeat_emit_encodeBetterBlockAsm10B + MOVB $0xf4, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_repeat_emit_encodeBetterBlockAsm10B + +two_bytes_repeat_emit_encodeBetterBlockAsm10B: + MOVB $0xf0, (AX) + MOVB SI, 1(AX) + ADDQ $0x02, AX + CMPL SI, $0x40 + JL memmove_repeat_emit_encodeBetterBlockAsm10B + JMP memmove_long_repeat_emit_encodeBetterBlockAsm10B + +one_byte_repeat_emit_encodeBetterBlockAsm10B: + SHLB $0x02, SI + MOVB SI, (AX) + ADDQ $0x01, AX + +memmove_repeat_emit_encodeBetterBlockAsm10B: + LEAQ (AX)(R8*1), SI + + // genMemMoveShort + CMPQ R8, $0x04 + JLE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm10B_memmove_move_4 + CMPQ R8, $0x08 + JB emit_lit_memmove_repeat_emit_encodeBetterBlockAsm10B_memmove_move_4through7 + CMPQ R8, $0x10 + JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm10B_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm10B_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeBetterBlockAsm10B_memmove_move_33through64 + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm10B_memmove_move_4: + MOVL (R9), R10 + MOVL R10, (AX) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm10B + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm10B_memmove_move_4through7: + MOVL (R9), R10 + MOVL -4(R9)(R8*1), R9 + MOVL R10, (AX) + MOVL R9, -4(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm10B + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm10B_memmove_move_8through16: + MOVQ (R9), R10 + MOVQ -8(R9)(R8*1), R9 + MOVQ R10, (AX) + MOVQ R9, -8(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm10B + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm10B_memmove_move_17through32: + MOVOU (R9), X0 + MOVOU -16(R9)(R8*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm10B + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm10B_memmove_move_33through64: + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + +memmove_end_copy_repeat_emit_encodeBetterBlockAsm10B: + MOVQ SI, AX + JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm10B + +memmove_long_repeat_emit_encodeBetterBlockAsm10B: + LEAQ (AX)(R8*1), SI + + // genMemMoveLong + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVQ R8, R11 + SHRQ $0x05, R11 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 + LEAQ -32(R9)(R12*1), R10 + LEAQ -32(AX)(R12*1), R13 + +emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm10Blarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm10Blarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32: + MOVOU -32(R9)(R12*1), X4 + MOVOU -16(R9)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ R8, R12 + JAE emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + MOVQ SI, AX + +emit_literal_done_repeat_emit_encodeBetterBlockAsm10B: + ADDL $0x07, CX + MOVL CX, SI + SUBL 16(SP), SI + MOVQ src_len+32(FP), R8 + SUBL CX, R8 + LEAQ (DX)(CX*1), R9 + LEAQ (DX)(SI*1), SI + + // matchLen + XORL R11, R11 + CMPL R8, $0x08 + JL matchlen_match4_repeat_extend_encodeBetterBlockAsm10B + +matchlen_loopback_repeat_extend_encodeBetterBlockAsm10B: + MOVQ (R9)(R11*1), R10 + XORQ (SI)(R11*1), R10 + TESTQ R10, R10 + JZ matchlen_loop_repeat_extend_encodeBetterBlockAsm10B + +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP repeat_extend_forward_end_encodeBetterBlockAsm10B + +matchlen_loop_repeat_extend_encodeBetterBlockAsm10B: + LEAL -8(R8), R8 + LEAL 8(R11), R11 + CMPL R8, $0x08 + JGE matchlen_loopback_repeat_extend_encodeBetterBlockAsm10B + JZ repeat_extend_forward_end_encodeBetterBlockAsm10B + +matchlen_match4_repeat_extend_encodeBetterBlockAsm10B: + CMPL R8, $0x04 + JL matchlen_match2_repeat_extend_encodeBetterBlockAsm10B + MOVL (R9)(R11*1), R10 + CMPL (SI)(R11*1), R10 + JNE matchlen_match2_repeat_extend_encodeBetterBlockAsm10B + SUBL $0x04, R8 + LEAL 4(R11), R11 + +matchlen_match2_repeat_extend_encodeBetterBlockAsm10B: + CMPL R8, $0x02 + JL matchlen_match1_repeat_extend_encodeBetterBlockAsm10B + MOVW (R9)(R11*1), R10 + CMPW (SI)(R11*1), R10 + JNE matchlen_match1_repeat_extend_encodeBetterBlockAsm10B + SUBL $0x02, R8 + LEAL 2(R11), R11 + +matchlen_match1_repeat_extend_encodeBetterBlockAsm10B: + CMPL R8, $0x01 + JL repeat_extend_forward_end_encodeBetterBlockAsm10B + MOVB (R9)(R11*1), R10 + CMPB (SI)(R11*1), R10 + JNE repeat_extend_forward_end_encodeBetterBlockAsm10B + LEAL 1(R11), R11 + +repeat_extend_forward_end_encodeBetterBlockAsm10B: + ADDL R11, CX + MOVL CX, SI + SUBL DI, SI + MOVL 16(SP), DI + + // emitRepeat + MOVL SI, R8 + LEAL -4(SI), SI + CMPL R8, $0x08 + JLE repeat_two_match_repeat_encodeBetterBlockAsm10B + CMPL R8, $0x0c + JGE cant_repeat_two_offset_match_repeat_encodeBetterBlockAsm10B + CMPL DI, $0x00000800 + JLT repeat_two_offset_match_repeat_encodeBetterBlockAsm10B + +cant_repeat_two_offset_match_repeat_encodeBetterBlockAsm10B: + CMPL SI, $0x00000104 + JLT repeat_three_match_repeat_encodeBetterBlockAsm10B + LEAL -256(SI), SI + MOVW $0x0019, (AX) + MOVW SI, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBetterBlockAsm10B + +repeat_three_match_repeat_encodeBetterBlockAsm10B: + LEAL -4(SI), SI + MOVW $0x0015, (AX) + MOVB SI, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBetterBlockAsm10B + +repeat_two_match_repeat_encodeBetterBlockAsm10B: + SHLL $0x02, SI + ORL $0x01, SI + MOVW SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBetterBlockAsm10B + +repeat_two_offset_match_repeat_encodeBetterBlockAsm10B: + XORQ R8, R8 + LEAL 1(R8)(SI*4), SI + MOVB DI, 1(AX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) + ADDQ $0x02, AX + +repeat_end_emit_encodeBetterBlockAsm10B: + MOVL CX, 12(SP) + JMP search_loop_encodeBetterBlockAsm10B -search_loop_encodeBetterBlockAsm10B: - MOVL CX, SI - SUBL 12(SP), SI - SHRL $0x05, SI - LEAL 1(CX)(SI*1), SI - CMPL SI, 8(SP) - JGE emit_remainder_encodeBetterBlockAsm10B - MOVQ (DX)(CX*1), DI - MOVL SI, 20(SP) - MOVQ $0x0000cf1bbcdcbf9b, R9 - MOVQ $0x9e3779b1, SI - MOVQ DI, R10 - MOVQ DI, R11 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x34, R10 - SHLQ $0x20, R11 - IMULQ SI, R11 - SHRQ $0x36, R11 - MOVL 24(SP)(R10*4), SI - MOVL 16408(SP)(R11*4), R8 - MOVL CX, 24(SP)(R10*4) - MOVL CX, 16408(SP)(R11*4) - CMPL (DX)(SI*1), DI - JEQ candidate_match_encodeBetterBlockAsm10B - CMPL (DX)(R8*1), DI - JEQ candidateS_match_encodeBetterBlockAsm10B - MOVL 20(SP), CX - JMP search_loop_encodeBetterBlockAsm10B +no_repeat_found_encodeBetterBlockAsm10B: + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeBetterBlockAsm10B + CMPL (DX)(R8*1), DI + JEQ candidateS_match_encodeBetterBlockAsm10B + MOVL 20(SP), CX + JMP search_loop_encodeBetterBlockAsm10B candidateS_match_encodeBetterBlockAsm10B: SHRQ $0x08, DI @@ -9543,12 +10715,285 @@ search_loop_encodeBetterBlockAsm8B: MOVL 4120(SP)(R11*4), R8 MOVL CX, 24(SP)(R10*4) MOVL CX, 4120(SP)(R11*4) - CMPL (DX)(SI*1), DI + CMPQ (DX)(SI*1), DI JEQ candidate_match_encodeBetterBlockAsm8B - CMPL (DX)(R8*1), DI - JEQ candidateS_match_encodeBetterBlockAsm8B - MOVL 20(SP), CX - JMP search_loop_encodeBetterBlockAsm8B + CMPL 16(SP), $0x00 + JEQ no_repeat_found_encodeBetterBlockAsm8B + MOVL CX, R10 + SUBL 16(SP), R10 + MOVQ (DX)(R10*1), R11 + MOVQ DI, R10 + MOVQ $0x00ffffffffffff00, R12 + ANDQ R12, R10 + ANDQ R12, R11 + CMPQ R10, R11 + JNE no_repeat_found_encodeBetterBlockAsm8B + LEAL 1(CX), DI + MOVL 12(SP), SI + MOVL DI, R8 + SUBL 16(SP), R8 + JZ repeat_extend_back_end_encodeBetterBlockAsm8B + +repeat_extend_back_loop_encodeBetterBlockAsm8B: + CMPL DI, SI + JLE repeat_extend_back_end_encodeBetterBlockAsm8B + MOVB -1(DX)(R8*1), BL + MOVB -1(DX)(DI*1), R9 + CMPB BL, R9 + JNE repeat_extend_back_end_encodeBetterBlockAsm8B + LEAL -1(DI), DI + DECL R8 + JNZ repeat_extend_back_loop_encodeBetterBlockAsm8B + +repeat_extend_back_end_encodeBetterBlockAsm8B: + MOVL 12(SP), SI + CMPL SI, DI + JEQ emit_literal_done_repeat_emit_encodeBetterBlockAsm8B + MOVL DI, R8 + MOVL DI, 12(SP) + LEAQ (DX)(SI*1), R9 + SUBL SI, R8 + LEAL -1(R8), SI + CMPL SI, $0x3c + JLT one_byte_repeat_emit_encodeBetterBlockAsm8B + CMPL SI, $0x00000100 + JLT two_bytes_repeat_emit_encodeBetterBlockAsm8B + MOVB $0xf4, (AX) + MOVW SI, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_repeat_emit_encodeBetterBlockAsm8B + +two_bytes_repeat_emit_encodeBetterBlockAsm8B: + MOVB $0xf0, (AX) + MOVB SI, 1(AX) + ADDQ $0x02, AX + CMPL SI, $0x40 + JL memmove_repeat_emit_encodeBetterBlockAsm8B + JMP memmove_long_repeat_emit_encodeBetterBlockAsm8B + +one_byte_repeat_emit_encodeBetterBlockAsm8B: + SHLB $0x02, SI + MOVB SI, (AX) + ADDQ $0x01, AX + +memmove_repeat_emit_encodeBetterBlockAsm8B: + LEAQ (AX)(R8*1), SI + + // genMemMoveShort + CMPQ R8, $0x04 + JLE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm8B_memmove_move_4 + CMPQ R8, $0x08 + JB emit_lit_memmove_repeat_emit_encodeBetterBlockAsm8B_memmove_move_4through7 + CMPQ R8, $0x10 + JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm8B_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm8B_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_encodeBetterBlockAsm8B_memmove_move_33through64 + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm8B_memmove_move_4: + MOVL (R9), R10 + MOVL R10, (AX) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm8B + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm8B_memmove_move_4through7: + MOVL (R9), R10 + MOVL -4(R9)(R8*1), R9 + MOVL R10, (AX) + MOVL R9, -4(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm8B + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm8B_memmove_move_8through16: + MOVQ (R9), R10 + MOVQ -8(R9)(R8*1), R9 + MOVQ R10, (AX) + MOVQ R9, -8(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm8B + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm8B_memmove_move_17through32: + MOVOU (R9), X0 + MOVOU -16(R9)(R8*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R8*1) + JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm8B + +emit_lit_memmove_repeat_emit_encodeBetterBlockAsm8B_memmove_move_33through64: + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + +memmove_end_copy_repeat_emit_encodeBetterBlockAsm8B: + MOVQ SI, AX + JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm8B + +memmove_long_repeat_emit_encodeBetterBlockAsm8B: + LEAQ (AX)(R8*1), SI + + // genMemMoveLong + MOVOU (R9), X0 + MOVOU 16(R9), X1 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 + MOVQ R8, R11 + SHRQ $0x05, R11 + MOVQ AX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 + LEAQ -32(R9)(R12*1), R10 + LEAQ -32(AX)(R12*1), R13 + +emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm8Blarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm8Blarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32: + MOVOU -32(R9)(R12*1), X4 + MOVOU -16(R9)(R12*1), X5 + MOVOA X4, -32(AX)(R12*1) + MOVOA X5, -16(AX)(R12*1) + ADDQ $0x20, R12 + CMPQ R8, R12 + JAE emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + MOVQ SI, AX + +emit_literal_done_repeat_emit_encodeBetterBlockAsm8B: + ADDL $0x07, CX + MOVL CX, SI + SUBL 16(SP), SI + MOVQ src_len+32(FP), R8 + SUBL CX, R8 + LEAQ (DX)(CX*1), R9 + LEAQ (DX)(SI*1), SI + + // matchLen + XORL R11, R11 + CMPL R8, $0x08 + JL matchlen_match4_repeat_extend_encodeBetterBlockAsm8B + +matchlen_loopback_repeat_extend_encodeBetterBlockAsm8B: + MOVQ (R9)(R11*1), R10 + XORQ (SI)(R11*1), R10 + TESTQ R10, R10 + JZ matchlen_loop_repeat_extend_encodeBetterBlockAsm8B + +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 + JMP repeat_extend_forward_end_encodeBetterBlockAsm8B + +matchlen_loop_repeat_extend_encodeBetterBlockAsm8B: + LEAL -8(R8), R8 + LEAL 8(R11), R11 + CMPL R8, $0x08 + JGE matchlen_loopback_repeat_extend_encodeBetterBlockAsm8B + JZ repeat_extend_forward_end_encodeBetterBlockAsm8B + +matchlen_match4_repeat_extend_encodeBetterBlockAsm8B: + CMPL R8, $0x04 + JL matchlen_match2_repeat_extend_encodeBetterBlockAsm8B + MOVL (R9)(R11*1), R10 + CMPL (SI)(R11*1), R10 + JNE matchlen_match2_repeat_extend_encodeBetterBlockAsm8B + SUBL $0x04, R8 + LEAL 4(R11), R11 + +matchlen_match2_repeat_extend_encodeBetterBlockAsm8B: + CMPL R8, $0x02 + JL matchlen_match1_repeat_extend_encodeBetterBlockAsm8B + MOVW (R9)(R11*1), R10 + CMPW (SI)(R11*1), R10 + JNE matchlen_match1_repeat_extend_encodeBetterBlockAsm8B + SUBL $0x02, R8 + LEAL 2(R11), R11 + +matchlen_match1_repeat_extend_encodeBetterBlockAsm8B: + CMPL R8, $0x01 + JL repeat_extend_forward_end_encodeBetterBlockAsm8B + MOVB (R9)(R11*1), R10 + CMPB (SI)(R11*1), R10 + JNE repeat_extend_forward_end_encodeBetterBlockAsm8B + LEAL 1(R11), R11 + +repeat_extend_forward_end_encodeBetterBlockAsm8B: + ADDL R11, CX + MOVL CX, SI + SUBL DI, SI + MOVL 16(SP), DI + + // emitRepeat + MOVL SI, DI + LEAL -4(SI), SI + CMPL DI, $0x08 + JLE repeat_two_match_repeat_encodeBetterBlockAsm8B + CMPL DI, $0x0c + JGE cant_repeat_two_offset_match_repeat_encodeBetterBlockAsm8B + +cant_repeat_two_offset_match_repeat_encodeBetterBlockAsm8B: + CMPL SI, $0x00000104 + JLT repeat_three_match_repeat_encodeBetterBlockAsm8B + LEAL -256(SI), SI + MOVW $0x0019, (AX) + MOVW SI, 2(AX) + ADDQ $0x04, AX + JMP repeat_end_emit_encodeBetterBlockAsm8B + +repeat_three_match_repeat_encodeBetterBlockAsm8B: + LEAL -4(SI), SI + MOVW $0x0015, (AX) + MOVB SI, 2(AX) + ADDQ $0x03, AX + JMP repeat_end_emit_encodeBetterBlockAsm8B + +repeat_two_match_repeat_encodeBetterBlockAsm8B: + SHLL $0x02, SI + ORL $0x01, SI + MOVW SI, (AX) + ADDQ $0x02, AX + JMP repeat_end_emit_encodeBetterBlockAsm8B + XORQ R8, R8 + LEAL 1(R8)(SI*4), SI + MOVB DI, 1(AX) + SARL $0x08, DI + SHLL $0x05, DI + ORL DI, SI + MOVB SI, (AX) + ADDQ $0x02, AX + +repeat_end_emit_encodeBetterBlockAsm8B: + MOVL CX, 12(SP) + JMP search_loop_encodeBetterBlockAsm8B + +no_repeat_found_encodeBetterBlockAsm8B: + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeBetterBlockAsm8B + CMPL (DX)(R8*1), DI + JEQ candidateS_match_encodeBetterBlockAsm8B + MOVL 20(SP), CX + JMP search_loop_encodeBetterBlockAsm8B candidateS_match_encodeBetterBlockAsm8B: SHRQ $0x08, DI From 14fe46225a922eeead6060b925ae30bc653851c5 Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Wed, 15 Jun 2022 18:35:29 +0200 Subject: [PATCH 2/7] Generate with 1.17 --- s2/encodeblock_amd64.go | 1 + s2/encodeblock_amd64.s | 1 + 2 files changed, 2 insertions(+) diff --git a/s2/encodeblock_amd64.go b/s2/encodeblock_amd64.go index d10f60ae2c..88f27c0990 100644 --- a/s2/encodeblock_amd64.go +++ b/s2/encodeblock_amd64.go @@ -1,6 +1,7 @@ // Code generated by command: go run gen.go -out ../encodeblock_amd64.s -stubs ../encodeblock_amd64.go -pkg=s2. DO NOT EDIT. //go:build !appengine && !noasm && gc && !noasm +// +build !appengine,!noasm,gc,!noasm package s2 diff --git a/s2/encodeblock_amd64.s b/s2/encodeblock_amd64.s index 2ae26c1a03..82125aeb5e 100644 --- a/s2/encodeblock_amd64.s +++ b/s2/encodeblock_amd64.s @@ -1,6 +1,7 @@ // Code generated by command: go run gen.go -out ../encodeblock_amd64.s -stubs ../encodeblock_amd64.go -pkg=s2. DO NOT EDIT. //go:build !appengine && !noasm && gc && !noasm +// +build !appengine,!noasm,gc,!noasm #include "textflag.h" From 20f1d1e5a34467661e9cb8e772f4d4632f7ef49c Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Mon, 20 Jun 2022 13:36:52 +0200 Subject: [PATCH 3/7] s2: Improve compression on better Use bigger long match table and check candidates at different lengths. --- s2/_generate/gen.go | 51 +- s2/encode_better.go | 16 +- s2/encodeblock_amd64.s | 1746 +++++----------------------------------- 3 files changed, 242 insertions(+), 1571 deletions(-) diff --git a/s2/_generate/gen.go b/s2/_generate/gen.go index c1e374a185..577fef2fee 100644 --- a/s2/_generate/gen.go +++ b/s2/_generate/gen.go @@ -59,12 +59,12 @@ func main() { o.outputMargin = 6 o.maxSkip = 100 // Blocks can be long, limit max skipping. - o.genEncodeBetterBlockAsm("encodeBetterBlockAsm", 16, 7, 7, limit14B) - o.genEncodeBetterBlockAsm("encodeBetterBlockAsm4MB", 16, 7, 7, 4<<20) + o.genEncodeBetterBlockAsm("encodeBetterBlockAsm", 17, 14, 7, 7, limit14B) + o.genEncodeBetterBlockAsm("encodeBetterBlockAsm4MB", 17, 14, 7, 7, 4<<20) o.maxSkip = 0 - o.genEncodeBetterBlockAsm("encodeBetterBlockAsm12B", 14, 6, 6, limit12B) - o.genEncodeBetterBlockAsm("encodeBetterBlockAsm10B", 12, 5, 6, limit10B) - o.genEncodeBetterBlockAsm("encodeBetterBlockAsm8B", 10, 4, 6, limit8B) + o.genEncodeBetterBlockAsm("encodeBetterBlockAsm12B", 14, 12, 6, 6, limit12B) + o.genEncodeBetterBlockAsm("encodeBetterBlockAsm10B", 12, 10, 5, 6, limit10B) + o.genEncodeBetterBlockAsm("encodeBetterBlockAsm8B", 10, 8, 4, 6, limit8B) // Snappy compatible o.snappy = true @@ -76,12 +76,12 @@ func main() { o.genEncodeBlockAsm("encodeSnappyBlockAsm8B", 8, 4, 4, limit8B) o.maxSkip = 100 - o.genEncodeBetterBlockAsm("encodeSnappyBetterBlockAsm", 16, 7, 7, limit14B) + o.genEncodeBetterBlockAsm("encodeSnappyBetterBlockAsm", 17, 14, 7, 7, limit14B) o.maxSkip = 0 - o.genEncodeBetterBlockAsm("encodeSnappyBetterBlockAsm64K", 16, 7, 7, 64<<10-1) - o.genEncodeBetterBlockAsm("encodeSnappyBetterBlockAsm12B", 14, 6, 6, limit12B) - o.genEncodeBetterBlockAsm("encodeSnappyBetterBlockAsm10B", 12, 5, 6, limit10B) - o.genEncodeBetterBlockAsm("encodeSnappyBetterBlockAsm8B", 10, 4, 6, limit8B) + o.genEncodeBetterBlockAsm("encodeSnappyBetterBlockAsm64K", 16, 14, 7, 7, 64<<10-1) + o.genEncodeBetterBlockAsm("encodeSnappyBetterBlockAsm12B", 14, 12, 6, 6, limit12B) + o.genEncodeBetterBlockAsm("encodeSnappyBetterBlockAsm10B", 12, 10, 5, 6, limit10B) + o.genEncodeBetterBlockAsm("encodeSnappyBetterBlockAsm8B", 10, 8, 4, 6, limit8B) o.snappy = false o.outputMargin = 0 @@ -785,7 +785,7 @@ func maxLitOverheadFor(n int) int { return 5 } -func (o options) genEncodeBetterBlockAsm(name string, lTableBits, skipLog, lHashBytes, maxLen int) { +func (o options) genEncodeBetterBlockAsm(name string, lTableBits, sTableBits, skipLog, lHashBytes, maxLen int) { TEXT(name, 0, "func(dst, src []byte) int") Doc(name+" encodes a non-empty src to a guaranteed-large-enough dst.", fmt.Sprintf("Maximum input %d bytes.", maxLen), @@ -797,7 +797,6 @@ func (o options) genEncodeBetterBlockAsm(name string, lTableBits, skipLog, lHash } var literalMaxOverhead = maxLitOverheadFor(maxLen) - var sTableBits = lTableBits - 2 const sHashBytes = 4 o.maxLen = maxLen @@ -998,14 +997,26 @@ func (o options) genEncodeBetterBlockAsm(name string, lTableBits, skipLog, lHash MOVL(s, sTab.Idx(hash1, 4)) } - if !o.snappy { - // If we have at least 8 bytes match, choose that first. - CMPQ(Mem{Base: src, Index: candidate, Scale: 1}, cv.As64()) - JEQ(LabelRef("candidate_match_" + name)) - } + longVal := GP64() + shortVal := GP64() + MOVQ(Mem{Base: src, Index: candidate, Scale: 1}, longVal) + MOVQ(Mem{Base: src, Index: candidateS, Scale: 1}, shortVal) + + // If we have at least 8 bytes match, choose that first. + CMPQ(longVal, cv.As64()) + JEQ(LabelRef("candidate_match_" + name)) + + CMPQ(shortVal, cv.As64()) + JNE(LabelRef("no_short_found_" + name)) + MOVL(candidateS.As32(), candidate.As32()) + JMP(LabelRef("candidate_match_" + name)) + + Label("no_short_found_" + name) + MOVL(longVal.As32(), longVal.As32()) // En/disable repeat matching. - if !o.snappy { + // Too small improvement + if false { { CMPL(repeatL, U8(0)) JEQ(LabelRef("no_repeat_found_" + name)) @@ -1150,11 +1161,11 @@ func (o options) genEncodeBetterBlockAsm(name string, lTableBits, skipLog, lHash JG(ok) }) - CMPL(Mem{Base: src, Index: candidate, Scale: 1}, cv.As32()) + CMPL(longVal.As32(), cv.As32()) JEQ(LabelRef("candidate_match_" + name)) //if uint32(cv) == load32(src, candidateS) - CMPL(Mem{Base: src, Index: candidateS, Scale: 1}, cv.As32()) + CMPL(shortVal.As32(), cv.As32()) JEQ(LabelRef("candidateS_match_" + name)) // No match found, next loop diff --git a/s2/encode_better.go b/s2/encode_better.go index 48e5d1d3c9..889ffde9a5 100644 --- a/s2/encode_better.go +++ b/s2/encode_better.go @@ -56,7 +56,7 @@ func encodeBlockBetterGo(dst, src []byte) (d int) { // Initialize the hash tables. const ( // Long hash matches. - lTableBits = 16 + lTableBits = 17 maxLTableSize = 1 << lTableBits // Short hash matches. @@ -98,10 +98,16 @@ func encodeBlockBetterGo(dst, src []byte) (d int) { sTable[hashS] = uint32(s) valLong := load64(src, candidateL) - // If we have at least 8 bytes match, choose that first. + valShort := load64(src, candidateS) + + // If long matches at least 8 bytes, use that. if cv == valLong { break } + if cv == valShort { + candidateL = candidateS + break + } // Check repeat at offset checkRep. const checkRep = 1 @@ -110,7 +116,7 @@ func encodeBlockBetterGo(dst, src []byte) (d int) { // regressions significantly. const wantRepeatBytes = 6 const repeatMask = ((1 << (wantRepeatBytes * 8)) - 1) << (8 * checkRep) - if repeat > 0 && cv&repeatMask == load64(src, s-repeat)&repeatMask { + if false && repeat > 0 && cv&repeatMask == load64(src, s-repeat)&repeatMask { base := s + checkRep // Extend back for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; { @@ -149,13 +155,13 @@ func encodeBlockBetterGo(dst, src []byte) (d int) { continue } - // If long matches at least 4 bytes, use that. + // Long likely matches 7, so take that. if uint32(cv) == uint32(valLong) { break } // Check our short candidate - if uint32(cv) == load32(src, candidateS) { + if uint32(cv) == uint32(valShort) { // Try a long candidate at s+1 hashL = hash7(cv>>8, lTableBits) candidateL = int(lTable[hashL]) diff --git a/s2/encodeblock_amd64.s b/s2/encodeblock_amd64.s index 82125aeb5e..a12c5918ff 100644 --- a/s2/encodeblock_amd64.s +++ b/s2/encodeblock_amd64.s @@ -5743,9 +5743,9 @@ emit_literal_done_emit_remainder_encodeBlockAsm8B: // func encodeBetterBlockAsm(dst []byte, src []byte) int // Requires: BMI, SSE2 -TEXT ·encodeBetterBlockAsm(SB), $327704-56 +TEXT ·encodeBetterBlockAsm(SB), $589848-56 MOVQ dst_base+0(FP), AX - MOVQ $0x00000a00, CX + MOVQ $0x00001200, CX LEAQ 24(SP), DX PXOR X0, X0 @@ -5797,336 +5797,27 @@ check_maxskip_cont_encodeBetterBlockAsm: MOVQ DI, R11 SHLQ $0x08, R10 IMULQ R9, R10 - SHRQ $0x30, R10 + SHRQ $0x2f, R10 SHLQ $0x20, R11 IMULQ SI, R11 SHRQ $0x32, R11 MOVL 24(SP)(R10*4), SI - MOVL 262168(SP)(R11*4), R8 + MOVL 524312(SP)(R11*4), R8 MOVL CX, 24(SP)(R10*4) - MOVL CX, 262168(SP)(R11*4) - CMPQ (DX)(SI*1), DI + MOVL CX, 524312(SP)(R11*4) + MOVQ (DX)(SI*1), R10 + MOVQ (DX)(R8*1), R11 + CMPQ R10, DI JEQ candidate_match_encodeBetterBlockAsm - CMPL 16(SP), $0x00 - JEQ no_repeat_found_encodeBetterBlockAsm - MOVL CX, R10 - SUBL 16(SP), R10 - MOVQ (DX)(R10*1), R11 - MOVQ DI, R10 - MOVQ $0x00ffffffffffff00, R12 - ANDQ R12, R10 - ANDQ R12, R11 - CMPQ R10, R11 - JNE no_repeat_found_encodeBetterBlockAsm - LEAL 1(CX), DI - MOVL 12(SP), SI - MOVL DI, R8 - SUBL 16(SP), R8 - JZ repeat_extend_back_end_encodeBetterBlockAsm - -repeat_extend_back_loop_encodeBetterBlockAsm: - CMPL DI, SI - JLE repeat_extend_back_end_encodeBetterBlockAsm - MOVB -1(DX)(R8*1), BL - MOVB -1(DX)(DI*1), R9 - CMPB BL, R9 - JNE repeat_extend_back_end_encodeBetterBlockAsm - LEAL -1(DI), DI - DECL R8 - JNZ repeat_extend_back_loop_encodeBetterBlockAsm - -repeat_extend_back_end_encodeBetterBlockAsm: - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_repeat_emit_encodeBetterBlockAsm - MOVL DI, R8 - MOVL DI, 12(SP) - LEAQ (DX)(SI*1), R9 - SUBL SI, R8 - LEAL -1(R8), SI - CMPL SI, $0x3c - JLT one_byte_repeat_emit_encodeBetterBlockAsm - CMPL SI, $0x00000100 - JLT two_bytes_repeat_emit_encodeBetterBlockAsm - CMPL SI, $0x00010000 - JLT three_bytes_repeat_emit_encodeBetterBlockAsm - CMPL SI, $0x01000000 - JLT four_bytes_repeat_emit_encodeBetterBlockAsm - MOVB $0xfc, (AX) - MOVL SI, 1(AX) - ADDQ $0x05, AX - JMP memmove_long_repeat_emit_encodeBetterBlockAsm - -four_bytes_repeat_emit_encodeBetterBlockAsm: - MOVL SI, R10 - SHRL $0x10, R10 - MOVB $0xf8, (AX) - MOVW SI, 1(AX) - MOVB R10, 3(AX) - ADDQ $0x04, AX - JMP memmove_long_repeat_emit_encodeBetterBlockAsm - -three_bytes_repeat_emit_encodeBetterBlockAsm: - MOVB $0xf4, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_repeat_emit_encodeBetterBlockAsm - -two_bytes_repeat_emit_encodeBetterBlockAsm: - MOVB $0xf0, (AX) - MOVB SI, 1(AX) - ADDQ $0x02, AX - CMPL SI, $0x40 - JL memmove_repeat_emit_encodeBetterBlockAsm - JMP memmove_long_repeat_emit_encodeBetterBlockAsm - -one_byte_repeat_emit_encodeBetterBlockAsm: - SHLB $0x02, SI - MOVB SI, (AX) - ADDQ $0x01, AX - -memmove_repeat_emit_encodeBetterBlockAsm: - LEAQ (AX)(R8*1), SI - - // genMemMoveShort - CMPQ R8, $0x04 - JLE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_4 - CMPQ R8, $0x08 - JB emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_4through7 - CMPQ R8, $0x10 - JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_8through16 - CMPQ R8, $0x20 - JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_17through32 - JMP emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_33through64 - -emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_4: - MOVL (R9), R10 - MOVL R10, (AX) - JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm - -emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_4through7: - MOVL (R9), R10 - MOVL -4(R9)(R8*1), R9 - MOVL R10, (AX) - MOVL R9, -4(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm - -emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_8through16: - MOVQ (R9), R10 - MOVQ -8(R9)(R8*1), R9 - MOVQ R10, (AX) - MOVQ R9, -8(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm - -emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(R8*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm - -emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - -memmove_end_copy_repeat_emit_encodeBetterBlockAsm: - MOVQ SI, AX - JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm - -memmove_long_repeat_emit_encodeBetterBlockAsm: - LEAQ (AX)(R8*1), SI - - // genMemMoveLong - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVQ R8, R11 - SHRQ $0x05, R11 - MOVQ AX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R12 - SUBQ R10, R12 - DECQ R11 - JA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32 - LEAQ -32(R9)(R12*1), R10 - LEAQ -32(AX)(R12*1), R13 - -emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsmlarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R13) - MOVOA X5, 16(R13) - ADDQ $0x20, R13 - ADDQ $0x20, R10 - ADDQ $0x20, R12 - DECQ R11 - JNA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsmlarge_big_loop_back - -emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32: - MOVOU -32(R9)(R12*1), X4 - MOVOU -16(R9)(R12*1), X5 - MOVOA X4, -32(AX)(R12*1) - MOVOA X5, -16(AX)(R12*1) - ADDQ $0x20, R12 - CMPQ R8, R12 - JAE emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ SI, AX - -emit_literal_done_repeat_emit_encodeBetterBlockAsm: - ADDL $0x07, CX - MOVL CX, SI - SUBL 16(SP), SI - MOVQ src_len+32(FP), R8 - SUBL CX, R8 - LEAQ (DX)(CX*1), R9 - LEAQ (DX)(SI*1), SI - - // matchLen - XORL R11, R11 - CMPL R8, $0x08 - JL matchlen_match4_repeat_extend_encodeBetterBlockAsm - -matchlen_loopback_repeat_extend_encodeBetterBlockAsm: - MOVQ (R9)(R11*1), R10 - XORQ (SI)(R11*1), R10 - TESTQ R10, R10 - JZ matchlen_loop_repeat_extend_encodeBetterBlockAsm - -#ifdef GOAMD64_v3 - TZCNTQ R10, R10 - -#else - BSFQ R10, R10 - -#endif - SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 - JMP repeat_extend_forward_end_encodeBetterBlockAsm - -matchlen_loop_repeat_extend_encodeBetterBlockAsm: - LEAL -8(R8), R8 - LEAL 8(R11), R11 - CMPL R8, $0x08 - JGE matchlen_loopback_repeat_extend_encodeBetterBlockAsm - JZ repeat_extend_forward_end_encodeBetterBlockAsm - -matchlen_match4_repeat_extend_encodeBetterBlockAsm: - CMPL R8, $0x04 - JL matchlen_match2_repeat_extend_encodeBetterBlockAsm - MOVL (R9)(R11*1), R10 - CMPL (SI)(R11*1), R10 - JNE matchlen_match2_repeat_extend_encodeBetterBlockAsm - SUBL $0x04, R8 - LEAL 4(R11), R11 - -matchlen_match2_repeat_extend_encodeBetterBlockAsm: - CMPL R8, $0x02 - JL matchlen_match1_repeat_extend_encodeBetterBlockAsm - MOVW (R9)(R11*1), R10 - CMPW (SI)(R11*1), R10 - JNE matchlen_match1_repeat_extend_encodeBetterBlockAsm - SUBL $0x02, R8 - LEAL 2(R11), R11 - -matchlen_match1_repeat_extend_encodeBetterBlockAsm: - CMPL R8, $0x01 - JL repeat_extend_forward_end_encodeBetterBlockAsm - MOVB (R9)(R11*1), R10 - CMPB (SI)(R11*1), R10 - JNE repeat_extend_forward_end_encodeBetterBlockAsm - LEAL 1(R11), R11 - -repeat_extend_forward_end_encodeBetterBlockAsm: - ADDL R11, CX - MOVL CX, SI - SUBL DI, SI - MOVL 16(SP), DI - - // emitRepeat -emit_repeat_again_match_repeat_encodeBetterBlockAsm: - MOVL SI, R8 - LEAL -4(SI), SI - CMPL R8, $0x08 - JLE repeat_two_match_repeat_encodeBetterBlockAsm - CMPL R8, $0x0c - JGE cant_repeat_two_offset_match_repeat_encodeBetterBlockAsm - CMPL DI, $0x00000800 - JLT repeat_two_offset_match_repeat_encodeBetterBlockAsm - -cant_repeat_two_offset_match_repeat_encodeBetterBlockAsm: - CMPL SI, $0x00000104 - JLT repeat_three_match_repeat_encodeBetterBlockAsm - CMPL SI, $0x00010100 - JLT repeat_four_match_repeat_encodeBetterBlockAsm - CMPL SI, $0x0100ffff - JLT repeat_five_match_repeat_encodeBetterBlockAsm - LEAL -16842747(SI), SI - MOVW $0x001d, (AX) - MOVW $0xfffb, 2(AX) - MOVB $0xff, 4(AX) - ADDQ $0x05, AX - JMP emit_repeat_again_match_repeat_encodeBetterBlockAsm - -repeat_five_match_repeat_encodeBetterBlockAsm: - LEAL -65536(SI), SI - MOVL SI, DI - MOVW $0x001d, (AX) - MOVW SI, 2(AX) - SARL $0x10, DI - MOVB DI, 4(AX) - ADDQ $0x05, AX - JMP repeat_end_emit_encodeBetterBlockAsm - -repeat_four_match_repeat_encodeBetterBlockAsm: - LEAL -256(SI), SI - MOVW $0x0019, (AX) - MOVW SI, 2(AX) - ADDQ $0x04, AX - JMP repeat_end_emit_encodeBetterBlockAsm - -repeat_three_match_repeat_encodeBetterBlockAsm: - LEAL -4(SI), SI - MOVW $0x0015, (AX) - MOVB SI, 2(AX) - ADDQ $0x03, AX - JMP repeat_end_emit_encodeBetterBlockAsm - -repeat_two_match_repeat_encodeBetterBlockAsm: - SHLL $0x02, SI - ORL $0x01, SI - MOVW SI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBetterBlockAsm - -repeat_two_offset_match_repeat_encodeBetterBlockAsm: - XORQ R8, R8 - LEAL 1(R8)(SI*4), SI - MOVB DI, 1(AX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, SI - MOVB SI, (AX) - ADDQ $0x02, AX - -repeat_end_emit_encodeBetterBlockAsm: - MOVL CX, 12(SP) - JMP search_loop_encodeBetterBlockAsm + CMPQ R11, DI + JNE no_short_found_encodeBetterBlockAsm + MOVL R8, SI + JMP candidate_match_encodeBetterBlockAsm -no_repeat_found_encodeBetterBlockAsm: - CMPL (DX)(SI*1), DI +no_short_found_encodeBetterBlockAsm: + CMPL R10, DI JEQ candidate_match_encodeBetterBlockAsm - CMPL (DX)(R8*1), DI + CMPL R11, DI JEQ candidateS_match_encodeBetterBlockAsm MOVL 20(SP), CX JMP search_loop_encodeBetterBlockAsm @@ -6136,7 +5827,7 @@ candidateS_match_encodeBetterBlockAsm: MOVQ DI, R10 SHLQ $0x08, R10 IMULQ R9, R10 - SHRQ $0x30, R10 + SHRQ $0x2f, R10 MOVL 24(SP)(R10*4), SI INCL CX MOVL CX, 24(SP)(R10*4) @@ -6922,10 +6613,10 @@ match_nolit_dst_ok_encodeBetterBlockAsm: MOVQ -2(DX)(CX*1), R9 SHLQ $0x08, R10 IMULQ SI, R10 - SHRQ $0x30, R10 + SHRQ $0x2f, R10 SHLQ $0x08, R13 IMULQ SI, R13 - SHRQ $0x30, R13 + SHRQ $0x2f, R13 SHLQ $0x20, R11 IMULQ R8, R11 SHRQ $0x32, R11 @@ -6934,8 +6625,8 @@ match_nolit_dst_ok_encodeBetterBlockAsm: SHRQ $0x32, R12 MOVL DI, 24(SP)(R10*4) MOVL R14, 24(SP)(R13*4) - MOVL R14, 262168(SP)(R11*4) - MOVL R15, 262168(SP)(R12*4) + MOVL R14, 524312(SP)(R11*4) + MOVL R15, 524312(SP)(R12*4) MOVQ R9, R10 MOVQ R9, R11 SHRQ $0x08, R11 @@ -6944,15 +6635,15 @@ match_nolit_dst_ok_encodeBetterBlockAsm: LEAL -1(CX), DI SHLQ $0x08, R10 IMULQ SI, R10 - SHRQ $0x30, R10 + SHRQ $0x2f, R10 SHLQ $0x20, R11 IMULQ R8, R11 SHRQ $0x32, R11 SHLQ $0x08, R13 IMULQ SI, R13 - SHRQ $0x30, R13 + SHRQ $0x2f, R13 MOVL R9, 24(SP)(R10*4) - MOVL DI, 262168(SP)(R11*4) + MOVL DI, 524312(SP)(R11*4) MOVL DI, 24(SP)(R13*4) JMP search_loop_encodeBetterBlockAsm @@ -7134,9 +6825,9 @@ emit_literal_done_emit_remainder_encodeBetterBlockAsm: // func encodeBetterBlockAsm4MB(dst []byte, src []byte) int // Requires: BMI, SSE2 -TEXT ·encodeBetterBlockAsm4MB(SB), $327704-56 +TEXT ·encodeBetterBlockAsm4MB(SB), $589848-56 MOVQ dst_base+0(FP), AX - MOVQ $0x00000a00, CX + MOVQ $0x00001200, CX LEAQ 24(SP), DX PXOR X0, X0 @@ -7188,317 +6879,27 @@ check_maxskip_cont_encodeBetterBlockAsm4MB: MOVQ DI, R11 SHLQ $0x08, R10 IMULQ R9, R10 - SHRQ $0x30, R10 + SHRQ $0x2f, R10 SHLQ $0x20, R11 IMULQ SI, R11 SHRQ $0x32, R11 MOVL 24(SP)(R10*4), SI - MOVL 262168(SP)(R11*4), R8 + MOVL 524312(SP)(R11*4), R8 MOVL CX, 24(SP)(R10*4) - MOVL CX, 262168(SP)(R11*4) - CMPQ (DX)(SI*1), DI + MOVL CX, 524312(SP)(R11*4) + MOVQ (DX)(SI*1), R10 + MOVQ (DX)(R8*1), R11 + CMPQ R10, DI JEQ candidate_match_encodeBetterBlockAsm4MB - CMPL 16(SP), $0x00 - JEQ no_repeat_found_encodeBetterBlockAsm4MB - MOVL CX, R10 - SUBL 16(SP), R10 - MOVQ (DX)(R10*1), R11 - MOVQ DI, R10 - MOVQ $0x00ffffffffffff00, R12 - ANDQ R12, R10 - ANDQ R12, R11 - CMPQ R10, R11 - JNE no_repeat_found_encodeBetterBlockAsm4MB - LEAL 1(CX), DI - MOVL 12(SP), SI - MOVL DI, R8 - SUBL 16(SP), R8 - JZ repeat_extend_back_end_encodeBetterBlockAsm4MB - -repeat_extend_back_loop_encodeBetterBlockAsm4MB: - CMPL DI, SI - JLE repeat_extend_back_end_encodeBetterBlockAsm4MB - MOVB -1(DX)(R8*1), BL - MOVB -1(DX)(DI*1), R9 - CMPB BL, R9 - JNE repeat_extend_back_end_encodeBetterBlockAsm4MB - LEAL -1(DI), DI - DECL R8 - JNZ repeat_extend_back_loop_encodeBetterBlockAsm4MB + CMPQ R11, DI + JNE no_short_found_encodeBetterBlockAsm4MB + MOVL R8, SI + JMP candidate_match_encodeBetterBlockAsm4MB -repeat_extend_back_end_encodeBetterBlockAsm4MB: - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_repeat_emit_encodeBetterBlockAsm4MB - MOVL DI, R8 - MOVL DI, 12(SP) - LEAQ (DX)(SI*1), R9 - SUBL SI, R8 - LEAL -1(R8), SI - CMPL SI, $0x3c - JLT one_byte_repeat_emit_encodeBetterBlockAsm4MB - CMPL SI, $0x00000100 - JLT two_bytes_repeat_emit_encodeBetterBlockAsm4MB - CMPL SI, $0x00010000 - JLT three_bytes_repeat_emit_encodeBetterBlockAsm4MB - MOVL SI, R10 - SHRL $0x10, R10 - MOVB $0xf8, (AX) - MOVW SI, 1(AX) - MOVB R10, 3(AX) - ADDQ $0x04, AX - JMP memmove_long_repeat_emit_encodeBetterBlockAsm4MB - -three_bytes_repeat_emit_encodeBetterBlockAsm4MB: - MOVB $0xf4, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_repeat_emit_encodeBetterBlockAsm4MB - -two_bytes_repeat_emit_encodeBetterBlockAsm4MB: - MOVB $0xf0, (AX) - MOVB SI, 1(AX) - ADDQ $0x02, AX - CMPL SI, $0x40 - JL memmove_repeat_emit_encodeBetterBlockAsm4MB - JMP memmove_long_repeat_emit_encodeBetterBlockAsm4MB - -one_byte_repeat_emit_encodeBetterBlockAsm4MB: - SHLB $0x02, SI - MOVB SI, (AX) - ADDQ $0x01, AX - -memmove_repeat_emit_encodeBetterBlockAsm4MB: - LEAQ (AX)(R8*1), SI - - // genMemMoveShort - CMPQ R8, $0x04 - JLE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm4MB_memmove_move_4 - CMPQ R8, $0x08 - JB emit_lit_memmove_repeat_emit_encodeBetterBlockAsm4MB_memmove_move_4through7 - CMPQ R8, $0x10 - JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm4MB_memmove_move_8through16 - CMPQ R8, $0x20 - JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm4MB_memmove_move_17through32 - JMP emit_lit_memmove_repeat_emit_encodeBetterBlockAsm4MB_memmove_move_33through64 - -emit_lit_memmove_repeat_emit_encodeBetterBlockAsm4MB_memmove_move_4: - MOVL (R9), R10 - MOVL R10, (AX) - JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm4MB - -emit_lit_memmove_repeat_emit_encodeBetterBlockAsm4MB_memmove_move_4through7: - MOVL (R9), R10 - MOVL -4(R9)(R8*1), R9 - MOVL R10, (AX) - MOVL R9, -4(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm4MB - -emit_lit_memmove_repeat_emit_encodeBetterBlockAsm4MB_memmove_move_8through16: - MOVQ (R9), R10 - MOVQ -8(R9)(R8*1), R9 - MOVQ R10, (AX) - MOVQ R9, -8(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm4MB - -emit_lit_memmove_repeat_emit_encodeBetterBlockAsm4MB_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(R8*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm4MB - -emit_lit_memmove_repeat_emit_encodeBetterBlockAsm4MB_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - -memmove_end_copy_repeat_emit_encodeBetterBlockAsm4MB: - MOVQ SI, AX - JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm4MB - -memmove_long_repeat_emit_encodeBetterBlockAsm4MB: - LEAQ (AX)(R8*1), SI - - // genMemMoveLong - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVQ R8, R11 - SHRQ $0x05, R11 - MOVQ AX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R12 - SUBQ R10, R12 - DECQ R11 - JA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 - LEAQ -32(R9)(R12*1), R10 - LEAQ -32(AX)(R12*1), R13 - -emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm4MBlarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R13) - MOVOA X5, 16(R13) - ADDQ $0x20, R13 - ADDQ $0x20, R10 - ADDQ $0x20, R12 - DECQ R11 - JNA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm4MBlarge_big_loop_back - -emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32: - MOVOU -32(R9)(R12*1), X4 - MOVOU -16(R9)(R12*1), X5 - MOVOA X4, -32(AX)(R12*1) - MOVOA X5, -16(AX)(R12*1) - ADDQ $0x20, R12 - CMPQ R8, R12 - JAE emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ SI, AX - -emit_literal_done_repeat_emit_encodeBetterBlockAsm4MB: - ADDL $0x07, CX - MOVL CX, SI - SUBL 16(SP), SI - MOVQ src_len+32(FP), R8 - SUBL CX, R8 - LEAQ (DX)(CX*1), R9 - LEAQ (DX)(SI*1), SI - - // matchLen - XORL R11, R11 - CMPL R8, $0x08 - JL matchlen_match4_repeat_extend_encodeBetterBlockAsm4MB - -matchlen_loopback_repeat_extend_encodeBetterBlockAsm4MB: - MOVQ (R9)(R11*1), R10 - XORQ (SI)(R11*1), R10 - TESTQ R10, R10 - JZ matchlen_loop_repeat_extend_encodeBetterBlockAsm4MB - -#ifdef GOAMD64_v3 - TZCNTQ R10, R10 - -#else - BSFQ R10, R10 - -#endif - SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 - JMP repeat_extend_forward_end_encodeBetterBlockAsm4MB - -matchlen_loop_repeat_extend_encodeBetterBlockAsm4MB: - LEAL -8(R8), R8 - LEAL 8(R11), R11 - CMPL R8, $0x08 - JGE matchlen_loopback_repeat_extend_encodeBetterBlockAsm4MB - JZ repeat_extend_forward_end_encodeBetterBlockAsm4MB - -matchlen_match4_repeat_extend_encodeBetterBlockAsm4MB: - CMPL R8, $0x04 - JL matchlen_match2_repeat_extend_encodeBetterBlockAsm4MB - MOVL (R9)(R11*1), R10 - CMPL (SI)(R11*1), R10 - JNE matchlen_match2_repeat_extend_encodeBetterBlockAsm4MB - SUBL $0x04, R8 - LEAL 4(R11), R11 - -matchlen_match2_repeat_extend_encodeBetterBlockAsm4MB: - CMPL R8, $0x02 - JL matchlen_match1_repeat_extend_encodeBetterBlockAsm4MB - MOVW (R9)(R11*1), R10 - CMPW (SI)(R11*1), R10 - JNE matchlen_match1_repeat_extend_encodeBetterBlockAsm4MB - SUBL $0x02, R8 - LEAL 2(R11), R11 - -matchlen_match1_repeat_extend_encodeBetterBlockAsm4MB: - CMPL R8, $0x01 - JL repeat_extend_forward_end_encodeBetterBlockAsm4MB - MOVB (R9)(R11*1), R10 - CMPB (SI)(R11*1), R10 - JNE repeat_extend_forward_end_encodeBetterBlockAsm4MB - LEAL 1(R11), R11 - -repeat_extend_forward_end_encodeBetterBlockAsm4MB: - ADDL R11, CX - MOVL CX, SI - SUBL DI, SI - MOVL 16(SP), DI - - // emitRepeat - MOVL SI, R8 - LEAL -4(SI), SI - CMPL R8, $0x08 - JLE repeat_two_match_repeat_encodeBetterBlockAsm4MB - CMPL R8, $0x0c - JGE cant_repeat_two_offset_match_repeat_encodeBetterBlockAsm4MB - CMPL DI, $0x00000800 - JLT repeat_two_offset_match_repeat_encodeBetterBlockAsm4MB - -cant_repeat_two_offset_match_repeat_encodeBetterBlockAsm4MB: - CMPL SI, $0x00000104 - JLT repeat_three_match_repeat_encodeBetterBlockAsm4MB - CMPL SI, $0x00010100 - JLT repeat_four_match_repeat_encodeBetterBlockAsm4MB - LEAL -65536(SI), SI - MOVL SI, DI - MOVW $0x001d, (AX) - MOVW SI, 2(AX) - SARL $0x10, DI - MOVB DI, 4(AX) - ADDQ $0x05, AX - JMP repeat_end_emit_encodeBetterBlockAsm4MB - -repeat_four_match_repeat_encodeBetterBlockAsm4MB: - LEAL -256(SI), SI - MOVW $0x0019, (AX) - MOVW SI, 2(AX) - ADDQ $0x04, AX - JMP repeat_end_emit_encodeBetterBlockAsm4MB - -repeat_three_match_repeat_encodeBetterBlockAsm4MB: - LEAL -4(SI), SI - MOVW $0x0015, (AX) - MOVB SI, 2(AX) - ADDQ $0x03, AX - JMP repeat_end_emit_encodeBetterBlockAsm4MB - -repeat_two_match_repeat_encodeBetterBlockAsm4MB: - SHLL $0x02, SI - ORL $0x01, SI - MOVW SI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBetterBlockAsm4MB - -repeat_two_offset_match_repeat_encodeBetterBlockAsm4MB: - XORQ R8, R8 - LEAL 1(R8)(SI*4), SI - MOVB DI, 1(AX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, SI - MOVB SI, (AX) - ADDQ $0x02, AX - -repeat_end_emit_encodeBetterBlockAsm4MB: - MOVL CX, 12(SP) - JMP search_loop_encodeBetterBlockAsm4MB - -no_repeat_found_encodeBetterBlockAsm4MB: - CMPL (DX)(SI*1), DI +no_short_found_encodeBetterBlockAsm4MB: + CMPL R10, DI JEQ candidate_match_encodeBetterBlockAsm4MB - CMPL (DX)(R8*1), DI + CMPL R11, DI JEQ candidateS_match_encodeBetterBlockAsm4MB MOVL 20(SP), CX JMP search_loop_encodeBetterBlockAsm4MB @@ -7508,7 +6909,7 @@ candidateS_match_encodeBetterBlockAsm4MB: MOVQ DI, R10 SHLQ $0x08, R10 IMULQ R9, R10 - SHRQ $0x30, R10 + SHRQ $0x2f, R10 MOVL 24(SP)(R10*4), SI INCL CX MOVL CX, 24(SP)(R10*4) @@ -8232,10 +7633,10 @@ match_nolit_dst_ok_encodeBetterBlockAsm4MB: MOVQ -2(DX)(CX*1), R9 SHLQ $0x08, R10 IMULQ SI, R10 - SHRQ $0x30, R10 + SHRQ $0x2f, R10 SHLQ $0x08, R13 IMULQ SI, R13 - SHRQ $0x30, R13 + SHRQ $0x2f, R13 SHLQ $0x20, R11 IMULQ R8, R11 SHRQ $0x32, R11 @@ -8244,8 +7645,8 @@ match_nolit_dst_ok_encodeBetterBlockAsm4MB: SHRQ $0x32, R12 MOVL DI, 24(SP)(R10*4) MOVL R14, 24(SP)(R13*4) - MOVL R14, 262168(SP)(R11*4) - MOVL R15, 262168(SP)(R12*4) + MOVL R14, 524312(SP)(R11*4) + MOVL R15, 524312(SP)(R12*4) MOVQ R9, R10 MOVQ R9, R11 SHRQ $0x08, R11 @@ -8254,15 +7655,15 @@ match_nolit_dst_ok_encodeBetterBlockAsm4MB: LEAL -1(CX), DI SHLQ $0x08, R10 IMULQ SI, R10 - SHRQ $0x30, R10 + SHRQ $0x2f, R10 SHLQ $0x20, R11 IMULQ R8, R11 SHRQ $0x32, R11 SHLQ $0x08, R13 IMULQ SI, R13 - SHRQ $0x30, R13 + SHRQ $0x2f, R13 MOVL R9, 24(SP)(R10*4) - MOVL DI, 262168(SP)(R11*4) + MOVL DI, 524312(SP)(R11*4) MOVL DI, 24(SP)(R13*4) JMP search_loop_encodeBetterBlockAsm4MB @@ -8490,303 +7891,36 @@ search_loop_encodeBetterBlockAsm12B: MOVL 65560(SP)(R11*4), R8 MOVL CX, 24(SP)(R10*4) MOVL CX, 65560(SP)(R11*4) - CMPQ (DX)(SI*1), DI + MOVQ (DX)(SI*1), R10 + MOVQ (DX)(R8*1), R11 + CMPQ R10, DI JEQ candidate_match_encodeBetterBlockAsm12B - CMPL 16(SP), $0x00 - JEQ no_repeat_found_encodeBetterBlockAsm12B - MOVL CX, R10 - SUBL 16(SP), R10 - MOVQ (DX)(R10*1), R11 - MOVQ DI, R10 - MOVQ $0x00ffffffffffff00, R12 - ANDQ R12, R10 - ANDQ R12, R11 - CMPQ R10, R11 - JNE no_repeat_found_encodeBetterBlockAsm12B - LEAL 1(CX), DI - MOVL 12(SP), SI - MOVL DI, R8 - SUBL 16(SP), R8 - JZ repeat_extend_back_end_encodeBetterBlockAsm12B - -repeat_extend_back_loop_encodeBetterBlockAsm12B: - CMPL DI, SI - JLE repeat_extend_back_end_encodeBetterBlockAsm12B - MOVB -1(DX)(R8*1), BL - MOVB -1(DX)(DI*1), R9 - CMPB BL, R9 - JNE repeat_extend_back_end_encodeBetterBlockAsm12B - LEAL -1(DI), DI - DECL R8 - JNZ repeat_extend_back_loop_encodeBetterBlockAsm12B + CMPQ R11, DI + JNE no_short_found_encodeBetterBlockAsm12B + MOVL R8, SI + JMP candidate_match_encodeBetterBlockAsm12B -repeat_extend_back_end_encodeBetterBlockAsm12B: - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_repeat_emit_encodeBetterBlockAsm12B - MOVL DI, R8 - MOVL DI, 12(SP) - LEAQ (DX)(SI*1), R9 - SUBL SI, R8 - LEAL -1(R8), SI - CMPL SI, $0x3c - JLT one_byte_repeat_emit_encodeBetterBlockAsm12B - CMPL SI, $0x00000100 - JLT two_bytes_repeat_emit_encodeBetterBlockAsm12B - MOVB $0xf4, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_repeat_emit_encodeBetterBlockAsm12B +no_short_found_encodeBetterBlockAsm12B: + CMPL R10, DI + JEQ candidate_match_encodeBetterBlockAsm12B + CMPL R11, DI + JEQ candidateS_match_encodeBetterBlockAsm12B + MOVL 20(SP), CX + JMP search_loop_encodeBetterBlockAsm12B -two_bytes_repeat_emit_encodeBetterBlockAsm12B: - MOVB $0xf0, (AX) - MOVB SI, 1(AX) - ADDQ $0x02, AX - CMPL SI, $0x40 - JL memmove_repeat_emit_encodeBetterBlockAsm12B - JMP memmove_long_repeat_emit_encodeBetterBlockAsm12B - -one_byte_repeat_emit_encodeBetterBlockAsm12B: - SHLB $0x02, SI - MOVB SI, (AX) - ADDQ $0x01, AX - -memmove_repeat_emit_encodeBetterBlockAsm12B: - LEAQ (AX)(R8*1), SI - - // genMemMoveShort - CMPQ R8, $0x04 - JLE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm12B_memmove_move_4 - CMPQ R8, $0x08 - JB emit_lit_memmove_repeat_emit_encodeBetterBlockAsm12B_memmove_move_4through7 - CMPQ R8, $0x10 - JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm12B_memmove_move_8through16 - CMPQ R8, $0x20 - JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm12B_memmove_move_17through32 - JMP emit_lit_memmove_repeat_emit_encodeBetterBlockAsm12B_memmove_move_33through64 - -emit_lit_memmove_repeat_emit_encodeBetterBlockAsm12B_memmove_move_4: - MOVL (R9), R10 - MOVL R10, (AX) - JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm12B - -emit_lit_memmove_repeat_emit_encodeBetterBlockAsm12B_memmove_move_4through7: - MOVL (R9), R10 - MOVL -4(R9)(R8*1), R9 - MOVL R10, (AX) - MOVL R9, -4(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm12B - -emit_lit_memmove_repeat_emit_encodeBetterBlockAsm12B_memmove_move_8through16: - MOVQ (R9), R10 - MOVQ -8(R9)(R8*1), R9 - MOVQ R10, (AX) - MOVQ R9, -8(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm12B - -emit_lit_memmove_repeat_emit_encodeBetterBlockAsm12B_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(R8*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm12B - -emit_lit_memmove_repeat_emit_encodeBetterBlockAsm12B_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - -memmove_end_copy_repeat_emit_encodeBetterBlockAsm12B: - MOVQ SI, AX - JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm12B - -memmove_long_repeat_emit_encodeBetterBlockAsm12B: - LEAQ (AX)(R8*1), SI - - // genMemMoveLong - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVQ R8, R11 - SHRQ $0x05, R11 - MOVQ AX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R12 - SUBQ R10, R12 - DECQ R11 - JA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 - LEAQ -32(R9)(R12*1), R10 - LEAQ -32(AX)(R12*1), R13 - -emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm12Blarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R13) - MOVOA X5, 16(R13) - ADDQ $0x20, R13 - ADDQ $0x20, R10 - ADDQ $0x20, R12 - DECQ R11 - JNA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm12Blarge_big_loop_back - -emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32: - MOVOU -32(R9)(R12*1), X4 - MOVOU -16(R9)(R12*1), X5 - MOVOA X4, -32(AX)(R12*1) - MOVOA X5, -16(AX)(R12*1) - ADDQ $0x20, R12 - CMPQ R8, R12 - JAE emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ SI, AX - -emit_literal_done_repeat_emit_encodeBetterBlockAsm12B: - ADDL $0x07, CX - MOVL CX, SI - SUBL 16(SP), SI - MOVQ src_len+32(FP), R8 - SUBL CX, R8 - LEAQ (DX)(CX*1), R9 - LEAQ (DX)(SI*1), SI - - // matchLen - XORL R11, R11 - CMPL R8, $0x08 - JL matchlen_match4_repeat_extend_encodeBetterBlockAsm12B - -matchlen_loopback_repeat_extend_encodeBetterBlockAsm12B: - MOVQ (R9)(R11*1), R10 - XORQ (SI)(R11*1), R10 - TESTQ R10, R10 - JZ matchlen_loop_repeat_extend_encodeBetterBlockAsm12B - -#ifdef GOAMD64_v3 - TZCNTQ R10, R10 - -#else - BSFQ R10, R10 - -#endif - SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 - JMP repeat_extend_forward_end_encodeBetterBlockAsm12B - -matchlen_loop_repeat_extend_encodeBetterBlockAsm12B: - LEAL -8(R8), R8 - LEAL 8(R11), R11 - CMPL R8, $0x08 - JGE matchlen_loopback_repeat_extend_encodeBetterBlockAsm12B - JZ repeat_extend_forward_end_encodeBetterBlockAsm12B - -matchlen_match4_repeat_extend_encodeBetterBlockAsm12B: - CMPL R8, $0x04 - JL matchlen_match2_repeat_extend_encodeBetterBlockAsm12B - MOVL (R9)(R11*1), R10 - CMPL (SI)(R11*1), R10 - JNE matchlen_match2_repeat_extend_encodeBetterBlockAsm12B - SUBL $0x04, R8 - LEAL 4(R11), R11 - -matchlen_match2_repeat_extend_encodeBetterBlockAsm12B: - CMPL R8, $0x02 - JL matchlen_match1_repeat_extend_encodeBetterBlockAsm12B - MOVW (R9)(R11*1), R10 - CMPW (SI)(R11*1), R10 - JNE matchlen_match1_repeat_extend_encodeBetterBlockAsm12B - SUBL $0x02, R8 - LEAL 2(R11), R11 - -matchlen_match1_repeat_extend_encodeBetterBlockAsm12B: - CMPL R8, $0x01 - JL repeat_extend_forward_end_encodeBetterBlockAsm12B - MOVB (R9)(R11*1), R10 - CMPB (SI)(R11*1), R10 - JNE repeat_extend_forward_end_encodeBetterBlockAsm12B - LEAL 1(R11), R11 - -repeat_extend_forward_end_encodeBetterBlockAsm12B: - ADDL R11, CX - MOVL CX, SI - SUBL DI, SI - MOVL 16(SP), DI - - // emitRepeat - MOVL SI, R8 - LEAL -4(SI), SI - CMPL R8, $0x08 - JLE repeat_two_match_repeat_encodeBetterBlockAsm12B - CMPL R8, $0x0c - JGE cant_repeat_two_offset_match_repeat_encodeBetterBlockAsm12B - CMPL DI, $0x00000800 - JLT repeat_two_offset_match_repeat_encodeBetterBlockAsm12B - -cant_repeat_two_offset_match_repeat_encodeBetterBlockAsm12B: - CMPL SI, $0x00000104 - JLT repeat_three_match_repeat_encodeBetterBlockAsm12B - LEAL -256(SI), SI - MOVW $0x0019, (AX) - MOVW SI, 2(AX) - ADDQ $0x04, AX - JMP repeat_end_emit_encodeBetterBlockAsm12B - -repeat_three_match_repeat_encodeBetterBlockAsm12B: - LEAL -4(SI), SI - MOVW $0x0015, (AX) - MOVB SI, 2(AX) - ADDQ $0x03, AX - JMP repeat_end_emit_encodeBetterBlockAsm12B - -repeat_two_match_repeat_encodeBetterBlockAsm12B: - SHLL $0x02, SI - ORL $0x01, SI - MOVW SI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBetterBlockAsm12B - -repeat_two_offset_match_repeat_encodeBetterBlockAsm12B: - XORQ R8, R8 - LEAL 1(R8)(SI*4), SI - MOVB DI, 1(AX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, SI - MOVB SI, (AX) - ADDQ $0x02, AX - -repeat_end_emit_encodeBetterBlockAsm12B: - MOVL CX, 12(SP) - JMP search_loop_encodeBetterBlockAsm12B - -no_repeat_found_encodeBetterBlockAsm12B: - CMPL (DX)(SI*1), DI - JEQ candidate_match_encodeBetterBlockAsm12B - CMPL (DX)(R8*1), DI - JEQ candidateS_match_encodeBetterBlockAsm12B - MOVL 20(SP), CX - JMP search_loop_encodeBetterBlockAsm12B - -candidateS_match_encodeBetterBlockAsm12B: - SHRQ $0x08, DI - MOVQ DI, R10 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x32, R10 - MOVL 24(SP)(R10*4), SI - INCL CX - MOVL CX, 24(SP)(R10*4) - CMPL (DX)(SI*1), DI - JEQ candidate_match_encodeBetterBlockAsm12B - DECL CX - MOVL R8, SI +candidateS_match_encodeBetterBlockAsm12B: + SHRQ $0x08, DI + MOVQ DI, R10 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x32, R10 + MOVL 24(SP)(R10*4), SI + INCL CX + MOVL CX, 24(SP)(R10*4) + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeBetterBlockAsm12B + DECL CX + MOVL R8, SI candidate_match_encodeBetterBlockAsm12B: MOVL 12(SP), DI @@ -9585,304 +8719,37 @@ search_loop_encodeBetterBlockAsm10B: SUBL 12(SP), SI SHRL $0x05, SI LEAL 1(CX)(SI*1), SI - CMPL SI, 8(SP) - JGE emit_remainder_encodeBetterBlockAsm10B - MOVQ (DX)(CX*1), DI - MOVL SI, 20(SP) - MOVQ $0x0000cf1bbcdcbf9b, R9 - MOVQ $0x9e3779b1, SI - MOVQ DI, R10 - MOVQ DI, R11 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x34, R10 - SHLQ $0x20, R11 - IMULQ SI, R11 - SHRQ $0x36, R11 - MOVL 24(SP)(R10*4), SI - MOVL 16408(SP)(R11*4), R8 - MOVL CX, 24(SP)(R10*4) - MOVL CX, 16408(SP)(R11*4) - CMPQ (DX)(SI*1), DI - JEQ candidate_match_encodeBetterBlockAsm10B - CMPL 16(SP), $0x00 - JEQ no_repeat_found_encodeBetterBlockAsm10B - MOVL CX, R10 - SUBL 16(SP), R10 - MOVQ (DX)(R10*1), R11 - MOVQ DI, R10 - MOVQ $0x00ffffffffffff00, R12 - ANDQ R12, R10 - ANDQ R12, R11 - CMPQ R10, R11 - JNE no_repeat_found_encodeBetterBlockAsm10B - LEAL 1(CX), DI - MOVL 12(SP), SI - MOVL DI, R8 - SUBL 16(SP), R8 - JZ repeat_extend_back_end_encodeBetterBlockAsm10B - -repeat_extend_back_loop_encodeBetterBlockAsm10B: - CMPL DI, SI - JLE repeat_extend_back_end_encodeBetterBlockAsm10B - MOVB -1(DX)(R8*1), BL - MOVB -1(DX)(DI*1), R9 - CMPB BL, R9 - JNE repeat_extend_back_end_encodeBetterBlockAsm10B - LEAL -1(DI), DI - DECL R8 - JNZ repeat_extend_back_loop_encodeBetterBlockAsm10B - -repeat_extend_back_end_encodeBetterBlockAsm10B: - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_repeat_emit_encodeBetterBlockAsm10B - MOVL DI, R8 - MOVL DI, 12(SP) - LEAQ (DX)(SI*1), R9 - SUBL SI, R8 - LEAL -1(R8), SI - CMPL SI, $0x3c - JLT one_byte_repeat_emit_encodeBetterBlockAsm10B - CMPL SI, $0x00000100 - JLT two_bytes_repeat_emit_encodeBetterBlockAsm10B - MOVB $0xf4, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_repeat_emit_encodeBetterBlockAsm10B - -two_bytes_repeat_emit_encodeBetterBlockAsm10B: - MOVB $0xf0, (AX) - MOVB SI, 1(AX) - ADDQ $0x02, AX - CMPL SI, $0x40 - JL memmove_repeat_emit_encodeBetterBlockAsm10B - JMP memmove_long_repeat_emit_encodeBetterBlockAsm10B - -one_byte_repeat_emit_encodeBetterBlockAsm10B: - SHLB $0x02, SI - MOVB SI, (AX) - ADDQ $0x01, AX - -memmove_repeat_emit_encodeBetterBlockAsm10B: - LEAQ (AX)(R8*1), SI - - // genMemMoveShort - CMPQ R8, $0x04 - JLE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm10B_memmove_move_4 - CMPQ R8, $0x08 - JB emit_lit_memmove_repeat_emit_encodeBetterBlockAsm10B_memmove_move_4through7 - CMPQ R8, $0x10 - JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm10B_memmove_move_8through16 - CMPQ R8, $0x20 - JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm10B_memmove_move_17through32 - JMP emit_lit_memmove_repeat_emit_encodeBetterBlockAsm10B_memmove_move_33through64 - -emit_lit_memmove_repeat_emit_encodeBetterBlockAsm10B_memmove_move_4: - MOVL (R9), R10 - MOVL R10, (AX) - JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm10B - -emit_lit_memmove_repeat_emit_encodeBetterBlockAsm10B_memmove_move_4through7: - MOVL (R9), R10 - MOVL -4(R9)(R8*1), R9 - MOVL R10, (AX) - MOVL R9, -4(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm10B - -emit_lit_memmove_repeat_emit_encodeBetterBlockAsm10B_memmove_move_8through16: - MOVQ (R9), R10 - MOVQ -8(R9)(R8*1), R9 - MOVQ R10, (AX) - MOVQ R9, -8(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm10B - -emit_lit_memmove_repeat_emit_encodeBetterBlockAsm10B_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(R8*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm10B - -emit_lit_memmove_repeat_emit_encodeBetterBlockAsm10B_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - -memmove_end_copy_repeat_emit_encodeBetterBlockAsm10B: - MOVQ SI, AX - JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm10B - -memmove_long_repeat_emit_encodeBetterBlockAsm10B: - LEAQ (AX)(R8*1), SI - - // genMemMoveLong - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVQ R8, R11 - SHRQ $0x05, R11 - MOVQ AX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R12 - SUBQ R10, R12 - DECQ R11 - JA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 - LEAQ -32(R9)(R12*1), R10 - LEAQ -32(AX)(R12*1), R13 - -emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm10Blarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R13) - MOVOA X5, 16(R13) - ADDQ $0x20, R13 - ADDQ $0x20, R10 - ADDQ $0x20, R12 - DECQ R11 - JNA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm10Blarge_big_loop_back - -emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32: - MOVOU -32(R9)(R12*1), X4 - MOVOU -16(R9)(R12*1), X5 - MOVOA X4, -32(AX)(R12*1) - MOVOA X5, -16(AX)(R12*1) - ADDQ $0x20, R12 - CMPQ R8, R12 - JAE emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ SI, AX - -emit_literal_done_repeat_emit_encodeBetterBlockAsm10B: - ADDL $0x07, CX - MOVL CX, SI - SUBL 16(SP), SI - MOVQ src_len+32(FP), R8 - SUBL CX, R8 - LEAQ (DX)(CX*1), R9 - LEAQ (DX)(SI*1), SI - - // matchLen - XORL R11, R11 - CMPL R8, $0x08 - JL matchlen_match4_repeat_extend_encodeBetterBlockAsm10B - -matchlen_loopback_repeat_extend_encodeBetterBlockAsm10B: - MOVQ (R9)(R11*1), R10 - XORQ (SI)(R11*1), R10 - TESTQ R10, R10 - JZ matchlen_loop_repeat_extend_encodeBetterBlockAsm10B - -#ifdef GOAMD64_v3 - TZCNTQ R10, R10 - -#else - BSFQ R10, R10 - -#endif - SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 - JMP repeat_extend_forward_end_encodeBetterBlockAsm10B - -matchlen_loop_repeat_extend_encodeBetterBlockAsm10B: - LEAL -8(R8), R8 - LEAL 8(R11), R11 - CMPL R8, $0x08 - JGE matchlen_loopback_repeat_extend_encodeBetterBlockAsm10B - JZ repeat_extend_forward_end_encodeBetterBlockAsm10B - -matchlen_match4_repeat_extend_encodeBetterBlockAsm10B: - CMPL R8, $0x04 - JL matchlen_match2_repeat_extend_encodeBetterBlockAsm10B - MOVL (R9)(R11*1), R10 - CMPL (SI)(R11*1), R10 - JNE matchlen_match2_repeat_extend_encodeBetterBlockAsm10B - SUBL $0x04, R8 - LEAL 4(R11), R11 - -matchlen_match2_repeat_extend_encodeBetterBlockAsm10B: - CMPL R8, $0x02 - JL matchlen_match1_repeat_extend_encodeBetterBlockAsm10B - MOVW (R9)(R11*1), R10 - CMPW (SI)(R11*1), R10 - JNE matchlen_match1_repeat_extend_encodeBetterBlockAsm10B - SUBL $0x02, R8 - LEAL 2(R11), R11 - -matchlen_match1_repeat_extend_encodeBetterBlockAsm10B: - CMPL R8, $0x01 - JL repeat_extend_forward_end_encodeBetterBlockAsm10B - MOVB (R9)(R11*1), R10 - CMPB (SI)(R11*1), R10 - JNE repeat_extend_forward_end_encodeBetterBlockAsm10B - LEAL 1(R11), R11 - -repeat_extend_forward_end_encodeBetterBlockAsm10B: - ADDL R11, CX - MOVL CX, SI - SUBL DI, SI - MOVL 16(SP), DI - - // emitRepeat - MOVL SI, R8 - LEAL -4(SI), SI - CMPL R8, $0x08 - JLE repeat_two_match_repeat_encodeBetterBlockAsm10B - CMPL R8, $0x0c - JGE cant_repeat_two_offset_match_repeat_encodeBetterBlockAsm10B - CMPL DI, $0x00000800 - JLT repeat_two_offset_match_repeat_encodeBetterBlockAsm10B - -cant_repeat_two_offset_match_repeat_encodeBetterBlockAsm10B: - CMPL SI, $0x00000104 - JLT repeat_three_match_repeat_encodeBetterBlockAsm10B - LEAL -256(SI), SI - MOVW $0x0019, (AX) - MOVW SI, 2(AX) - ADDQ $0x04, AX - JMP repeat_end_emit_encodeBetterBlockAsm10B - -repeat_three_match_repeat_encodeBetterBlockAsm10B: - LEAL -4(SI), SI - MOVW $0x0015, (AX) - MOVB SI, 2(AX) - ADDQ $0x03, AX - JMP repeat_end_emit_encodeBetterBlockAsm10B - -repeat_two_match_repeat_encodeBetterBlockAsm10B: - SHLL $0x02, SI - ORL $0x01, SI - MOVW SI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBetterBlockAsm10B - -repeat_two_offset_match_repeat_encodeBetterBlockAsm10B: - XORQ R8, R8 - LEAL 1(R8)(SI*4), SI - MOVB DI, 1(AX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, SI - MOVB SI, (AX) - ADDQ $0x02, AX - -repeat_end_emit_encodeBetterBlockAsm10B: - MOVL CX, 12(SP) - JMP search_loop_encodeBetterBlockAsm10B + CMPL SI, 8(SP) + JGE emit_remainder_encodeBetterBlockAsm10B + MOVQ (DX)(CX*1), DI + MOVL SI, 20(SP) + MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ $0x9e3779b1, SI + MOVQ DI, R10 + MOVQ DI, R11 + SHLQ $0x10, R10 + IMULQ R9, R10 + SHRQ $0x34, R10 + SHLQ $0x20, R11 + IMULQ SI, R11 + SHRQ $0x36, R11 + MOVL 24(SP)(R10*4), SI + MOVL 16408(SP)(R11*4), R8 + MOVL CX, 24(SP)(R10*4) + MOVL CX, 16408(SP)(R11*4) + MOVQ (DX)(SI*1), R10 + MOVQ (DX)(R8*1), R11 + CMPQ R10, DI + JEQ candidate_match_encodeBetterBlockAsm10B + CMPQ R11, DI + JNE no_short_found_encodeBetterBlockAsm10B + MOVL R8, SI + JMP candidate_match_encodeBetterBlockAsm10B -no_repeat_found_encodeBetterBlockAsm10B: - CMPL (DX)(SI*1), DI +no_short_found_encodeBetterBlockAsm10B: + CMPL R10, DI JEQ candidate_match_encodeBetterBlockAsm10B - CMPL (DX)(R8*1), DI + CMPL R11, DI JEQ candidateS_match_encodeBetterBlockAsm10B MOVL 20(SP), CX JMP search_loop_encodeBetterBlockAsm10B @@ -10716,282 +9583,19 @@ search_loop_encodeBetterBlockAsm8B: MOVL 4120(SP)(R11*4), R8 MOVL CX, 24(SP)(R10*4) MOVL CX, 4120(SP)(R11*4) - CMPQ (DX)(SI*1), DI + MOVQ (DX)(SI*1), R10 + MOVQ (DX)(R8*1), R11 + CMPQ R10, DI JEQ candidate_match_encodeBetterBlockAsm8B - CMPL 16(SP), $0x00 - JEQ no_repeat_found_encodeBetterBlockAsm8B - MOVL CX, R10 - SUBL 16(SP), R10 - MOVQ (DX)(R10*1), R11 - MOVQ DI, R10 - MOVQ $0x00ffffffffffff00, R12 - ANDQ R12, R10 - ANDQ R12, R11 - CMPQ R10, R11 - JNE no_repeat_found_encodeBetterBlockAsm8B - LEAL 1(CX), DI - MOVL 12(SP), SI - MOVL DI, R8 - SUBL 16(SP), R8 - JZ repeat_extend_back_end_encodeBetterBlockAsm8B - -repeat_extend_back_loop_encodeBetterBlockAsm8B: - CMPL DI, SI - JLE repeat_extend_back_end_encodeBetterBlockAsm8B - MOVB -1(DX)(R8*1), BL - MOVB -1(DX)(DI*1), R9 - CMPB BL, R9 - JNE repeat_extend_back_end_encodeBetterBlockAsm8B - LEAL -1(DI), DI - DECL R8 - JNZ repeat_extend_back_loop_encodeBetterBlockAsm8B - -repeat_extend_back_end_encodeBetterBlockAsm8B: - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_repeat_emit_encodeBetterBlockAsm8B - MOVL DI, R8 - MOVL DI, 12(SP) - LEAQ (DX)(SI*1), R9 - SUBL SI, R8 - LEAL -1(R8), SI - CMPL SI, $0x3c - JLT one_byte_repeat_emit_encodeBetterBlockAsm8B - CMPL SI, $0x00000100 - JLT two_bytes_repeat_emit_encodeBetterBlockAsm8B - MOVB $0xf4, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_repeat_emit_encodeBetterBlockAsm8B - -two_bytes_repeat_emit_encodeBetterBlockAsm8B: - MOVB $0xf0, (AX) - MOVB SI, 1(AX) - ADDQ $0x02, AX - CMPL SI, $0x40 - JL memmove_repeat_emit_encodeBetterBlockAsm8B - JMP memmove_long_repeat_emit_encodeBetterBlockAsm8B - -one_byte_repeat_emit_encodeBetterBlockAsm8B: - SHLB $0x02, SI - MOVB SI, (AX) - ADDQ $0x01, AX - -memmove_repeat_emit_encodeBetterBlockAsm8B: - LEAQ (AX)(R8*1), SI - - // genMemMoveShort - CMPQ R8, $0x04 - JLE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm8B_memmove_move_4 - CMPQ R8, $0x08 - JB emit_lit_memmove_repeat_emit_encodeBetterBlockAsm8B_memmove_move_4through7 - CMPQ R8, $0x10 - JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm8B_memmove_move_8through16 - CMPQ R8, $0x20 - JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm8B_memmove_move_17through32 - JMP emit_lit_memmove_repeat_emit_encodeBetterBlockAsm8B_memmove_move_33through64 - -emit_lit_memmove_repeat_emit_encodeBetterBlockAsm8B_memmove_move_4: - MOVL (R9), R10 - MOVL R10, (AX) - JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm8B - -emit_lit_memmove_repeat_emit_encodeBetterBlockAsm8B_memmove_move_4through7: - MOVL (R9), R10 - MOVL -4(R9)(R8*1), R9 - MOVL R10, (AX) - MOVL R9, -4(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm8B - -emit_lit_memmove_repeat_emit_encodeBetterBlockAsm8B_memmove_move_8through16: - MOVQ (R9), R10 - MOVQ -8(R9)(R8*1), R9 - MOVQ R10, (AX) - MOVQ R9, -8(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm8B - -emit_lit_memmove_repeat_emit_encodeBetterBlockAsm8B_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(R8*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm8B - -emit_lit_memmove_repeat_emit_encodeBetterBlockAsm8B_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - -memmove_end_copy_repeat_emit_encodeBetterBlockAsm8B: - MOVQ SI, AX - JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm8B - -memmove_long_repeat_emit_encodeBetterBlockAsm8B: - LEAQ (AX)(R8*1), SI - - // genMemMoveLong - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVQ R8, R11 - SHRQ $0x05, R11 - MOVQ AX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R12 - SUBQ R10, R12 - DECQ R11 - JA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 - LEAQ -32(R9)(R12*1), R10 - LEAQ -32(AX)(R12*1), R13 - -emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm8Blarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R13) - MOVOA X5, 16(R13) - ADDQ $0x20, R13 - ADDQ $0x20, R10 - ADDQ $0x20, R12 - DECQ R11 - JNA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm8Blarge_big_loop_back - -emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32: - MOVOU -32(R9)(R12*1), X4 - MOVOU -16(R9)(R12*1), X5 - MOVOA X4, -32(AX)(R12*1) - MOVOA X5, -16(AX)(R12*1) - ADDQ $0x20, R12 - CMPQ R8, R12 - JAE emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ SI, AX - -emit_literal_done_repeat_emit_encodeBetterBlockAsm8B: - ADDL $0x07, CX - MOVL CX, SI - SUBL 16(SP), SI - MOVQ src_len+32(FP), R8 - SUBL CX, R8 - LEAQ (DX)(CX*1), R9 - LEAQ (DX)(SI*1), SI - - // matchLen - XORL R11, R11 - CMPL R8, $0x08 - JL matchlen_match4_repeat_extend_encodeBetterBlockAsm8B - -matchlen_loopback_repeat_extend_encodeBetterBlockAsm8B: - MOVQ (R9)(R11*1), R10 - XORQ (SI)(R11*1), R10 - TESTQ R10, R10 - JZ matchlen_loop_repeat_extend_encodeBetterBlockAsm8B - -#ifdef GOAMD64_v3 - TZCNTQ R10, R10 - -#else - BSFQ R10, R10 - -#endif - SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 - JMP repeat_extend_forward_end_encodeBetterBlockAsm8B - -matchlen_loop_repeat_extend_encodeBetterBlockAsm8B: - LEAL -8(R8), R8 - LEAL 8(R11), R11 - CMPL R8, $0x08 - JGE matchlen_loopback_repeat_extend_encodeBetterBlockAsm8B - JZ repeat_extend_forward_end_encodeBetterBlockAsm8B - -matchlen_match4_repeat_extend_encodeBetterBlockAsm8B: - CMPL R8, $0x04 - JL matchlen_match2_repeat_extend_encodeBetterBlockAsm8B - MOVL (R9)(R11*1), R10 - CMPL (SI)(R11*1), R10 - JNE matchlen_match2_repeat_extend_encodeBetterBlockAsm8B - SUBL $0x04, R8 - LEAL 4(R11), R11 - -matchlen_match2_repeat_extend_encodeBetterBlockAsm8B: - CMPL R8, $0x02 - JL matchlen_match1_repeat_extend_encodeBetterBlockAsm8B - MOVW (R9)(R11*1), R10 - CMPW (SI)(R11*1), R10 - JNE matchlen_match1_repeat_extend_encodeBetterBlockAsm8B - SUBL $0x02, R8 - LEAL 2(R11), R11 - -matchlen_match1_repeat_extend_encodeBetterBlockAsm8B: - CMPL R8, $0x01 - JL repeat_extend_forward_end_encodeBetterBlockAsm8B - MOVB (R9)(R11*1), R10 - CMPB (SI)(R11*1), R10 - JNE repeat_extend_forward_end_encodeBetterBlockAsm8B - LEAL 1(R11), R11 - -repeat_extend_forward_end_encodeBetterBlockAsm8B: - ADDL R11, CX - MOVL CX, SI - SUBL DI, SI - MOVL 16(SP), DI - - // emitRepeat - MOVL SI, DI - LEAL -4(SI), SI - CMPL DI, $0x08 - JLE repeat_two_match_repeat_encodeBetterBlockAsm8B - CMPL DI, $0x0c - JGE cant_repeat_two_offset_match_repeat_encodeBetterBlockAsm8B - -cant_repeat_two_offset_match_repeat_encodeBetterBlockAsm8B: - CMPL SI, $0x00000104 - JLT repeat_three_match_repeat_encodeBetterBlockAsm8B - LEAL -256(SI), SI - MOVW $0x0019, (AX) - MOVW SI, 2(AX) - ADDQ $0x04, AX - JMP repeat_end_emit_encodeBetterBlockAsm8B - -repeat_three_match_repeat_encodeBetterBlockAsm8B: - LEAL -4(SI), SI - MOVW $0x0015, (AX) - MOVB SI, 2(AX) - ADDQ $0x03, AX - JMP repeat_end_emit_encodeBetterBlockAsm8B - -repeat_two_match_repeat_encodeBetterBlockAsm8B: - SHLL $0x02, SI - ORL $0x01, SI - MOVW SI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBetterBlockAsm8B - XORQ R8, R8 - LEAL 1(R8)(SI*4), SI - MOVB DI, 1(AX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, SI - MOVB SI, (AX) - ADDQ $0x02, AX - -repeat_end_emit_encodeBetterBlockAsm8B: - MOVL CX, 12(SP) - JMP search_loop_encodeBetterBlockAsm8B + CMPQ R11, DI + JNE no_short_found_encodeBetterBlockAsm8B + MOVL R8, SI + JMP candidate_match_encodeBetterBlockAsm8B -no_repeat_found_encodeBetterBlockAsm8B: - CMPL (DX)(SI*1), DI +no_short_found_encodeBetterBlockAsm8B: + CMPL R10, DI JEQ candidate_match_encodeBetterBlockAsm8B - CMPL (DX)(R8*1), DI + CMPL R11, DI JEQ candidateS_match_encodeBetterBlockAsm8B MOVL 20(SP), CX JMP search_loop_encodeBetterBlockAsm8B @@ -15733,9 +14337,9 @@ emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B: // func encodeSnappyBetterBlockAsm(dst []byte, src []byte) int // Requires: BMI, SSE2 -TEXT ·encodeSnappyBetterBlockAsm(SB), $327704-56 +TEXT ·encodeSnappyBetterBlockAsm(SB), $589848-56 MOVQ dst_base+0(FP), AX - MOVQ $0x00000a00, CX + MOVQ $0x00001200, CX LEAQ 24(SP), DX PXOR X0, X0 @@ -15787,27 +14391,37 @@ check_maxskip_cont_encodeSnappyBetterBlockAsm: MOVQ DI, R11 SHLQ $0x08, R10 IMULQ R9, R10 - SHRQ $0x30, R10 + SHRQ $0x2f, R10 SHLQ $0x20, R11 IMULQ SI, R11 SHRQ $0x32, R11 MOVL 24(SP)(R10*4), SI - MOVL 262168(SP)(R11*4), R8 + MOVL 524312(SP)(R11*4), R8 MOVL CX, 24(SP)(R10*4) - MOVL CX, 262168(SP)(R11*4) - CMPL (DX)(SI*1), DI + MOVL CX, 524312(SP)(R11*4) + MOVQ (DX)(SI*1), R10 + MOVQ (DX)(R8*1), R11 + CMPQ R10, DI JEQ candidate_match_encodeSnappyBetterBlockAsm - CMPL (DX)(R8*1), DI - JEQ candidateS_match_encodeSnappyBetterBlockAsm - MOVL 20(SP), CX - JMP search_loop_encodeSnappyBetterBlockAsm + CMPQ R11, DI + JNE no_short_found_encodeSnappyBetterBlockAsm + MOVL R8, SI + JMP candidate_match_encodeSnappyBetterBlockAsm + +no_short_found_encodeSnappyBetterBlockAsm: + CMPL R10, DI + JEQ candidate_match_encodeSnappyBetterBlockAsm + CMPL R11, DI + JEQ candidateS_match_encodeSnappyBetterBlockAsm + MOVL 20(SP), CX + JMP search_loop_encodeSnappyBetterBlockAsm candidateS_match_encodeSnappyBetterBlockAsm: SHRQ $0x08, DI MOVQ DI, R10 SHLQ $0x08, R10 IMULQ R9, R10 - SHRQ $0x30, R10 + SHRQ $0x2f, R10 MOVL 24(SP)(R10*4), SI INCL CX MOVL CX, 24(SP)(R10*4) @@ -16144,10 +14758,10 @@ match_nolit_dst_ok_encodeSnappyBetterBlockAsm: MOVQ -2(DX)(CX*1), R9 SHLQ $0x08, R10 IMULQ SI, R10 - SHRQ $0x30, R10 + SHRQ $0x2f, R10 SHLQ $0x08, R13 IMULQ SI, R13 - SHRQ $0x30, R13 + SHRQ $0x2f, R13 SHLQ $0x20, R11 IMULQ R8, R11 SHRQ $0x32, R11 @@ -16156,8 +14770,8 @@ match_nolit_dst_ok_encodeSnappyBetterBlockAsm: SHRQ $0x32, R12 MOVL DI, 24(SP)(R10*4) MOVL R14, 24(SP)(R13*4) - MOVL R14, 262168(SP)(R11*4) - MOVL R15, 262168(SP)(R12*4) + MOVL R14, 524312(SP)(R11*4) + MOVL R15, 524312(SP)(R12*4) MOVQ R9, R10 MOVQ R9, R11 SHRQ $0x08, R11 @@ -16166,15 +14780,15 @@ match_nolit_dst_ok_encodeSnappyBetterBlockAsm: LEAL -1(CX), DI SHLQ $0x08, R10 IMULQ SI, R10 - SHRQ $0x30, R10 + SHRQ $0x2f, R10 SHLQ $0x20, R11 IMULQ R8, R11 SHRQ $0x32, R11 SHLQ $0x08, R13 IMULQ SI, R13 - SHRQ $0x30, R13 + SHRQ $0x2f, R13 MOVL R9, 24(SP)(R10*4) - MOVL DI, 262168(SP)(R11*4) + MOVL DI, 524312(SP)(R11*4) MOVL DI, 24(SP)(R13*4) JMP search_loop_encodeSnappyBetterBlockAsm @@ -16410,12 +15024,22 @@ search_loop_encodeSnappyBetterBlockAsm64K: MOVL 262168(SP)(R11*4), R8 MOVL CX, 24(SP)(R10*4) MOVL CX, 262168(SP)(R11*4) - CMPL (DX)(SI*1), DI + MOVQ (DX)(SI*1), R10 + MOVQ (DX)(R8*1), R11 + CMPQ R10, DI JEQ candidate_match_encodeSnappyBetterBlockAsm64K - CMPL (DX)(R8*1), DI - JEQ candidateS_match_encodeSnappyBetterBlockAsm64K - MOVL 20(SP), CX - JMP search_loop_encodeSnappyBetterBlockAsm64K + CMPQ R11, DI + JNE no_short_found_encodeSnappyBetterBlockAsm64K + MOVL R8, SI + JMP candidate_match_encodeSnappyBetterBlockAsm64K + +no_short_found_encodeSnappyBetterBlockAsm64K: + CMPL R10, DI + JEQ candidate_match_encodeSnappyBetterBlockAsm64K + CMPL R11, DI + JEQ candidateS_match_encodeSnappyBetterBlockAsm64K + MOVL 20(SP), CX + JMP search_loop_encodeSnappyBetterBlockAsm64K candidateS_match_encodeSnappyBetterBlockAsm64K: SHRQ $0x08, DI @@ -16954,12 +15578,22 @@ search_loop_encodeSnappyBetterBlockAsm12B: MOVL 65560(SP)(R11*4), R8 MOVL CX, 24(SP)(R10*4) MOVL CX, 65560(SP)(R11*4) - CMPL (DX)(SI*1), DI + MOVQ (DX)(SI*1), R10 + MOVQ (DX)(R8*1), R11 + CMPQ R10, DI JEQ candidate_match_encodeSnappyBetterBlockAsm12B - CMPL (DX)(R8*1), DI - JEQ candidateS_match_encodeSnappyBetterBlockAsm12B - MOVL 20(SP), CX - JMP search_loop_encodeSnappyBetterBlockAsm12B + CMPQ R11, DI + JNE no_short_found_encodeSnappyBetterBlockAsm12B + MOVL R8, SI + JMP candidate_match_encodeSnappyBetterBlockAsm12B + +no_short_found_encodeSnappyBetterBlockAsm12B: + CMPL R10, DI + JEQ candidate_match_encodeSnappyBetterBlockAsm12B + CMPL R11, DI + JEQ candidateS_match_encodeSnappyBetterBlockAsm12B + MOVL 20(SP), CX + JMP search_loop_encodeSnappyBetterBlockAsm12B candidateS_match_encodeSnappyBetterBlockAsm12B: SHRQ $0x08, DI @@ -17498,12 +16132,22 @@ search_loop_encodeSnappyBetterBlockAsm10B: MOVL 16408(SP)(R11*4), R8 MOVL CX, 24(SP)(R10*4) MOVL CX, 16408(SP)(R11*4) - CMPL (DX)(SI*1), DI + MOVQ (DX)(SI*1), R10 + MOVQ (DX)(R8*1), R11 + CMPQ R10, DI JEQ candidate_match_encodeSnappyBetterBlockAsm10B - CMPL (DX)(R8*1), DI - JEQ candidateS_match_encodeSnappyBetterBlockAsm10B - MOVL 20(SP), CX - JMP search_loop_encodeSnappyBetterBlockAsm10B + CMPQ R11, DI + JNE no_short_found_encodeSnappyBetterBlockAsm10B + MOVL R8, SI + JMP candidate_match_encodeSnappyBetterBlockAsm10B + +no_short_found_encodeSnappyBetterBlockAsm10B: + CMPL R10, DI + JEQ candidate_match_encodeSnappyBetterBlockAsm10B + CMPL R11, DI + JEQ candidateS_match_encodeSnappyBetterBlockAsm10B + MOVL 20(SP), CX + JMP search_loop_encodeSnappyBetterBlockAsm10B candidateS_match_encodeSnappyBetterBlockAsm10B: SHRQ $0x08, DI @@ -18042,12 +16686,22 @@ search_loop_encodeSnappyBetterBlockAsm8B: MOVL 4120(SP)(R11*4), R8 MOVL CX, 24(SP)(R10*4) MOVL CX, 4120(SP)(R11*4) - CMPL (DX)(SI*1), DI + MOVQ (DX)(SI*1), R10 + MOVQ (DX)(R8*1), R11 + CMPQ R10, DI JEQ candidate_match_encodeSnappyBetterBlockAsm8B - CMPL (DX)(R8*1), DI - JEQ candidateS_match_encodeSnappyBetterBlockAsm8B - MOVL 20(SP), CX - JMP search_loop_encodeSnappyBetterBlockAsm8B + CMPQ R11, DI + JNE no_short_found_encodeSnappyBetterBlockAsm8B + MOVL R8, SI + JMP candidate_match_encodeSnappyBetterBlockAsm8B + +no_short_found_encodeSnappyBetterBlockAsm8B: + CMPL R10, DI + JEQ candidate_match_encodeSnappyBetterBlockAsm8B + CMPL R11, DI + JEQ candidateS_match_encodeSnappyBetterBlockAsm8B + MOVL 20(SP), CX + JMP search_loop_encodeSnappyBetterBlockAsm8B candidateS_match_encodeSnappyBetterBlockAsm8B: SHRQ $0x08, DI From c330a559268adb35087692bacff84e265b572fae Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Wed, 29 Jun 2022 09:10:46 +0200 Subject: [PATCH 4/7] s2: Hash in matches (asm missing) --- s2/encode_better.go | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/s2/encode_better.go b/s2/encode_better.go index 889ffde9a5..e78c330307 100644 --- a/s2/encode_better.go +++ b/s2/encode_better.go @@ -60,7 +60,7 @@ func encodeBlockBetterGo(dst, src []byte) (d int) { maxLTableSize = 1 << lTableBits // Short hash matches. - sTableBits = 14 + sTableBits = 15 maxSTableSize = 1 << sTableBits ) @@ -245,16 +245,24 @@ func encodeBlockBetterGo(dst, src []byte) (d int) { // Index match end-2 (long) and end-1 (short) index1 := s - 2 - cv0 := load64(src, index0) - cv1 := load64(src, index1) cv = load64(src, s) - lTable[hash7(cv0, lTableBits)] = uint32(index0) - lTable[hash7(cv0>>8, lTableBits)] = uint32(index0 + 1) - lTable[hash7(cv1, lTableBits)] = uint32(index1) - lTable[hash7(cv1>>8, lTableBits)] = uint32(index1 + 1) - sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1) - sTable[hash4(cv0>>16, sTableBits)] = uint32(index0 + 2) - sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1) + for index0 < index1 { + cv0 := load64(src, index0) + cv1 := load64(src, index1) + lTable[hash7(cv0, lTableBits)] = uint32(index0) + cv0 >>= 8 + index0++ + lTable[hash7(cv0, lTableBits)] = uint32(index0) + sTable[hash4(cv0, sTableBits)] = uint32(index0) + + lTable[hash7(cv1, lTableBits)] = uint32(index1) + cv1 >>= 8 + index1++ + lTable[hash7(cv1, lTableBits)] = uint32(index1) + sTable[hash4(cv1, sTableBits)] = uint32(index1) + index0 += 1 // (effectively +=2) + index1 -= 4 // (effectively -=3) + } } emitRemainder: From d66b94acfa4323a063accd48b8dcd11323b54a99 Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Tue, 12 Jul 2022 11:24:40 +0200 Subject: [PATCH 5/7] Add hashes in repeats. Simplify hashing. --- s2/encode_better.go | 38 ++++++++++++++++++++++++-------------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/s2/encode_better.go b/s2/encode_better.go index e78c330307..81696922ef 100644 --- a/s2/encode_better.go +++ b/s2/encode_better.go @@ -60,7 +60,7 @@ func encodeBlockBetterGo(dst, src []byte) (d int) { maxLTableSize = 1 << lTableBits // Short hash matches. - sTableBits = 15 + sTableBits = 14 maxSTableSize = 1 << sTableBits ) @@ -116,7 +116,7 @@ func encodeBlockBetterGo(dst, src []byte) (d int) { // regressions significantly. const wantRepeatBytes = 6 const repeatMask = ((1 << (wantRepeatBytes * 8)) - 1) << (8 * checkRep) - if false && repeat > 0 && cv&repeatMask == load64(src, s-repeat)&repeatMask { + if true && repeat > 0 && cv&repeatMask == load64(src, s-repeat)&repeatMask { base := s + checkRep // Extend back for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; { @@ -150,6 +150,22 @@ func encodeBlockBetterGo(dst, src []byte) (d int) { if s >= sLimit { goto emitRemainder } + // Index in-between + index0 := base + 1 + index1 := s - 2 + + cv = load64(src, s) + for index0 < index1 { + cv0 := load64(src, index0) + cv1 := load64(src, index1) + lTable[hash7(cv0, lTableBits)] = uint32(index0) + sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1) + + lTable[hash7(cv1, lTableBits)] = uint32(index1) + sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1) + index0 += 2 + index1 -= 2 + } cv = load64(src, s) continue @@ -240,9 +256,9 @@ func encodeBlockBetterGo(dst, src []byte) (d int) { // Do we have space for more, if not bail. return 0 } - // Index match start+1 (long) and start+2 (short) + + // Index in-between index0 := base + 1 - // Index match end-2 (long) and end-1 (short) index1 := s - 2 cv = load64(src, s) @@ -250,18 +266,12 @@ func encodeBlockBetterGo(dst, src []byte) (d int) { cv0 := load64(src, index0) cv1 := load64(src, index1) lTable[hash7(cv0, lTableBits)] = uint32(index0) - cv0 >>= 8 - index0++ - lTable[hash7(cv0, lTableBits)] = uint32(index0) - sTable[hash4(cv0, sTableBits)] = uint32(index0) + sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1) lTable[hash7(cv1, lTableBits)] = uint32(index1) - cv1 >>= 8 - index1++ - lTable[hash7(cv1, lTableBits)] = uint32(index1) - sTable[hash4(cv1, sTableBits)] = uint32(index1) - index0 += 1 // (effectively +=2) - index1 -= 4 // (effectively -=3) + sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1) + index0 += 2 + index1 -= 2 } } From 8230d2e0c8a594cbeee510488317c06a12455c2d Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Thu, 8 Sep 2022 16:06:52 +0200 Subject: [PATCH 6/7] Add intermediate hashes to asm --- s2/_generate/gen.go | 50 ++- s2/encode_better.go | 46 ++- s2/encodeblock_amd64.s | 670 ++++++++++++++++++++--------------------- 3 files changed, 398 insertions(+), 368 deletions(-) diff --git a/s2/_generate/gen.go b/s2/_generate/gen.go index 577fef2fee..19120783ad 100644 --- a/s2/_generate/gen.go +++ b/s2/_generate/gen.go @@ -1354,11 +1354,57 @@ func (o options) genEncodeBetterBlockAsm(name string, lTableBits, sTableBits, sk } } Label("match_nolit_dst_ok_" + name) - // cv must be set to value at base+1 before arriving here if true { lHasher := hashN(lHashBytes, lTableBits) sHasher := hashN(sHashBytes, sTableBits) + index0, index1 := GP64(), GP64() + // index0 := base + 1 + LEAQ(Mem{Base: base, Disp: 1}, index0) + // index1 := s - 2 + LEAQ(Mem{Base: s, Disp: -2}, index1) + hash0l, hash0s, hash1l, hash1s := GP64(), GP64(), GP64(), GP64() + MOVQ(Mem{Base: src, Index: index0, Scale: 1, Disp: 0}, hash0l) + MOVQ(Mem{Base: src, Index: index0, Scale: 1, Disp: 1}, hash0s) + MOVQ(Mem{Base: src, Index: index1, Scale: 1, Disp: 0}, hash1l) + MOVQ(Mem{Base: src, Index: index1, Scale: 1, Disp: 1}, hash1s) + + lHasher.hash(hash0l) + sHasher.hash(hash0s) + lHasher.hash(hash1l) + sHasher.hash(hash1s) + + plusone0, plusone1 := GP64(), GP64() + LEAQ(Mem{Base: index0, Disp: 1}, plusone0) + LEAQ(Mem{Base: index1, Disp: 1}, plusone1) + MOVL(index0.As32(), lTab.Idx(hash0l, 4)) + MOVL(index1.As32(), lTab.Idx(hash1l, 4)) + MOVL(plusone0.As32(), sTab.Idx(hash0s, 4)) + MOVL(plusone1.As32(), sTab.Idx(hash1s, 4)) + + ADDQ(U8(1), index0) + SUBQ(U8(1), index1) + + Label("index_loop_" + name) + CMPQ(index0, index1) + JAE(LabelRef("search_loop_" + name)) + hash0l, hash1l = GP64(), GP64() + MOVQ(Mem{Base: src, Index: index0, Scale: 1, Disp: 0}, hash0l) + MOVQ(Mem{Base: src, Index: index1, Scale: 1, Disp: 0}, hash1l) + + lHasher.hash(hash0l) + lHasher.hash(hash1l) + + MOVL(index0.As32(), lTab.Idx(hash0l, 4)) + MOVL(index1.As32(), lTab.Idx(hash1l, 4)) + + ADDQ(U8(2), index0) + SUBQ(U8(2), index1) + JMP(LabelRef("index_loop_" + name)) + } else { + lHasher := hashN(lHashBytes, lTableBits) + sHasher := hashN(sHashBytes, sTableBits) + // Index base+1 long, base+2 short... cv := GP64() INCL(base) @@ -1428,8 +1474,8 @@ func (o options) genEncodeBetterBlockAsm(name string, lTableBits, sTableBits, sk MOVL(sm2, lTab.Idx(hash0, 4)) MOVL(sm1, sTab.Idx(hash1, 4)) MOVL(sm1, lTab.Idx(hash3, 4)) + JMP(LabelRef("search_loop_" + name)) } - JMP(LabelRef("search_loop_" + name)) Label("emit_remainder_" + name) // Bail if we exceed the maximum size. diff --git a/s2/encode_better.go b/s2/encode_better.go index c8da18c153..3b66ba42bf 100644 --- a/s2/encode_better.go +++ b/s2/encode_better.go @@ -117,7 +117,7 @@ func encodeBlockBetterGo(dst, src []byte) (d int) { // regressions significantly. const wantRepeatBytes = 6 const repeatMask = ((1 << (wantRepeatBytes * 8)) - 1) << (8 * checkRep) - if true && repeat > 0 && cv&repeatMask == load64(src, s-repeat)&repeatMask { + if false && repeat > 0 && cv&repeatMask == load64(src, s-repeat)&repeatMask { base := s + checkRep // Extend back for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; { @@ -258,19 +258,25 @@ func encodeBlockBetterGo(dst, src []byte) (d int) { return 0 } - // Index in-between + // Index short & long index0 := base + 1 index1 := s - 2 + cv0 := load64(src, index0) + cv1 := load64(src, index1) + lTable[hash7(cv0, lTableBits)] = uint32(index0) + sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1) + + lTable[hash7(cv1, lTableBits)] = uint32(index1) + sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1) + index0 += 1 + index1 -= 1 cv = load64(src, s) - for index0 < index1 { - cv0 := load64(src, index0) - cv1 := load64(src, index1) - lTable[hash7(cv0, lTableBits)] = uint32(index0) - sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1) - lTable[hash7(cv1, lTableBits)] = uint32(index1) - sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1) + // index every second long in between. + for index0 < index1 { + lTable[hash7(load64(src, index0), lTableBits)] = uint32(index0) + lTable[hash7(load64(src, index1), lTableBits)] = uint32(index1) index0 += 2 index1 -= 2 } @@ -435,21 +441,29 @@ func encodeBlockBetterSnappyGo(dst, src []byte) (d int) { // Do we have space for more, if not bail. return 0 } - // Index match start+1 (long) and start+2 (short) + + // Index short & long index0 := base + 1 - // Index match end-2 (long) and end-1 (short) index1 := s - 2 cv0 := load64(src, index0) cv1 := load64(src, index1) - cv = load64(src, s) lTable[hash7(cv0, lTableBits)] = uint32(index0) - lTable[hash7(cv0>>8, lTableBits)] = uint32(index0 + 1) - lTable[hash7(cv1, lTableBits)] = uint32(index1) - lTable[hash7(cv1>>8, lTableBits)] = uint32(index1 + 1) sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1) - sTable[hash4(cv0>>16, sTableBits)] = uint32(index0 + 2) + + lTable[hash7(cv1, lTableBits)] = uint32(index1) sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1) + index0 += 1 + index1 -= 1 + cv = load64(src, s) + + // index every second long in between. + for index0 < index1 { + lTable[hash7(load64(src, index0), lTableBits)] = uint32(index0) + lTable[hash7(load64(src, index1), lTableBits)] = uint32(index1) + index0 += 2 + index1 -= 2 + } } emitRemainder: diff --git a/s2/encodeblock_amd64.s b/s2/encodeblock_amd64.s index 8009706b88..81a487d6de 100644 --- a/s2/encodeblock_amd64.s +++ b/s2/encodeblock_amd64.s @@ -6599,52 +6599,49 @@ match_nolit_emitcopy_end_encodeBetterBlockAsm: match_nolit_dst_ok_encodeBetterBlockAsm: MOVQ $0x00cf1bbcdcbfa563, SI MOVQ $0x9e3779b1, R8 - INCL DI - MOVQ (DX)(DI*1), R9 - MOVQ R9, R10 - MOVQ R9, R11 - MOVQ R9, R12 - SHRQ $0x08, R11 - MOVQ R11, R13 - SHRQ $0x10, R12 - LEAL 1(DI), R14 - LEAL 2(DI), R15 - MOVQ -2(DX)(CX*1), R9 + LEAQ 1(DI), DI + LEAQ -2(CX), R9 + MOVQ (DX)(DI*1), R10 + MOVQ 1(DX)(DI*1), R11 + MOVQ (DX)(R9*1), R12 + MOVQ 1(DX)(R9*1), R13 SHLQ $0x08, R10 IMULQ SI, R10 SHRQ $0x2f, R10 - SHLQ $0x08, R13 - IMULQ SI, R13 - SHRQ $0x2f, R13 SHLQ $0x20, R11 IMULQ R8, R11 SHRQ $0x32, R11 - SHLQ $0x20, R12 - IMULQ R8, R12 - SHRQ $0x32, R12 + SHLQ $0x08, R12 + IMULQ SI, R12 + SHRQ $0x2f, R12 + SHLQ $0x20, R13 + IMULQ R8, R13 + SHRQ $0x32, R13 + LEAQ 1(DI), R8 + LEAQ 1(R9), R14 MOVL DI, 24(SP)(R10*4) - MOVL R14, 24(SP)(R13*4) - MOVL R14, 524312(SP)(R11*4) - MOVL R15, 524312(SP)(R12*4) - MOVQ R9, R10 - MOVQ R9, R11 - SHRQ $0x08, R11 - MOVQ R11, R13 - LEAL -2(CX), R9 - LEAL -1(CX), DI + MOVL R9, 24(SP)(R12*4) + MOVL R8, 524312(SP)(R11*4) + MOVL R14, 524312(SP)(R13*4) + ADDQ $0x01, DI + SUBQ $0x01, R9 + +index_loop_encodeBetterBlockAsm: + CMPQ DI, R9 + JAE search_loop_encodeBetterBlockAsm + MOVQ (DX)(DI*1), R8 + MOVQ (DX)(R9*1), R10 + SHLQ $0x08, R8 + IMULQ SI, R8 + SHRQ $0x2f, R8 SHLQ $0x08, R10 IMULQ SI, R10 SHRQ $0x2f, R10 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x32, R11 - SHLQ $0x08, R13 - IMULQ SI, R13 - SHRQ $0x2f, R13 + MOVL DI, 24(SP)(R8*4) MOVL R9, 24(SP)(R10*4) - MOVL DI, 524312(SP)(R11*4) - MOVL DI, 24(SP)(R13*4) - JMP search_loop_encodeBetterBlockAsm + ADDQ $0x02, DI + SUBQ $0x02, R9 + JMP index_loop_encodeBetterBlockAsm emit_remainder_encodeBetterBlockAsm: MOVQ src_len+32(FP), CX @@ -7619,52 +7616,49 @@ match_nolit_emitcopy_end_encodeBetterBlockAsm4MB: match_nolit_dst_ok_encodeBetterBlockAsm4MB: MOVQ $0x00cf1bbcdcbfa563, SI MOVQ $0x9e3779b1, R8 - INCL DI - MOVQ (DX)(DI*1), R9 - MOVQ R9, R10 - MOVQ R9, R11 - MOVQ R9, R12 - SHRQ $0x08, R11 - MOVQ R11, R13 - SHRQ $0x10, R12 - LEAL 1(DI), R14 - LEAL 2(DI), R15 - MOVQ -2(DX)(CX*1), R9 + LEAQ 1(DI), DI + LEAQ -2(CX), R9 + MOVQ (DX)(DI*1), R10 + MOVQ 1(DX)(DI*1), R11 + MOVQ (DX)(R9*1), R12 + MOVQ 1(DX)(R9*1), R13 SHLQ $0x08, R10 IMULQ SI, R10 SHRQ $0x2f, R10 - SHLQ $0x08, R13 - IMULQ SI, R13 - SHRQ $0x2f, R13 SHLQ $0x20, R11 IMULQ R8, R11 SHRQ $0x32, R11 - SHLQ $0x20, R12 - IMULQ R8, R12 - SHRQ $0x32, R12 + SHLQ $0x08, R12 + IMULQ SI, R12 + SHRQ $0x2f, R12 + SHLQ $0x20, R13 + IMULQ R8, R13 + SHRQ $0x32, R13 + LEAQ 1(DI), R8 + LEAQ 1(R9), R14 MOVL DI, 24(SP)(R10*4) - MOVL R14, 24(SP)(R13*4) - MOVL R14, 524312(SP)(R11*4) - MOVL R15, 524312(SP)(R12*4) - MOVQ R9, R10 - MOVQ R9, R11 - SHRQ $0x08, R11 - MOVQ R11, R13 - LEAL -2(CX), R9 - LEAL -1(CX), DI + MOVL R9, 24(SP)(R12*4) + MOVL R8, 524312(SP)(R11*4) + MOVL R14, 524312(SP)(R13*4) + ADDQ $0x01, DI + SUBQ $0x01, R9 + +index_loop_encodeBetterBlockAsm4MB: + CMPQ DI, R9 + JAE search_loop_encodeBetterBlockAsm4MB + MOVQ (DX)(DI*1), R8 + MOVQ (DX)(R9*1), R10 + SHLQ $0x08, R8 + IMULQ SI, R8 + SHRQ $0x2f, R8 SHLQ $0x08, R10 IMULQ SI, R10 SHRQ $0x2f, R10 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x32, R11 - SHLQ $0x08, R13 - IMULQ SI, R13 - SHRQ $0x2f, R13 + MOVL DI, 24(SP)(R8*4) MOVL R9, 24(SP)(R10*4) - MOVL DI, 524312(SP)(R11*4) - MOVL DI, 24(SP)(R13*4) - JMP search_loop_encodeBetterBlockAsm4MB + ADDQ $0x02, DI + SUBQ $0x02, R9 + JMP index_loop_encodeBetterBlockAsm4MB emit_remainder_encodeBetterBlockAsm4MB: MOVQ src_len+32(FP), CX @@ -8476,52 +8470,49 @@ match_nolit_emitcopy_end_encodeBetterBlockAsm12B: match_nolit_dst_ok_encodeBetterBlockAsm12B: MOVQ $0x0000cf1bbcdcbf9b, SI MOVQ $0x9e3779b1, R8 - INCL DI - MOVQ (DX)(DI*1), R9 - MOVQ R9, R10 - MOVQ R9, R11 - MOVQ R9, R12 - SHRQ $0x08, R11 - MOVQ R11, R13 - SHRQ $0x10, R12 - LEAL 1(DI), R14 - LEAL 2(DI), R15 - MOVQ -2(DX)(CX*1), R9 + LEAQ 1(DI), DI + LEAQ -2(CX), R9 + MOVQ (DX)(DI*1), R10 + MOVQ 1(DX)(DI*1), R11 + MOVQ (DX)(R9*1), R12 + MOVQ 1(DX)(R9*1), R13 SHLQ $0x10, R10 IMULQ SI, R10 SHRQ $0x32, R10 - SHLQ $0x10, R13 - IMULQ SI, R13 - SHRQ $0x32, R13 SHLQ $0x20, R11 IMULQ R8, R11 SHRQ $0x34, R11 - SHLQ $0x20, R12 - IMULQ R8, R12 - SHRQ $0x34, R12 + SHLQ $0x10, R12 + IMULQ SI, R12 + SHRQ $0x32, R12 + SHLQ $0x20, R13 + IMULQ R8, R13 + SHRQ $0x34, R13 + LEAQ 1(DI), R8 + LEAQ 1(R9), R14 MOVL DI, 24(SP)(R10*4) - MOVL R14, 24(SP)(R13*4) - MOVL R14, 65560(SP)(R11*4) - MOVL R15, 65560(SP)(R12*4) - MOVQ R9, R10 - MOVQ R9, R11 - SHRQ $0x08, R11 - MOVQ R11, R13 - LEAL -2(CX), R9 - LEAL -1(CX), DI + MOVL R9, 24(SP)(R12*4) + MOVL R8, 65560(SP)(R11*4) + MOVL R14, 65560(SP)(R13*4) + ADDQ $0x01, DI + SUBQ $0x01, R9 + +index_loop_encodeBetterBlockAsm12B: + CMPQ DI, R9 + JAE search_loop_encodeBetterBlockAsm12B + MOVQ (DX)(DI*1), R8 + MOVQ (DX)(R9*1), R10 + SHLQ $0x10, R8 + IMULQ SI, R8 + SHRQ $0x32, R8 SHLQ $0x10, R10 IMULQ SI, R10 SHRQ $0x32, R10 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x34, R11 - SHLQ $0x10, R13 - IMULQ SI, R13 - SHRQ $0x32, R13 + MOVL DI, 24(SP)(R8*4) MOVL R9, 24(SP)(R10*4) - MOVL DI, 65560(SP)(R11*4) - MOVL DI, 24(SP)(R13*4) - JMP search_loop_encodeBetterBlockAsm12B + ADDQ $0x02, DI + SUBQ $0x02, R9 + JMP index_loop_encodeBetterBlockAsm12B emit_remainder_encodeBetterBlockAsm12B: MOVQ src_len+32(FP), CX @@ -9322,52 +9313,49 @@ match_nolit_emitcopy_end_encodeBetterBlockAsm10B: match_nolit_dst_ok_encodeBetterBlockAsm10B: MOVQ $0x0000cf1bbcdcbf9b, SI MOVQ $0x9e3779b1, R8 - INCL DI - MOVQ (DX)(DI*1), R9 - MOVQ R9, R10 - MOVQ R9, R11 - MOVQ R9, R12 - SHRQ $0x08, R11 - MOVQ R11, R13 - SHRQ $0x10, R12 - LEAL 1(DI), R14 - LEAL 2(DI), R15 - MOVQ -2(DX)(CX*1), R9 + LEAQ 1(DI), DI + LEAQ -2(CX), R9 + MOVQ (DX)(DI*1), R10 + MOVQ 1(DX)(DI*1), R11 + MOVQ (DX)(R9*1), R12 + MOVQ 1(DX)(R9*1), R13 SHLQ $0x10, R10 IMULQ SI, R10 SHRQ $0x34, R10 - SHLQ $0x10, R13 - IMULQ SI, R13 - SHRQ $0x34, R13 SHLQ $0x20, R11 IMULQ R8, R11 SHRQ $0x36, R11 - SHLQ $0x20, R12 - IMULQ R8, R12 - SHRQ $0x36, R12 + SHLQ $0x10, R12 + IMULQ SI, R12 + SHRQ $0x34, R12 + SHLQ $0x20, R13 + IMULQ R8, R13 + SHRQ $0x36, R13 + LEAQ 1(DI), R8 + LEAQ 1(R9), R14 MOVL DI, 24(SP)(R10*4) - MOVL R14, 24(SP)(R13*4) - MOVL R14, 16408(SP)(R11*4) - MOVL R15, 16408(SP)(R12*4) - MOVQ R9, R10 - MOVQ R9, R11 - SHRQ $0x08, R11 - MOVQ R11, R13 - LEAL -2(CX), R9 - LEAL -1(CX), DI + MOVL R9, 24(SP)(R12*4) + MOVL R8, 16408(SP)(R11*4) + MOVL R14, 16408(SP)(R13*4) + ADDQ $0x01, DI + SUBQ $0x01, R9 + +index_loop_encodeBetterBlockAsm10B: + CMPQ DI, R9 + JAE search_loop_encodeBetterBlockAsm10B + MOVQ (DX)(DI*1), R8 + MOVQ (DX)(R9*1), R10 + SHLQ $0x10, R8 + IMULQ SI, R8 + SHRQ $0x34, R8 SHLQ $0x10, R10 IMULQ SI, R10 SHRQ $0x34, R10 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x36, R11 - SHLQ $0x10, R13 - IMULQ SI, R13 - SHRQ $0x34, R13 + MOVL DI, 24(SP)(R8*4) MOVL R9, 24(SP)(R10*4) - MOVL DI, 16408(SP)(R11*4) - MOVL DI, 24(SP)(R13*4) - JMP search_loop_encodeBetterBlockAsm10B + ADDQ $0x02, DI + SUBQ $0x02, R9 + JMP index_loop_encodeBetterBlockAsm10B emit_remainder_encodeBetterBlockAsm10B: MOVQ src_len+32(FP), CX @@ -10154,52 +10142,49 @@ match_nolit_emitcopy_end_encodeBetterBlockAsm8B: match_nolit_dst_ok_encodeBetterBlockAsm8B: MOVQ $0x0000cf1bbcdcbf9b, SI MOVQ $0x9e3779b1, R8 - INCL DI - MOVQ (DX)(DI*1), R9 - MOVQ R9, R10 - MOVQ R9, R11 - MOVQ R9, R12 - SHRQ $0x08, R11 - MOVQ R11, R13 - SHRQ $0x10, R12 - LEAL 1(DI), R14 - LEAL 2(DI), R15 - MOVQ -2(DX)(CX*1), R9 + LEAQ 1(DI), DI + LEAQ -2(CX), R9 + MOVQ (DX)(DI*1), R10 + MOVQ 1(DX)(DI*1), R11 + MOVQ (DX)(R9*1), R12 + MOVQ 1(DX)(R9*1), R13 SHLQ $0x10, R10 IMULQ SI, R10 SHRQ $0x36, R10 - SHLQ $0x10, R13 - IMULQ SI, R13 - SHRQ $0x36, R13 SHLQ $0x20, R11 IMULQ R8, R11 SHRQ $0x38, R11 - SHLQ $0x20, R12 - IMULQ R8, R12 - SHRQ $0x38, R12 + SHLQ $0x10, R12 + IMULQ SI, R12 + SHRQ $0x36, R12 + SHLQ $0x20, R13 + IMULQ R8, R13 + SHRQ $0x38, R13 + LEAQ 1(DI), R8 + LEAQ 1(R9), R14 MOVL DI, 24(SP)(R10*4) - MOVL R14, 24(SP)(R13*4) - MOVL R14, 4120(SP)(R11*4) - MOVL R15, 4120(SP)(R12*4) - MOVQ R9, R10 - MOVQ R9, R11 - SHRQ $0x08, R11 - MOVQ R11, R13 - LEAL -2(CX), R9 - LEAL -1(CX), DI + MOVL R9, 24(SP)(R12*4) + MOVL R8, 4120(SP)(R11*4) + MOVL R14, 4120(SP)(R13*4) + ADDQ $0x01, DI + SUBQ $0x01, R9 + +index_loop_encodeBetterBlockAsm8B: + CMPQ DI, R9 + JAE search_loop_encodeBetterBlockAsm8B + MOVQ (DX)(DI*1), R8 + MOVQ (DX)(R9*1), R10 + SHLQ $0x10, R8 + IMULQ SI, R8 + SHRQ $0x36, R8 SHLQ $0x10, R10 IMULQ SI, R10 SHRQ $0x36, R10 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x38, R11 - SHLQ $0x10, R13 - IMULQ SI, R13 - SHRQ $0x36, R13 + MOVL DI, 24(SP)(R8*4) MOVL R9, 24(SP)(R10*4) - MOVL DI, 4120(SP)(R11*4) - MOVL DI, 24(SP)(R13*4) - JMP search_loop_encodeBetterBlockAsm8B + ADDQ $0x02, DI + SUBQ $0x02, R9 + JMP index_loop_encodeBetterBlockAsm8B emit_remainder_encodeBetterBlockAsm8B: MOVQ src_len+32(FP), CX @@ -14744,52 +14729,49 @@ match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm: match_nolit_dst_ok_encodeSnappyBetterBlockAsm: MOVQ $0x00cf1bbcdcbfa563, SI MOVQ $0x9e3779b1, R8 - INCL DI - MOVQ (DX)(DI*1), R9 - MOVQ R9, R10 - MOVQ R9, R11 - MOVQ R9, R12 - SHRQ $0x08, R11 - MOVQ R11, R13 - SHRQ $0x10, R12 - LEAL 1(DI), R14 - LEAL 2(DI), R15 - MOVQ -2(DX)(CX*1), R9 + LEAQ 1(DI), DI + LEAQ -2(CX), R9 + MOVQ (DX)(DI*1), R10 + MOVQ 1(DX)(DI*1), R11 + MOVQ (DX)(R9*1), R12 + MOVQ 1(DX)(R9*1), R13 SHLQ $0x08, R10 IMULQ SI, R10 SHRQ $0x2f, R10 - SHLQ $0x08, R13 - IMULQ SI, R13 - SHRQ $0x2f, R13 SHLQ $0x20, R11 IMULQ R8, R11 SHRQ $0x32, R11 - SHLQ $0x20, R12 - IMULQ R8, R12 - SHRQ $0x32, R12 + SHLQ $0x08, R12 + IMULQ SI, R12 + SHRQ $0x2f, R12 + SHLQ $0x20, R13 + IMULQ R8, R13 + SHRQ $0x32, R13 + LEAQ 1(DI), R8 + LEAQ 1(R9), R14 MOVL DI, 24(SP)(R10*4) - MOVL R14, 24(SP)(R13*4) - MOVL R14, 524312(SP)(R11*4) - MOVL R15, 524312(SP)(R12*4) - MOVQ R9, R10 - MOVQ R9, R11 - SHRQ $0x08, R11 - MOVQ R11, R13 - LEAL -2(CX), R9 - LEAL -1(CX), DI + MOVL R9, 24(SP)(R12*4) + MOVL R8, 524312(SP)(R11*4) + MOVL R14, 524312(SP)(R13*4) + ADDQ $0x01, DI + SUBQ $0x01, R9 + +index_loop_encodeSnappyBetterBlockAsm: + CMPQ DI, R9 + JAE search_loop_encodeSnappyBetterBlockAsm + MOVQ (DX)(DI*1), R8 + MOVQ (DX)(R9*1), R10 + SHLQ $0x08, R8 + IMULQ SI, R8 + SHRQ $0x2f, R8 SHLQ $0x08, R10 IMULQ SI, R10 SHRQ $0x2f, R10 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x32, R11 - SHLQ $0x08, R13 - IMULQ SI, R13 - SHRQ $0x2f, R13 + MOVL DI, 24(SP)(R8*4) MOVL R9, 24(SP)(R10*4) - MOVL DI, 524312(SP)(R11*4) - MOVL DI, 24(SP)(R13*4) - JMP search_loop_encodeSnappyBetterBlockAsm + ADDQ $0x02, DI + SUBQ $0x02, R9 + JMP index_loop_encodeSnappyBetterBlockAsm emit_remainder_encodeSnappyBetterBlockAsm: MOVQ src_len+32(FP), CX @@ -15317,52 +15299,49 @@ match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm64K: match_nolit_dst_ok_encodeSnappyBetterBlockAsm64K: MOVQ $0x00cf1bbcdcbfa563, SI MOVQ $0x9e3779b1, R8 - INCL DI - MOVQ (DX)(DI*1), R9 - MOVQ R9, R10 - MOVQ R9, R11 - MOVQ R9, R12 - SHRQ $0x08, R11 - MOVQ R11, R13 - SHRQ $0x10, R12 - LEAL 1(DI), R14 - LEAL 2(DI), R15 - MOVQ -2(DX)(CX*1), R9 + LEAQ 1(DI), DI + LEAQ -2(CX), R9 + MOVQ (DX)(DI*1), R10 + MOVQ 1(DX)(DI*1), R11 + MOVQ (DX)(R9*1), R12 + MOVQ 1(DX)(R9*1), R13 SHLQ $0x08, R10 IMULQ SI, R10 SHRQ $0x30, R10 - SHLQ $0x08, R13 - IMULQ SI, R13 - SHRQ $0x30, R13 SHLQ $0x20, R11 IMULQ R8, R11 SHRQ $0x32, R11 - SHLQ $0x20, R12 - IMULQ R8, R12 - SHRQ $0x32, R12 + SHLQ $0x08, R12 + IMULQ SI, R12 + SHRQ $0x30, R12 + SHLQ $0x20, R13 + IMULQ R8, R13 + SHRQ $0x32, R13 + LEAQ 1(DI), R8 + LEAQ 1(R9), R14 MOVL DI, 24(SP)(R10*4) - MOVL R14, 24(SP)(R13*4) - MOVL R14, 262168(SP)(R11*4) - MOVL R15, 262168(SP)(R12*4) - MOVQ R9, R10 - MOVQ R9, R11 - SHRQ $0x08, R11 - MOVQ R11, R13 - LEAL -2(CX), R9 - LEAL -1(CX), DI + MOVL R9, 24(SP)(R12*4) + MOVL R8, 262168(SP)(R11*4) + MOVL R14, 262168(SP)(R13*4) + ADDQ $0x01, DI + SUBQ $0x01, R9 + +index_loop_encodeSnappyBetterBlockAsm64K: + CMPQ DI, R9 + JAE search_loop_encodeSnappyBetterBlockAsm64K + MOVQ (DX)(DI*1), R8 + MOVQ (DX)(R9*1), R10 + SHLQ $0x08, R8 + IMULQ SI, R8 + SHRQ $0x30, R8 SHLQ $0x08, R10 IMULQ SI, R10 SHRQ $0x30, R10 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x32, R11 - SHLQ $0x08, R13 - IMULQ SI, R13 - SHRQ $0x30, R13 + MOVL DI, 24(SP)(R8*4) MOVL R9, 24(SP)(R10*4) - MOVL DI, 262168(SP)(R11*4) - MOVL DI, 24(SP)(R13*4) - JMP search_loop_encodeSnappyBetterBlockAsm64K + ADDQ $0x02, DI + SUBQ $0x02, R9 + JMP index_loop_encodeSnappyBetterBlockAsm64K emit_remainder_encodeSnappyBetterBlockAsm64K: MOVQ src_len+32(FP), CX @@ -15871,52 +15850,49 @@ match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm12B: match_nolit_dst_ok_encodeSnappyBetterBlockAsm12B: MOVQ $0x0000cf1bbcdcbf9b, SI MOVQ $0x9e3779b1, R8 - INCL DI - MOVQ (DX)(DI*1), R9 - MOVQ R9, R10 - MOVQ R9, R11 - MOVQ R9, R12 - SHRQ $0x08, R11 - MOVQ R11, R13 - SHRQ $0x10, R12 - LEAL 1(DI), R14 - LEAL 2(DI), R15 - MOVQ -2(DX)(CX*1), R9 + LEAQ 1(DI), DI + LEAQ -2(CX), R9 + MOVQ (DX)(DI*1), R10 + MOVQ 1(DX)(DI*1), R11 + MOVQ (DX)(R9*1), R12 + MOVQ 1(DX)(R9*1), R13 SHLQ $0x10, R10 IMULQ SI, R10 SHRQ $0x32, R10 - SHLQ $0x10, R13 - IMULQ SI, R13 - SHRQ $0x32, R13 SHLQ $0x20, R11 IMULQ R8, R11 SHRQ $0x34, R11 - SHLQ $0x20, R12 - IMULQ R8, R12 - SHRQ $0x34, R12 + SHLQ $0x10, R12 + IMULQ SI, R12 + SHRQ $0x32, R12 + SHLQ $0x20, R13 + IMULQ R8, R13 + SHRQ $0x34, R13 + LEAQ 1(DI), R8 + LEAQ 1(R9), R14 MOVL DI, 24(SP)(R10*4) - MOVL R14, 24(SP)(R13*4) - MOVL R14, 65560(SP)(R11*4) - MOVL R15, 65560(SP)(R12*4) - MOVQ R9, R10 - MOVQ R9, R11 - SHRQ $0x08, R11 - MOVQ R11, R13 - LEAL -2(CX), R9 - LEAL -1(CX), DI + MOVL R9, 24(SP)(R12*4) + MOVL R8, 65560(SP)(R11*4) + MOVL R14, 65560(SP)(R13*4) + ADDQ $0x01, DI + SUBQ $0x01, R9 + +index_loop_encodeSnappyBetterBlockAsm12B: + CMPQ DI, R9 + JAE search_loop_encodeSnappyBetterBlockAsm12B + MOVQ (DX)(DI*1), R8 + MOVQ (DX)(R9*1), R10 + SHLQ $0x10, R8 + IMULQ SI, R8 + SHRQ $0x32, R8 SHLQ $0x10, R10 IMULQ SI, R10 SHRQ $0x32, R10 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x34, R11 - SHLQ $0x10, R13 - IMULQ SI, R13 - SHRQ $0x32, R13 + MOVL DI, 24(SP)(R8*4) MOVL R9, 24(SP)(R10*4) - MOVL DI, 65560(SP)(R11*4) - MOVL DI, 24(SP)(R13*4) - JMP search_loop_encodeSnappyBetterBlockAsm12B + ADDQ $0x02, DI + SUBQ $0x02, R9 + JMP index_loop_encodeSnappyBetterBlockAsm12B emit_remainder_encodeSnappyBetterBlockAsm12B: MOVQ src_len+32(FP), CX @@ -16425,52 +16401,49 @@ match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm10B: match_nolit_dst_ok_encodeSnappyBetterBlockAsm10B: MOVQ $0x0000cf1bbcdcbf9b, SI MOVQ $0x9e3779b1, R8 - INCL DI - MOVQ (DX)(DI*1), R9 - MOVQ R9, R10 - MOVQ R9, R11 - MOVQ R9, R12 - SHRQ $0x08, R11 - MOVQ R11, R13 - SHRQ $0x10, R12 - LEAL 1(DI), R14 - LEAL 2(DI), R15 - MOVQ -2(DX)(CX*1), R9 + LEAQ 1(DI), DI + LEAQ -2(CX), R9 + MOVQ (DX)(DI*1), R10 + MOVQ 1(DX)(DI*1), R11 + MOVQ (DX)(R9*1), R12 + MOVQ 1(DX)(R9*1), R13 SHLQ $0x10, R10 IMULQ SI, R10 SHRQ $0x34, R10 - SHLQ $0x10, R13 - IMULQ SI, R13 - SHRQ $0x34, R13 SHLQ $0x20, R11 IMULQ R8, R11 SHRQ $0x36, R11 - SHLQ $0x20, R12 - IMULQ R8, R12 - SHRQ $0x36, R12 + SHLQ $0x10, R12 + IMULQ SI, R12 + SHRQ $0x34, R12 + SHLQ $0x20, R13 + IMULQ R8, R13 + SHRQ $0x36, R13 + LEAQ 1(DI), R8 + LEAQ 1(R9), R14 MOVL DI, 24(SP)(R10*4) - MOVL R14, 24(SP)(R13*4) - MOVL R14, 16408(SP)(R11*4) - MOVL R15, 16408(SP)(R12*4) - MOVQ R9, R10 - MOVQ R9, R11 - SHRQ $0x08, R11 - MOVQ R11, R13 - LEAL -2(CX), R9 - LEAL -1(CX), DI + MOVL R9, 24(SP)(R12*4) + MOVL R8, 16408(SP)(R11*4) + MOVL R14, 16408(SP)(R13*4) + ADDQ $0x01, DI + SUBQ $0x01, R9 + +index_loop_encodeSnappyBetterBlockAsm10B: + CMPQ DI, R9 + JAE search_loop_encodeSnappyBetterBlockAsm10B + MOVQ (DX)(DI*1), R8 + MOVQ (DX)(R9*1), R10 + SHLQ $0x10, R8 + IMULQ SI, R8 + SHRQ $0x34, R8 SHLQ $0x10, R10 IMULQ SI, R10 SHRQ $0x34, R10 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x36, R11 - SHLQ $0x10, R13 - IMULQ SI, R13 - SHRQ $0x34, R13 + MOVL DI, 24(SP)(R8*4) MOVL R9, 24(SP)(R10*4) - MOVL DI, 16408(SP)(R11*4) - MOVL DI, 24(SP)(R13*4) - JMP search_loop_encodeSnappyBetterBlockAsm10B + ADDQ $0x02, DI + SUBQ $0x02, R9 + JMP index_loop_encodeSnappyBetterBlockAsm10B emit_remainder_encodeSnappyBetterBlockAsm10B: MOVQ src_len+32(FP), CX @@ -16977,52 +16950,49 @@ match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm8B: match_nolit_dst_ok_encodeSnappyBetterBlockAsm8B: MOVQ $0x0000cf1bbcdcbf9b, SI MOVQ $0x9e3779b1, R8 - INCL DI - MOVQ (DX)(DI*1), R9 - MOVQ R9, R10 - MOVQ R9, R11 - MOVQ R9, R12 - SHRQ $0x08, R11 - MOVQ R11, R13 - SHRQ $0x10, R12 - LEAL 1(DI), R14 - LEAL 2(DI), R15 - MOVQ -2(DX)(CX*1), R9 + LEAQ 1(DI), DI + LEAQ -2(CX), R9 + MOVQ (DX)(DI*1), R10 + MOVQ 1(DX)(DI*1), R11 + MOVQ (DX)(R9*1), R12 + MOVQ 1(DX)(R9*1), R13 SHLQ $0x10, R10 IMULQ SI, R10 SHRQ $0x36, R10 - SHLQ $0x10, R13 - IMULQ SI, R13 - SHRQ $0x36, R13 SHLQ $0x20, R11 IMULQ R8, R11 SHRQ $0x38, R11 - SHLQ $0x20, R12 - IMULQ R8, R12 - SHRQ $0x38, R12 + SHLQ $0x10, R12 + IMULQ SI, R12 + SHRQ $0x36, R12 + SHLQ $0x20, R13 + IMULQ R8, R13 + SHRQ $0x38, R13 + LEAQ 1(DI), R8 + LEAQ 1(R9), R14 MOVL DI, 24(SP)(R10*4) - MOVL R14, 24(SP)(R13*4) - MOVL R14, 4120(SP)(R11*4) - MOVL R15, 4120(SP)(R12*4) - MOVQ R9, R10 - MOVQ R9, R11 - SHRQ $0x08, R11 - MOVQ R11, R13 - LEAL -2(CX), R9 - LEAL -1(CX), DI + MOVL R9, 24(SP)(R12*4) + MOVL R8, 4120(SP)(R11*4) + MOVL R14, 4120(SP)(R13*4) + ADDQ $0x01, DI + SUBQ $0x01, R9 + +index_loop_encodeSnappyBetterBlockAsm8B: + CMPQ DI, R9 + JAE search_loop_encodeSnappyBetterBlockAsm8B + MOVQ (DX)(DI*1), R8 + MOVQ (DX)(R9*1), R10 + SHLQ $0x10, R8 + IMULQ SI, R8 + SHRQ $0x36, R8 SHLQ $0x10, R10 IMULQ SI, R10 SHRQ $0x36, R10 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x38, R11 - SHLQ $0x10, R13 - IMULQ SI, R13 - SHRQ $0x36, R13 + MOVL DI, 24(SP)(R8*4) MOVL R9, 24(SP)(R10*4) - MOVL DI, 4120(SP)(R11*4) - MOVL DI, 24(SP)(R13*4) - JMP search_loop_encodeSnappyBetterBlockAsm8B + ADDQ $0x02, DI + SUBQ $0x02, R9 + JMP index_loop_encodeSnappyBetterBlockAsm8B emit_remainder_encodeSnappyBetterBlockAsm8B: MOVQ src_len+32(FP), CX From 27954d245dfe9f1410d3ec261f61da07865918fd Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Thu, 8 Sep 2022 16:16:28 +0200 Subject: [PATCH 7/7] Update golden --- snappy/snappy_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/snappy/snappy_test.go b/snappy/snappy_test.go index 9cf1ad1d2d..bb51f0becb 100644 --- a/snappy/snappy_test.go +++ b/snappy/snappy_test.go @@ -665,7 +665,7 @@ func TestWriterGoldenOutput(t *testing.T) { "\x0d\x01", // Compressed payload: tagCopy1, length=7, offset=1. "\x08\x65\x66\x43", // Compressed payload: tagLiteral, length=3, "efC". "\x4e\x01\x00", // Compressed payload: tagCopy2, length=20, offset=1. - "\x4e\x58\x00", // Compressed payload: tagCopy2, length=20, offset=88. + "\x4e\x38\x00", // Compressed payload: tagCopy2, length=20, offset=56. "\x00\x67", // Compressed payload: tagLiteral, length=1, "g". }, "") if got != want {