Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

runtime: add ERMS-based memmove support for modern CPU platforms #66959

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
18 changes: 7 additions & 11 deletions src/runtime/cpuflags_amd64.go
Expand Up @@ -8,17 +8,13 @@ import (
"internal/cpu"
)

// Feature switches tested by the amd64 memmove assembly. Both are assigned
// once, in init. They are declared together in a single block so that each
// name is declared exactly once in the package.
var (
	// useAVXmemmove routes large copies to the avxUnaligned path.
	useAVXmemmove bool

	// useERMS routes large forward copies to the erms path (REP; MOVSB).
	useERMS bool
)

func init() {
// Let's remove stepping and reserved fields
processor := processorVersionInfo & 0x0FFF3FF0

isIntelBridgeFamily := isIntel &&
processor == 0x206A0 ||
processor == 0x206D0 ||
processor == 0x306A0 ||
processor == 0x306E0

useAVXmemmove = cpu.X86.HasAVX && !isIntelBridgeFamily
isERMSNiceCPU := isIntel
useERMS = isERMSNiceCPU && cpu.X86.HasERMS
useAVXmemmove = cpu.X86.HasAVX
}
41 changes: 21 additions & 20 deletions src/runtime/memmove_amd64.s
Expand Up @@ -72,45 +72,43 @@ tail:
CMPQ BX, $256
JBE move_129through256

TESTB $1, runtime·useAVXmemmove(SB)
JNZ avxUnaligned

/*
* check and set for backwards
*/
CMPQ SI, DI
JLS back

/*
* forward copy loop
*/
* forward copy loop
*/
forward:
CMPQ BX, $2048
JLS move_256through2048
// ERMS is slow if destination address is unaligned.
TESTQ $15, DI
JNZ check_avx

TESTB $1, runtime·useERMS(SB)
JNZ erms

// If REP MOVSB isn't fast, don't use it
CMPB internal∕cpu·X86+const_offsetX86HasERMS(SB), $1 // enhanced REP MOVSB/STOSB
JNE fwdBy8
check_avx:
TESTB $1, runtime·useAVXmemmove(SB)
JNZ avxUnaligned

// Check alignment
MOVL SI, AX
ORL DI, AX
TESTL $7, AX
JEQ fwdBy8

// Do 1 byte at a time
MOVQ BX, CX
REP; MOVSB
RET
CMPQ BX, $2048
JLS move_256through2048

fwdBy8:
// Do 8 bytes at a time
MOVQ BX, CX
SHRQ $3, CX
ANDQ $7, BX
REP; MOVSQ
JMP tail

erms:
MOVQ BX, CX
REP; MOVSB
RET

back:
/*
* check overlap
Expand All @@ -119,6 +117,9 @@ back:
ADDQ BX, CX
CMPQ CX, DI
JLS forward

TESTB $1, runtime·useAVXmemmove(SB)
JNZ avxUnaligned
/*
* whole thing backwards has
* adjusted addresses
Expand Down