golang · cocotyty · Apr 22, 2024 · May 13, 2024 · May 17, 2024 · May 20, 2024
diff --git a/src/runtime/cpuflags_amd64.go b/src/runtime/cpuflags_amd64.go
@@ -8,17 +8,13 @@ import (
 	"internal/cpu"
 )
 
-var useAVXmemmove bool
+var (
+	useAVXmemmove bool // set in init from cpu.X86.HasAVX; memmove_amd64.s tests it before jumping to avxUnaligned
+	useERMS       bool // set in init from isIntel && cpu.X86.HasERMS; memmove_amd64.s tests it before its REP MOVSB path
+)
 
 func init() {
-	// Let's remove stepping and reserved fields
-	processor := processorVersionInfo & 0x0FFF3FF0
-
-	isIntelBridgeFamily := isIntel &&
-		processor == 0x206A0 ||
-		processor == 0x206D0 ||
-		processor == 0x306A0 ||
-		processor == 0x306E0
-
-	useAVXmemmove = cpu.X86.HasAVX && !isIntelBridgeFamily
+	isERMSNiceCPU := isIntel // gate the ERMS path on isIntel; when it is false useERMS stays false
+	useERMS = isERMSNiceCPU && cpu.X86.HasERMS
+	useAVXmemmove = cpu.X86.HasAVX // the AVX copy path is enabled whenever the CPU reports AVX
 }
diff --git a/src/runtime/memmove_amd64.s b/src/runtime/memmove_amd64.s
@@ -72,45 +72,43 @@ tail:
 	CMPQ	BX, $256
 	JBE	move_129through256
 
-	TESTB	$1, runtime·useAVXmemmove(SB)
-	JNZ	avxUnaligned
-
 /*
  * check and set for backwards
  */
 	CMPQ	SI, DI
 	JLS	back
 
 /*
  * forward copy loop
  */
 forward:
-	CMPQ	BX, $2048
-	JLS	move_256through2048
+	// REP MOVSB (ERMS) is slow when the destination (DI) is not 16-byte aligned, so skip it then.
+	TESTQ	$15, DI
+	JNZ	check_avx
+
+	TESTB	$1, runtime·useERMS(SB)
+	JNZ	erms
 
-	// If REP MOVSB isn't fast, don't use it
-	CMPB	internal∕cpu·X86+const_offsetX86HasERMS(SB), $1 // enhanced REP MOVSB/STOSB
-	JNE	fwdBy8
+check_avx:
+	TESTB	$1, runtime·useAVXmemmove(SB)
+	JNZ	avxUnaligned
 
-	// Check alignment
-	MOVL	SI, AX
-	ORL	DI, AX
-	TESTL	$7, AX
-	JEQ	fwdBy8
 
-	// Do 1 byte at a time
-	MOVQ	BX, CX
-	REP;	MOVSB
-	RET
+	CMPQ	BX, $2048
+	JLS	move_256through2048
 
-fwdBy8:
 	// Do 8 bytes at a time
 	MOVQ	BX, CX
 	SHRQ	$3, CX
 	ANDQ	$7, BX
 	REP;	MOVSQ
 	JMP	tail
 
+erms:
+	MOVQ	BX, CX
+	REP;	MOVSB
+	RET
+
 back:
 /*
  * check overlap
@@ -119,6 +117,9 @@ back:
 	ADDQ	BX, CX
 	CMPQ	CX, DI
 	JLS	forward
+
+	TESTB	$1, runtime·useAVXmemmove(SB)
+	JNZ	avxUnaligned
 /*
  * whole thing backwards has
  * adjusted addresses