skip UTF-8 BOM also (#381)

Co-authored-by: Martin Tournoij <martin@arp242.net>
BurntSushi · Jan 28, 2023 · 1a6ca6e
1 parent bd94408
commit 1a6ca6e
Show file tree

Hide file tree

Showing 2 changed files with 6 additions and 2 deletions.
diff --git a/decode_test.go b/decode_test.go
@@ -66,6 +66,7 @@ func TestDecodeBOM(t *testing.T) {
 	for _, tt := range [][]byte{
 		[]byte("\xff\xfea = \"b\""),
 		[]byte("\xfe\xffa = \"b\""),
+		[]byte("\xef\xbb\xbfa = \"b\""),
 	} {
 		t.Run("", func(t *testing.T) {
 			var s struct{ A string }

diff --git a/parse.go b/parse.go
@@ -47,9 +47,12 @@ func parse(data string) (p *parser, err error) {
 	}()
 
 	// Read over BOM; do this here as the lexer calls utf8.DecodeRuneInString()
-	// which mangles stuff.
-	if strings.HasPrefix(data, "\xff\xfe") || strings.HasPrefix(data, "\xfe\xff") {
+	// which mangles stuff. UTF-16 BOM isn't strictly valid, but some tools add
+	// it anyway.
+	if strings.HasPrefix(data, "\xff\xfe") || strings.HasPrefix(data, "\xfe\xff") { // UTF-16
 		data = data[2:]
+	} else if strings.HasPrefix(data, "\xef\xbb\xbf") { // UTF-8
+		data = data[3:]
 	}
 
 	// Examine first few bytes for NULL bytes; this probably means it's a UTF-16