Positions are now mostly correct

Still a few suboptimal ones:

––– string-bad-escape.toml –––––––––––––––––––––––––––––––––––––––––––
toml: error: invalid escape character 'a'; only the following escape characters are allowed: \b, \t, \n, \f, \r, \", \\, \uXXXX, and \UXXXXXXXX
             at line 1; start 18; length 24
             last key parsed was "invalid-escape"

     1 | invalid-escape = "This string has a bad \a escape character."
                           ^^^^^^^^^^^^^^^^^^^^^^^^

––– string-bad-multiline.toml ––––––––––––––––––––––––––––––––––––––––
toml: error: strings cannot contain newlines
             at line 1; start 7; length 0
             last key parsed was "multi"

     1 | multi = "first line
                ^

––– string-literal-multiline-quotes-1.toml –––––––––––––––––––––––––––
toml: error: unexpected "''''''"
             at line 1; start 7; length 20
             last key parsed was "a"

     1 | a = '''6 apostrophes: ''''''
                ^^^^^^^^^^^^^^^^^^^^

––– string-wrong-close.toml ––––––––––––––––––––––––––––––––––––––––––
toml: error: strings cannot contain newlines
             at line 1; start 18; length 0
             last key parsed was "bad-ending-quote"

     1 | bad-ending-quote = "double and single'
                           ^
BurntSushi · Jun 27, 2021 · 5a19592 · 5a19592
1 parent 9c821eb
commit 5a19592
Show file tree

Hide file tree

Showing 5 changed files with 153 additions and 102 deletions.
diff --git a/error.go b/error.go
@@ -9,19 +9,18 @@ import (
 //
 // For example invalid TOML syntax, duplicate keys, etc.
 type ParseError struct {
-	Message string
-	Line    int
-	Pos     int    // Byte offset
-	LastKey string // Last parsed key, may be blank.
-	Input   string
+	Message  string
+	Position Position
+	LastKey  string // Last parsed key, may be blank.
+	Input    string
 }
 
 func (pe ParseError) Error() string {
 	if pe.LastKey == "" {
-		return fmt.Sprintf("toml: line %d: %s", pe.Line, pe.Message)
+		return fmt.Sprintf("toml: %s: %s", pe.Position, pe.Message)
 	}
-	return fmt.Sprintf("toml: line %d (last key parsed '%s'): %s",
-		pe.Line, pe.LastKey, pe.Message)
+	return fmt.Sprintf("toml: %s (last key parsed '%s'): %s",
+		pe.Position, pe.LastKey, pe.Message)
 }
 
 // Clang error:
@@ -43,18 +42,26 @@ func (pe ParseError) Error() string {
 //
 // For more information about this error, try `rustc --explain E0425`.
 
+// ––– array-mixed-types-arrays-and-ints.toml –––––––––––––––––––––––––––
+// toml: error: Array contains values of type 'Integer' and 'Array', but arrays must be homogeneous.
+//              at line 1; column 1-15; byte offset 15
+//              last key parsed was "arrays-and-ints"
+//
+//      1 | arrays-and-ints =  [1, ["Arrays are not integers."]]
+//         ^^^^^^^^^^^^^^^
+//
+// This is on the key as the parser doesn't use the lex position.
 func (pe ParseError) ExtError() string {
 	if pe.Input == "" {
 		return pe.Error()
 	}
 
 	lines := strings.Split(pe.Input, "\n")
-	var line, pos, col int
+	var pos, col int
 	for i := range lines {
 		ll := len(lines[i]) + 1 // +1 for the removed newline
-		if pos+ll >= pe.Pos {
-			line = i
-			col = pe.Pos - pos - 1
+		if pos+ll >= pe.Position.Start {
+			col = pe.Position.Start - pos
 			if col < 0 { // Should never happen, but just in case.
 				col = 0
 			}
@@ -66,24 +73,32 @@ func (pe ParseError) ExtError() string {
 	b := new(strings.Builder)
 	//fmt.Fprintf(b, "toml: error on line %d: %s\n", line, pe.Message)
 	fmt.Fprintf(b, "toml: error: %s\n", pe.Message)
-	fmt.Fprintf(b, "             on line %d", line+1)
+	//fmt.Fprintf(b, "             on line %d", pe.Position.Line)
+	fmt.Fprintf(b, "             %s\n", pe.Position)
 	if pe.LastKey != "" {
-		fmt.Fprintf(b, "; last key parsed was %q", pe.LastKey)
+		fmt.Fprintf(b, "             last key parsed was %q", pe.LastKey)
 	}
 	b.WriteString("\n\n")
 
-	if line > 1 {
-		fmt.Fprintf(b, "% 6d | %s\n", line-1, lines[line-2])
+	if pe.Position.Line > 2 {
+		fmt.Fprintf(b, "% 6d | %s\n", pe.Position.Line-2, lines[pe.Position.Line-3])
 	}
-	if line > 0 {
-		fmt.Fprintf(b, "% 6d | %s\n", line, lines[line-1])
+	if pe.Position.Line > 1 {
+		fmt.Fprintf(b, "% 6d | %s\n", pe.Position.Line-1, lines[pe.Position.Line-2])
+	}
+
+	l := pe.Position.Len - 1
+	if l < 0 {
+		l = 0
 	}
 
-	fmt.Fprintf(b, "% 6d | %s\n", line+1, lines[line])
-	fmt.Fprintf(b, "% 9s%s^\n", "", strings.Repeat(" ", col))
+	fmt.Fprintf(b, "% 6d | %s\n", pe.Position.Line, lines[pe.Position.Line-1])
+	fmt.Fprintf(b, "% 9s%s%s\n", "",
+		strings.Repeat(" ", col),
+		strings.Repeat("^", l+1))
 
-	// if len(lines)-1 > line && lines[line+1] != "" {
-	// 	fmt.Fprintf(b, "% 6d | %s\n", line+1, lines[line+1])
+	// if len(lines)-1 > pe.Position.Line && lines[pe.Position.Line+1] != "" {
+	// 	fmt.Fprintf(b, "% 6d | %s\n", pe.Position.Line+1, lines[pe.Position.Line+1])
 	// }
 
 	return b.String()

diff --git a/error_test.go b/error_test.go
@@ -57,6 +57,9 @@ func TestParseError(t *testing.T) {
 		if !strings.HasSuffix(f.Name(), ".toml") {
 			continue
 		}
+		if f.Name() != "datetime-no-secs.toml" {
+			//continue
+		}
 
 		if f.Name() == "string-multiline-escape-space.toml" {
 			continue

diff --git a/lex.go b/lex.go
@@ -59,35 +59,43 @@ const (
 
 type stateFn func(lx *lexer) stateFn
 
+type Position struct {
+	Line  int // Line number, starting at 1.
+	Start int // Start of error, as byte offset starting at 0.
+	Len   int // Length in bytes; may be 0.
+}
+
+func (p Position) String() string {
+	return fmt.Sprintf("at line %d; start %d; length %d", p.Line, p.Start, p.Len)
+}
+
 type lexer struct {
-	input string
-	start int
-	pos   int
-	line  int
-	state stateFn
-	items chan item
-
-	// Allow for backing up up to four runes.
-	// This is necessary because TOML contains 3-rune tokens (""" and ''').
+	input   string
+	start   int
+	pos     int
+	itemPos Position
+	state   stateFn
+	items   chan item
+
+	// Allow for backing up up to four runes. This is necessary because TOML
+	// contains 3-rune tokens (""" and ''').
 	prevWidths [4]int
-	nprev      int // how many of prevWidths are in use
-	// If we emit an eof, we can still back up, but it is not OK to call
-	// next again.
-	atEOF bool
+	nprev      int  // how many of prevWidths are in use
+	atEOF      bool // If we emit an eof, we can still back up, but it is not OK to call next again.
 
 	// A stack of state functions used to maintain context.
-	// The idea is to reuse parts of the state machine in various places.
-	// For example, values can appear at the top level or within arbitrarily
-	// nested arrays. The last state on the stack is used after a value has
-	// been lexed. Similarly for comments.
+	//
+	// The idea is to reuse parts of the state machine in various places. For
+	// example, values can appear at the top level or within arbitrarily nested
+	// arrays. The last state on the stack is used after a value has been lexed.
+	// Similarly for comments.
 	stack []stateFn
 }
 
 type item struct {
-	typ  itemType
-	val  string
-	line int
-	pos  int
+	typ itemType
+	val string
+	pos Position
 }
 
 func (lx *lexer) nextItem() item {
@@ -97,18 +105,19 @@ func (lx *lexer) nextItem() item {
 			return item
 		default:
 			lx.state = lx.state(lx)
-			//fmt.Printf("     STATE %-24s   current: %-10q   stack: %s\n", lx.state, lx.current(), lx.stack)
+			//fmt.Printf("     STATE %-24s  current: %-10q   %d:%d-%d   stack: %s\n",
+			//	lx.state, lx.current(), lx.itemPos.Line, lx.itemPos.Start, lx.itemPos.Len, lx.stack)
 		}
 	}
 }
 
 func lex(input string) *lexer {
 	lx := &lexer{
-		input: input,
-		state: lexTop,
-		line:  1,
-		items: make(chan item, 10),
-		stack: make([]stateFn, 0, 10),
+		input:   input,
+		state:   lexTop,
+		itemPos: Position{Line: 1},
+		items:   make(chan item, 10),
+		stack:   make([]stateFn, 0, 10),
 	}
 	return lx
 }
@@ -131,12 +140,16 @@ func (lx *lexer) current() string {
 }
 
 func (lx *lexer) emit(typ itemType) {
-	lx.items <- item{typ, lx.current(), lx.line, lx.pos}
+	lx.itemPos.Start = lx.start
+	lx.itemPos.Len = lx.pos - lx.start
+	lx.items <- item{typ, lx.current(), lx.itemPos}
 	lx.start = lx.pos
 }
 
 func (lx *lexer) emitTrim(typ itemType) {
-	lx.items <- item{typ, strings.TrimSpace(lx.current()), lx.line, lx.pos}
+	lx.itemPos.Start = lx.start
+	lx.itemPos.Len = lx.pos - lx.start
+	lx.items <- item{typ, strings.TrimSpace(lx.current()), lx.itemPos}
 	lx.start = lx.pos
 }
 
@@ -150,7 +163,7 @@ func (lx *lexer) next() (r rune) {
 	}
 
 	if lx.input[lx.pos] == '\n' {
-		lx.line++
+		lx.itemPos.Line++
 	}
 	lx.prevWidths[3] = lx.prevWidths[2]
 	lx.prevWidths[2] = lx.prevWidths[1]
@@ -161,7 +174,7 @@ func (lx *lexer) next() (r rune) {
 
 	r, w := utf8.DecodeRuneInString(lx.input[lx.pos:])
 	if r == utf8.RuneError {
-		lx.errorf("invalid UTF-8 byte at position %d (line %d): 0x%02x", lx.pos, lx.line, lx.input[lx.pos])
+		lx.errorf("invalid UTF-8 byte at %d: 0x%02x", lx.itemPos, lx.input[lx.pos])
 		return utf8.RuneError
 	}
 
@@ -189,9 +202,10 @@ func (lx *lexer) backup() {
 	lx.prevWidths[1] = lx.prevWidths[2]
 	lx.prevWidths[2] = lx.prevWidths[3]
 	lx.nprev--
+
 	lx.pos -= w
 	if lx.pos < len(lx.input) && lx.input[lx.pos] == '\n' {
-		lx.line--
+		lx.itemPos.Line--
 	}
 }
 
@@ -228,7 +242,25 @@ func (lx *lexer) skip(pred func(rune) bool) {
 // Note that any value that is a character is escaped if it's a special
 // character (newlines, tabs, etc.).
 func (lx *lexer) errorf(format string, values ...interface{}) stateFn {
-	lx.items <- item{itemError, fmt.Sprintf(format, values...), lx.line, lx.pos}
+	if lx.atEOF {
+		return lx.errorfPrevline(format, values...)
+	}
+
+	lx.itemPos.Start = lx.start
+	lx.itemPos.Len = lx.pos - lx.start
+	lx.items <- item{itemError, fmt.Sprintf(format, values...), lx.itemPos}
+	return nil
+}
+
+// errorfPrevline is like errorf, but sets the position to the last column of
+// the previous line.
+//
+// This is so that unexpected EOF or NL errors don't show on a new blank line.
+func (lx *lexer) errorfPrevline(format string, values ...interface{}) stateFn {
+	lx.itemPos.Line--
+	lx.itemPos.Len = 0
+	lx.itemPos.Start++
+	lx.items <- item{itemError, fmt.Sprintf(format, values...), lx.itemPos}
 	return nil
 }
 
@@ -537,8 +569,7 @@ func lexArrayValue(lx *lexer) stateFn {
 // the next value (or the end of the array): it ignores whitespace and newlines
 // and expects either a ',' or a ']'.
 func lexArrayValueEnd(lx *lexer) stateFn {
-	r := lx.next()
-	switch {
+	switch r := lx.next(); {
 	case isWhitespace(r) || isNL(r):
 		return lexSkip(lx, lexArrayValueEnd)
 	case r == commentStart:
@@ -549,10 +580,11 @@ func lexArrayValueEnd(lx *lexer) stateFn {
 		return lexArrayValue // move on to the next value
 	case r == arrayEnd:
 		return lexArrayEnd
+	default:
+		return lx.errorf(
+			"expected a comma or array terminator %q, but got %s instead",
+			arrayEnd, runeOrEOF(r))
 	}
-	return lx.errorf(
-		"expected a comma or array terminator %q, but got %s instead",
-		arrayEnd, runeOrEOF(r))
 }
 
 // lexArrayEnd finishes the lexing of an array.
@@ -571,7 +603,7 @@ func lexInlineTableValue(lx *lexer) stateFn {
 	case isWhitespace(r):
 		return lexSkip(lx, lexInlineTableValue)
 	case isNL(r):
-		return lx.errorf("newlines not allowed within inline tables")
+		return lx.errorfPrevline("newlines not allowed within inline tables")
 	case r == commentStart:
 		lx.push(lexInlineTableValue)
 		return lexCommentStart
@@ -593,7 +625,7 @@ func lexInlineTableValueEnd(lx *lexer) stateFn {
 	case isWhitespace(r):
 		return lexSkip(lx, lexInlineTableValueEnd)
 	case isNL(r):
-		return lx.errorf("newlines not allowed within inline tables")
+		return lx.errorfPrevline("newlines not allowed within inline tables")
 	case r == commentStart:
 		lx.push(lexInlineTableValueEnd)
 		return lexCommentStart
@@ -638,7 +670,7 @@ func lexString(lx *lexer) stateFn {
 	case isControl(r) || r == '\r':
 		return lx.errorf("control characters are not allowed inside strings: '0x%02x'", r)
 	case isNL(r):
-		return lx.errorf("strings cannot contain newlines")
+		return lx.errorfPrevline("strings cannot contain newlines")
 	case r == '\\':
 		lx.push(lexString)
 		return lexStringEscape
@@ -714,7 +746,7 @@ func lexRawString(lx *lexer) stateFn {
 	case isControl(r) || r == '\r':
 		return lx.errorf("control characters are not allowed inside strings: '0x%02x'", r)
 	case isNL(r):
-		return lx.errorf("strings cannot contain newlines")
+		return lx.errorfPrevline("strings cannot contain newlines")
 	case r == rawStringEnd:
 		lx.backup()
 		lx.emit(itemRawString)