WIP: Introduce Fuzz(...) #205

Closed · wants to merge 1 commit
7 changes: 7 additions & 0 deletions api.go
@@ -17,3 +17,10 @@ type Parseable interface {
// Nil should be returned if parsing was successful.
Parse(lex *lexer.PeekingLexer) error
}

// The Fuzzable interface can be implemented by any element in the grammar to provide custom fuzzing.
type Fuzzable interface {
// Fuzz generates a valid string that, when parsed, produces a value for
// the corresponding node.
Fuzz(l lexer.Fuzzer) string
}
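For context, a grammar element would satisfy this much like Parseable or Capture are satisfied today. A rough sketch, purely illustrative (the Identifier type is invented here and its Parse side is omitted):

	package example

	import (
		"text/scanner"

		"github.com/alecthomas/participle/v2/lexer"
	)

	// Identifier is a hypothetical grammar element with custom fuzzing;
	// only the Fuzz side is shown, its Parse method is omitted.
	type Identifier struct {
		Name string
	}

	// Fuzz delegates to the lexer so the emitted text is a token this
	// element's own parsing logic would accept.
	func (i *Identifier) Fuzz(l lexer.Fuzzer) string {
		return l.Fuzz(scanner.Ident)
	}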
56 changes: 56 additions & 0 deletions fuzzer_test.go
@@ -0,0 +1,56 @@
package participle_test

import (
"math/rand"
"testing"
"time"

"github.com/alecthomas/participle/v2"
"github.com/alecthomas/participle/v2/lexer"
"github.com/alecthomas/repr"
)

func doFuzzTest(grammar interface{}, t *testing.T) {
parser := participle.MustBuild(grammar)

rand.Seed(0)

for i := 0; i < 5; i++ {
start := time.Now()
println("start fuzz")
data := parser.Fuzz(lexer.DefaultDefinition.(lexer.Fuzzer))
println("fuzz", (start.Sub(time.Now()).String()))

err := parser.ParseString("test", data, grammar)
if err != nil {
t.Fatalf("error parsing (%s): %s", repr.String(data), err)
}

println("parse", (start.Sub(time.Now()).String()))
}
}

func TestFuzz_LookAhead(t *testing.T) {
type val struct {
Str string ` @String`
Int int `| @Int`
}
type op struct {
Op string `@('+' | '*' (?= @Int))`
Operand val `@@`
}
type sum struct {
Left val `@@`
Ops []op `@@*`
}

doFuzzTest(&sum{}, t)
}

func TestFuzz_Disjunction(t *testing.T) {
type grammar struct {
Whatever string `'a' | @String | 'b'`
}

doFuzzTest(&grammar{}, t)
}
5 changes: 5 additions & 0 deletions lexer/api.go
@@ -45,6 +45,11 @@ type Lexer interface {
Next() (Token, error)
}

// A Fuzzer returns a random valid string for a given token type.
type Fuzzer interface {
Fuzz(t TokenType) string
}
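Any lexer definition can opt in by implementing this. A minimal stateless sketch (the staticFuzzer type and its fixed outputs are made up for illustration, not part of this PR):

	package lexer

	import "text/scanner"

	// staticFuzzer is a hypothetical Fuzzer that returns a fixed,
	// known-valid sample for each token type instead of a random one.
	type staticFuzzer struct{}

	func (staticFuzzer) Fuzz(t TokenType) string {
		switch t {
		case scanner.Int:
			return "42"
		case scanner.String:
			return `"hello"`
		default:
			// Treat anything else as a literal punctuation rune, as the
			// text/scanner-based implementation below does.
			return string(rune(t))
		}
	}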

// SymbolsByRune returns a map of lexer symbol names keyed by rune.
func SymbolsByRune(def Definition) map[TokenType]string {
symbols := def.Symbols()
102 changes: 102 additions & 0 deletions lexer/text_scanner.go
@@ -3,8 +3,12 @@ package lexer
import (
"bytes"
"io"
"math"
"math/rand"
"strconv"
"strings"
"text/scanner"
"unicode"
)

// TextScannerLexer is a lexer that uses the text/scanner module.
@@ -48,6 +52,104 @@ func (d *textScannerLexerDefinition) Symbols() map[string]TokenType {
}
}

func count16(rang unicode.Range16) int {
return int(((rang.Hi - rang.Lo) / rang.Stride) + 1)
}

func count32(rang unicode.Range32) int {
return int(((rang.Hi - rang.Lo) / rang.Stride) + 1)
}

func totalRunesInRange(tables []*unicode.RangeTable) int {
total := 0
for _, table := range tables {
for _, r16 := range table.R16 {
total += count16(r16)
}
for _, r32 := range table.R32 {
total += count32(r32)
}
}
return total
}

// nthRuneFromTables treats the tables as if they were concatenated into one
// contiguous range and returns the rune at index `at` within it.
func nthRuneFromTables(at int, tables []*unicode.RangeTable) (ret rune) {
n := at

for _, table := range tables {
for _, rang := range table.R16 {
num := count16(rang)
if n <= num-1 {
return rune(int(rang.Lo) + (int(rang.Stride) * n))
}
n -= num
}
for _, rang := range table.R32 {
num := count32(rang)
if n <= num-1 {
return rune(int(rang.Lo) + (int(rang.Stride) * n))
}
n -= num
}
}

return ' '
}

func randomRune(len int, tables ...*unicode.RangeTable) rune {
return nthRuneFromTables(
rand.Intn(len),
tables)
}

var cleaner = strings.NewReplacer(
"\x00", "",
)

var defaultTableCount = totalRunesInRange([]*unicode.RangeTable{unicode.Letter, unicode.Symbol, unicode.Number})
var letterTableCount = totalRunesInRange([]*unicode.RangeTable{unicode.Letter})
var letterNumberTableCount = totalRunesInRange([]*unicode.RangeTable{unicode.Letter, unicode.Number})

func randomString(length int, tableLength int, tables ...*unicode.RangeTable) string {
s := make([]rune, 0, length)

if len(tables) == 0 {
tables = append(tables, unicode.Letter, unicode.Symbol, unicode.Number)
}

for i := 0; i < length; i++ {
char := randomRune(tableLength, tables...)
s = append(s, char)
}

return cleaner.Replace(string(s))
}

func (d *textScannerLexerDefinition) Fuzz(t TokenType) string {
switch t {
case EOF:
return ""
case scanner.Char:
// A Char token is a quoted rune such as 'a'; emit a random letter in quotes.
return "'" + string(randomRune(letterTableCount, unicode.Letter)) + "'"
case scanner.Ident:
return string(randomRune(letterTableCount, unicode.Letter)) + randomString(rand.Intn(100), letterNumberTableCount, unicode.Letter, unicode.Number)
case scanner.Int:
return strconv.Itoa(rand.Int())
case scanner.Float:
return strconv.FormatFloat(rand.Float64(), 'f', -1, 64)
case scanner.String:
return `"` + strings.ReplaceAll(randomString(rand.Intn(50), defaultTableCount), "\n", " ") + `"`
case scanner.RawString:
// Strip any backticks from the random payload so the raw string stays well formed.
return "`" + strings.ReplaceAll(randomString(rand.Intn(50), defaultTableCount), "`", "") + "`"
case scanner.Comment:
return randomString(rand.Intn(50), defaultTableCount)
default:
return string(rune(t))
}
}

// textScannerLexer is a Lexer based on text/scanner.Scanner
type textScannerLexer struct {
scanner *scanner.Scanner
100 changes: 100 additions & 0 deletions nodes.go
@@ -4,6 +4,8 @@ import (
"encoding"
"errors"
"fmt"
"math"
"math/rand"
"reflect"
"strconv"
"strings"
@@ -34,6 +36,9 @@ type node interface {
// Returned slice will be nil if the node does not match.
Parse(ctx *parseContext, parent reflect.Value) ([]reflect.Value, error)

// Fuzz returns a random valid string that, when parsed, produces a value for this node.
Fuzz(l lexer.Fuzzer) string

// Return a decent string representation of the Node.
fmt.Stringer

@@ -72,6 +77,15 @@ func (p *parseable) Parse(ctx *parseContext, parent reflect.Value) (out []reflec
return []reflect.Value{rv.Elem()}, nil
}

func (p *parseable) Fuzz(l lexer.Fuzzer) string {
rv := reflect.New(p.t)
v, ok := rv.Interface().(Fuzzable)
if !ok {
panic(fmt.Sprintf("%s does not support fuzzing", p.t))
}
return v.Fuzz(l)
}

// @@
type strct struct {
typ reflect.Type
@@ -125,6 +139,10 @@ func (s *strct) Parse(ctx *parseContext, parent reflect.Value) (out []reflect.Va
return []reflect.Value{sv}, ctx.Apply()
}

func (s *strct) Fuzz(l lexer.Fuzzer) string {
return s.expr.Fuzz(l)
}

func (s *strct) maybeInjectStartToken(token lexer.Token, v reflect.Value) {
if s.posFieldIndex == nil {
return
@@ -184,6 +202,40 @@ type group struct {
mode groupMatchMode
}

func (g *group) Fuzz(l lexer.Fuzzer) string {
var (
maxCount int
minCount int
)

switch g.mode {
case groupMatchOnce:
minCount, maxCount = 1, 1
case groupMatchZeroOrOne:
minCount, maxCount = 0, 1
case groupMatchZeroOrMore:
minCount, maxCount = 0, math.MaxInt
case groupMatchOneOrMore:
minCount, maxCount = 1, math.MaxInt
case groupMatchNonEmpty:
minCount, maxCount = 1, 1
}

// Pick how many repetitions to emit, capping unbounded groups so the
// fuzzed output stays a manageable size.
span := maxCount - minCount
if span > 10 {
span = 10
}
items := minCount + rand.Intn(span+1)
var s strings.Builder
for i := 0; i < items; i++ {
s.WriteString(g.expr.Fuzz(l))
if i < items-1 {
s.WriteString(" ")
Owner:

This is a really interesting idea, but this implementation makes assumptions about whitespace that will not hold true for all lexers. For example, it is possible to have a lexer that relies on indentation, which this would break on.

I love the idea though, so perhaps there is a way to make it work in the general case.

Author:

Hm, how would you model this? I was trying to keep it stateless, but I'm not sure if I could avoid that and fix this problem.

Owner:

Yes, that is definitely the question, and I'm not sure what the answer is :(
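One way this could go, offered here only as a sketch and not as part of this PR (the SeparatingFuzzer interface, its Separator method, and JoinFuzz are all invented for illustration): instead of nodes writing a literal space, they could ask the lexer's Fuzzer how two fragments should be joined, so a whitespace- or indentation-sensitive lexer supplies its own glue.

	package lexer

	import "strings"

	// SeparatingFuzzer is a hypothetical extension of Fuzzer: the lexer
	// definition, not the grammar nodes, decides what goes between two
	// fuzzed fragments.
	type SeparatingFuzzer interface {
		Fuzzer
		// Separator returns the text to insert between adjacent fuzzed tokens.
		Separator() string
	}

	// JoinFuzz joins fuzzed fragments using the lexer's separator when it
	// provides one, falling back to a single space otherwise.
	func JoinFuzz(f Fuzzer, parts ...string) string {
		sep := " "
		if sf, ok := f.(SeparatingFuzzer); ok {
			sep = sf.Separator()
		}
		return strings.Join(parts, sep)
	}

That would keep the grammar nodes stateless, though a genuinely indentation-sensitive lexer would still need to track nesting depth somewhere, so it pushes the problem down a level rather than solving it.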

}
}
return s.String()
}
func (g *group) String() string { return ebnf(g) }
func (g *group) GoString() string { return fmt.Sprintf("group{%s}", g.mode) }
func (g *group) Parse(ctx *parseContext, parent reflect.Value) (out []reflect.Value, err error) {
@@ -254,6 +306,9 @@ type lookaheadGroup struct {

func (n *lookaheadGroup) String() string { return ebnf(n) }
func (n *lookaheadGroup) GoString() string { return "lookaheadGroup{}" }
func (n *lookaheadGroup) Fuzz(l lexer.Fuzzer) string {
return n.expr.Fuzz(l)
}

func (n *lookaheadGroup) Parse(ctx *parseContext, parent reflect.Value) (out []reflect.Value, err error) {
// Create a branch to avoid advancing the parser as any match will be discarded
@@ -278,6 +333,9 @@ type disjunction struct {

func (d *disjunction) String() string { return ebnf(d) }
func (d *disjunction) GoString() string { return "disjunction{}" }
func (d *disjunction) Fuzz(l lexer.Fuzzer) string {
return d.nodes[rand.Intn(len(d.nodes))].Fuzz(l)
}

func (d *disjunction) Parse(ctx *parseContext, parent reflect.Value) (out []reflect.Value, err error) {
var (
@@ -325,6 +383,16 @@ type sequence struct {

func (s *sequence) String() string { return ebnf(s) }
func (s *sequence) GoString() string { return "sequence{}" }
func (s *sequence) Fuzz(l lexer.Fuzzer) string {
var sb strings.Builder
for n := s; n != nil; n = n.next {
sb.WriteString(n.node.Fuzz(l))
if n.next != nil {
sb.WriteString(" ")
}
}
return sb.String()
}

func (s *sequence) Parse(ctx *parseContext, parent reflect.Value) (out []reflect.Value, err error) {
for n := s; n != nil; n = n.next {
@@ -356,6 +424,9 @@ type capture struct {

func (c *capture) String() string { return ebnf(c) }
func (c *capture) GoString() string { return "capture{}" }
func (c *capture) Fuzz(l lexer.Fuzzer) string {
return c.node.Fuzz(l)
}

func (c *capture) Parse(ctx *parseContext, parent reflect.Value) (out []reflect.Value, err error) {
start := ctx.RawCursor()
@@ -380,6 +451,9 @@ type reference struct {

func (r *reference) String() string { return ebnf(r) }
func (r *reference) GoString() string { return fmt.Sprintf("reference{%s}", r.identifier) }
func (r *reference) Fuzz(l lexer.Fuzzer) string {
return l.Fuzz(r.typ)
}

func (r *reference) Parse(ctx *parseContext, parent reflect.Value) (out []reflect.Value, err error) {
token, err := ctx.Peek(0)
@@ -400,6 +474,13 @@ type optional struct {

func (o *optional) String() string { return ebnf(o) }
func (o *optional) GoString() string { return "optional{}" }
func (o *optional) Fuzz(l lexer.Fuzzer) string {
// Emit the optional node roughly half of the time.
if rand.Intn(2) == 0 {
return ""
}
return o.node.Fuzz(l)
}

func (o *optional) Parse(ctx *parseContext, parent reflect.Value) (out []reflect.Value, err error) {
branch := ctx.Branch()
@@ -425,6 +506,19 @@ type repetition struct {

func (r *repetition) String() string { return ebnf(r) }
func (r *repetition) GoString() string { return "repetition{}" }
func (r *repetition) Fuzz(l lexer.Fuzzer) string {
var (
s strings.Builder
max = rand.Intn(100)
)
for i := 0; i < max; i++ {
s.WriteString(r.node.Fuzz(l))
if i < max-1 {
s.WriteString(" ")
}
}
return s.String()
}

// Parse a repetition. Once a repetition is encountered it will always match, so grammars
// should ensure that branches are differentiated prior to the repetition.
@@ -466,6 +560,9 @@ type literal struct {

func (l *literal) String() string { return ebnf(l) }
func (l *literal) GoString() string { return fmt.Sprintf("literal{%q, %q}", l.s, l.tt) }
func (lit *literal) Fuzz(l lexer.Fuzzer) string {
return lit.s
}

func (l *literal) Parse(ctx *parseContext, parent reflect.Value) (out []reflect.Value, err error) {
token, err := ctx.Peek(0)
@@ -494,6 +591,9 @@ type negation struct {

func (n *negation) String() string { return ebnf(n) }
func (n *negation) GoString() string { return "negation{}" }
func (n *negation) Fuzz(l lexer.Fuzzer) string {
panic("todo")
}

func (n *negation) Parse(ctx *parseContext, parent reflect.Value) (out []reflect.Value, err error) {
// Create a branch to avoid advancing the parser, but call neither Stop nor Accept on it