Skip to content

Commit

Permalink
feat: improve Decoder string unescaping performance
Browse files Browse the repository at this point in the history
before
goos: darwin
goarch: arm64
pkg: benchmark
Benchmark_Decode_LargeSlice_EscapedString_GoJson/Unmarshal-12         	    2396	    501217 ns/op	  610443 B/op	   10005 allocs/op
Benchmark_Decode_LargeSlice_EscapedString_GoJson/Unmarshal-12         	    2370	    506517 ns/op	  635933 B/op	   10006 allocs/op
Benchmark_Decode_LargeSlice_EscapedString_GoJson/Unmarshal-12         	    2365	    500938 ns/op	  612078 B/op	   10005 allocs/op
Benchmark_Decode_LargeSlice_EscapedString_GoJson/Unmarshal-12         	    2216	    508809 ns/op	  643572 B/op	   10006 allocs/op
Benchmark_Decode_LargeSlice_EscapedString_GoJson/Unmarshal-12         	    2354	    505128 ns/op	  621201 B/op	   10005 allocs/op
Benchmark_Decode_LargeSlice_EscapedString_GoJson/Unmarshal-12         	    2235	    506122 ns/op	  631196 B/op	   10006 allocs/op
Benchmark_Decode_LargeSlice_EscapedString_GoJson/Unmarshal-12         	    2334	    501138 ns/op	  611282 B/op	   10005 allocs/op
Benchmark_Decode_LargeSlice_EscapedString_GoJson/Unmarshal-12         	    2360	    501009 ns/op	  609720 B/op	   10005 allocs/op
Benchmark_Decode_LargeSlice_EscapedString_GoJson/Unmarshal-12         	    2343	    505980 ns/op	  608884 B/op	   10005 allocs/op
Benchmark_Decode_LargeSlice_EscapedString_GoJson/Unmarshal-12         	    2362	    504268 ns/op	  614357 B/op	   10005 allocs/op
Benchmark_Decode_LargeSlice_EscapedString_GoJson/Decode-12            	      94	  12573430 ns/op	  586036 B/op	   10014 allocs/op
Benchmark_Decode_LargeSlice_EscapedString_GoJson/Decode-12            	      96	  12545696 ns/op	  586047 B/op	   10014 allocs/op
Benchmark_Decode_LargeSlice_EscapedString_GoJson/Decode-12            	      93	  12660176 ns/op	  591675 B/op	   10014 allocs/op
Benchmark_Decode_LargeSlice_EscapedString_GoJson/Decode-12            	      85	  12624620 ns/op	  586036 B/op	   10014 allocs/op
Benchmark_Decode_LargeSlice_EscapedString_GoJson/Decode-12            	      94	  12561482 ns/op	  591631 B/op	   10014 allocs/op
Benchmark_Decode_LargeSlice_EscapedString_GoJson/Decode-12            	      94	  12597026 ns/op	  586034 B/op	   10014 allocs/op
Benchmark_Decode_LargeSlice_EscapedString_GoJson/Decode-12            	      93	  12598319 ns/op	  586040 B/op	   10014 allocs/op
Benchmark_Decode_LargeSlice_EscapedString_GoJson/Decode-12            	      93	  12613099 ns/op	  586039 B/op	   10014 allocs/op
Benchmark_Decode_LargeSlice_EscapedString_GoJson/Decode-12            	      94	  12589107 ns/op	  597210 B/op	   10014 allocs/op
Benchmark_Decode_LargeSlice_EscapedString_GoJson/Decode-12            	      96	  12599589 ns/op	  586044 B/op	   10014 allocs/op

after
goos: darwin
goarch: arm64
pkg: benchmark
Benchmark_Decode_LargeSlice_EscapedString_GoJson/Unmarshal-12         	    2396	    500616 ns/op	  615926 B/op	   10005 allocs/op
Benchmark_Decode_LargeSlice_EscapedString_GoJson/Unmarshal-12         	    2367	    508130 ns/op	  632242 B/op	   10006 allocs/op
Benchmark_Decode_LargeSlice_EscapedString_GoJson/Unmarshal-12         	    2360	    507290 ns/op	  622408 B/op	   10005 allocs/op
Benchmark_Decode_LargeSlice_EscapedString_GoJson/Unmarshal-12         	    2344	    503449 ns/op	  612675 B/op	   10005 allocs/op
Benchmark_Decode_LargeSlice_EscapedString_GoJson/Unmarshal-12         	    2325	    505496 ns/op	  617994 B/op	   10005 allocs/op
Benchmark_Decode_LargeSlice_EscapedString_GoJson/Unmarshal-12         	    2314	    504651 ns/op	  611184 B/op	   10005 allocs/op
Benchmark_Decode_LargeSlice_EscapedString_GoJson/Unmarshal-12         	    2290	    507516 ns/op	  615741 B/op	   10005 allocs/op
Benchmark_Decode_LargeSlice_EscapedString_GoJson/Unmarshal-12         	    2355	    514382 ns/op	  618502 B/op	   10005 allocs/op
Benchmark_Decode_LargeSlice_EscapedString_GoJson/Unmarshal-12         	    2136	    509533 ns/op	  617787 B/op	   10005 allocs/op
Benchmark_Decode_LargeSlice_EscapedString_GoJson/Unmarshal-12         	    2216	    507166 ns/op	  612041 B/op	   10005 allocs/op
Benchmark_Decode_LargeSlice_EscapedString_GoJson/Decode-12            	    2430	    489476 ns/op	  586677 B/op	   10014 allocs/op
Benchmark_Decode_LargeSlice_EscapedString_GoJson/Decode-12            	    2415	    527139 ns/op	  588856 B/op	   10014 allocs/op
Benchmark_Decode_LargeSlice_EscapedString_GoJson/Decode-12            	    2358	    489797 ns/op	  586700 B/op	   10014 allocs/op
Benchmark_Decode_LargeSlice_EscapedString_GoJson/Decode-12            	    2437	    487946 ns/op	  586247 B/op	   10014 allocs/op
Benchmark_Decode_LargeSlice_EscapedString_GoJson/Decode-12            	    2415	    488028 ns/op	  586901 B/op	   10014 allocs/op
Benchmark_Decode_LargeSlice_EscapedString_GoJson/Decode-12            	    2430	    490985 ns/op	  586895 B/op	   10014 allocs/op
Benchmark_Decode_LargeSlice_EscapedString_GoJson/Decode-12            	    2421	    488670 ns/op	  586898 B/op	   10014 allocs/op
Benchmark_Decode_LargeSlice_EscapedString_GoJson/Decode-12            	    2419	    487473 ns/op	  586249 B/op	   10014 allocs/op
Benchmark_Decode_LargeSlice_EscapedString_GoJson/Decode-12            	    2406	    489513 ns/op	  586686 B/op	   10014 allocs/op
Benchmark_Decode_LargeSlice_EscapedString_GoJson/Decode-12            	    2380	    493454 ns/op	  586254 B/op	   10014 allocs/op
  • Loading branch information
orisano committed Mar 25, 2023
1 parent 6f969b6 commit 02e2b6b
Show file tree
Hide file tree
Showing 6 changed files with 531 additions and 321 deletions.
25 changes: 18 additions & 7 deletions benchmarks/decode_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -479,11 +479,22 @@ func Benchmark_Decode_LargeStruct_Stream_GoJsonFirstWinMode(b *testing.B) {
}

// Benchmark_Decode_LargeSlice_EscapedString_GoJson measures decoding
// LargeSliceEscapedString into a []string with gojson, both through
// gojson.Unmarshal and through gojson.NewDecoder(...).Decode.
// NOTE(review): this view shows a top-level b.N loop followed by two b.Run
// sub-benchmarks; confirm against the real file which of them are present.
func Benchmark_Decode_LargeSlice_EscapedString_GoJson(b *testing.B) {
b.ReportAllocs()
for i := 0; i < b.N; i++ {
var v []string
if err := gojson.Unmarshal(LargeSliceEscapedString, &v); err != nil {
b.Fatal(err)
}
}
// "Unmarshal": decode straight from the in-memory byte slice.
b.Run("Unmarshal", func(b *testing.B) {
b.ReportAllocs()
for i := 0; i < b.N; i++ {
var v []string
if err := gojson.Unmarshal(LargeSliceEscapedString, &v); err != nil {
b.Fatal(err)
}
}
})
// "Decode": a new Decoder over a fresh bytes.Reader is built on every
// iteration, so its construction is part of the measured work.
b.Run("Decode", func(b *testing.B) {
b.ReportAllocs()
for i := 0; i < b.N; i++ {
var v []string
if err := gojson.NewDecoder(bytes.NewReader(LargeSliceEscapedString)).Decode(&v); err != nil {
b.Fatal(err)
}
}
})
}
32 changes: 32 additions & 0 deletions decode_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4050,3 +4050,35 @@ func TestIssue429(t *testing.T) {
}
}
}

// TestUnescapeString checks how JSON string literals that contain invalid
// UTF-8 bytes or unpaired surrogate escapes are decoded. Every case is run
// through json.Unmarshal, json.NewDecoder(...).Decode and stdjson.Unmarshal,
// and all three must yield the same expected string. In the expected values,
// "\xef\xbf\xbd" is the UTF-8 encoding of U+FFFD (the replacement character).
func TestUnescapeString(t *testing.T) {
	ts := []struct {
		in  string
		out string
	}{
		{"\"\xff\"", "\xef\xbf\xbd"},
		{`"\ud800\ud800"`, "\xef\xbf\xbd\xef\xbf\xbd"},
		{`"\ud800\ud800\udc00"`, "\xef\xbf\xbd𐀀"},
		{"\"\xef\xbf\xbd\"", "\xef\xbf\xbd"},
		{"\"\xff\xff\xff\"", "\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd"},
		// A lone 0xEF lead byte directly followed by a valid multi-byte rune:
		// only the stray byte is replaced, the following rune is kept as is.
		{"\"\xefあ\"", "\xef\xbf\xbdあ"},
	}
	for _, tc := range ts {
		// Each check decodes into its own variable so that a decoder which
		// leaves the destination untouched cannot pass on a stale value left
		// behind by the previous check.
		{
			var s string
			err := json.Unmarshal([]byte(tc.in), &s)
			assertErr(t, err)
			assertEq(t, "escape string", tc.out, s)
		}
		{
			var s string
			err := json.NewDecoder(strings.NewReader(tc.in)).Decode(&s)
			assertErr(t, err)
			assertEq(t, "escape string", tc.out, s)
		}
		{
			var s string
			err := stdjson.Unmarshal([]byte(tc.in), &s)
			assertErr(t, err)
			assertEq(t, "escape string", tc.out, s)
		}
	}
}
4 changes: 4 additions & 0 deletions internal/decoder/context.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,10 @@ func char(ptr unsafe.Pointer, offset int64) byte {
return *(*byte)(unsafe.Pointer(uintptr(ptr) + uintptr(offset)))
}

// ptrUint16 returns a *uint16 that points offset bytes past ptr.
// No bounds or alignment checks are performed here, so the caller must make
// sure the two bytes at that address are readable.
// The pointer arithmetic is kept inside a single expression, as the
// unsafe.Pointer conversion rules require.
func ptrUint16(ptr unsafe.Pointer, offset int64) *uint16 {
return (*uint16)(unsafe.Pointer(uintptr(ptr) + uintptr(offset)))
}

func skipWhiteSpace(buf []byte, cursor int64) int64 {
for isWhiteSpace[buf[cursor]] {
cursor++
Expand Down
26 changes: 6 additions & 20 deletions internal/decoder/interface.go
Original file line number Diff line number Diff line change
Expand Up @@ -231,27 +231,13 @@ func (d *interfaceDecoder) decodeStreamEmptyInterface(s *Stream, depth int64, p
case '-', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
return d.numDecoder(s).DecodeStream(s, depth, p)
case '"':
s.cursor++
start := s.cursor
for {
switch s.char() {
case '\\':
if _, err := decodeEscapeString(s, nil); err != nil {
return err
}
case '"':
literal := s.buf[start:s.cursor]
s.cursor++
*(*interface{})(p) = string(literal)
return nil
case nul:
if s.read() {
continue
}
return errors.ErrUnexpectedEndOfJSON("string", s.totalOffset())
}
s.cursor++
b, cursor, err := stringBytes(s)
s.cursor = cursor
if err != nil {
return err
}
*(*interface{})(p) = string(b)
return nil
case 't':
if err := trueBytes(s); err != nil {
return err
Expand Down
155 changes: 82 additions & 73 deletions internal/decoder/stream.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,26 @@ const (
)

type Stream struct {
buf []byte
bufSize int64
length int64
r io.Reader
offset int64
cursor int64
filledBuffer bool
allRead bool
// r is the underlying reader
r io.Reader
// buf holds the buffered bytes that have been read from r.
// Its last byte is guaranteed to be nul,
// so at most bufSize-1 bytes of data are stored in it.
buf []byte
// length is the number of valid bytes stored in buf; buf[length] is nul
length int64
// bufSize is the size of the buffer.
// Its initial value is 512.
bufSize int64
// cursor is the index into buf that is currently being processed
cursor int64
// offset is the offset of the start of buf within the whole stream
offset int64
// filledBuffer is true when the entire content of buf is valid bytes
filledBuffer bool
// allRead becomes true once r has returned io.EOF at least once
allRead bool

UseNumber bool
DisallowUnknownFields bool
Option *Option
Expand All @@ -41,6 +53,7 @@ func (s *Stream) TotalOffset() int64 {
return s.totalOffset()
}

// Buffered is provided for compatibility with encoding/json.Decoder.
func (s *Stream) Buffered() io.Reader {
buflen := int64(len(s.buf))
for i := s.cursor; i < buflen; i++ {
Expand Down Expand Up @@ -71,6 +84,7 @@ func (s *Stream) PrepareForDecode() error {
return nil
}

// totalOffset reports the current position within the whole stream: the
// stream offset of the start of buf plus the cursor index into buf.
func (s *Stream) totalOffset() int64 {
	pos := s.offset
	pos += s.cursor
	return pos
}
Expand Down Expand Up @@ -103,7 +117,6 @@ func (s *Stream) statForRetry() ([]byte, int64, unsafe.Pointer) {

// Reset rebases the stream at the current cursor by calling reset, then sets
// bufSize to the length of the buf that remains.
func (s *Stream) Reset() {
s.reset()
s.bufSize = int64(len(s.buf))
}

func (s *Stream) More() bool {
Expand Down Expand Up @@ -148,7 +161,8 @@ func (s *Stream) Token() (interface{}, error) {
}
return f64, nil
case '"':
bytes, err := stringBytes(s)
bytes, cursor, err := stringBytes(s)
s.cursor = cursor
if err != nil {
return nil, err
}
Expand Down Expand Up @@ -181,40 +195,39 @@ END:
return nil, io.EOF
}

// reset advances offset by cursor and moves the start of buf to the cursor.
// Any cursor value or bufptr obtained before the call is invalidated.
// NOTE(review): the re-slicing statement appears twice in this view, once
// without and once with the MEMO comment — confirm only one is in the file.
func (s *Stream) reset() {
s.offset += s.cursor
s.buf = s.buf[s.cursor:]
s.buf = s.buf[s.cursor:] // MEMO: this ends up reusing buf
s.length -= s.cursor
s.cursor = 0
}

// readBuf returns the byte slice that the next read should fill.
// buf and bufSize may be updated as a side effect.
// NOTE(review): this view contains two return statements; everything after
// the first one is unreachable as written — confirm against the real file.
func (s *Stream) readBuf() []byte {
// If the previous read left buf completely filled with valid bytes, double
// the buffer size and copy the old buf into the new one.
if s.filledBuffer {
// TODO: an upper bound on bufSize should be enforced
s.bufSize *= 2
remainBuf := s.buf
s.buf = make([]byte, s.bufSize)
copy(s.buf, remainBuf)
}
remainLen := s.length - s.cursor
remainNotNulCharNum := int64(0)
// Count the bytes from the cursor up to the first nul (or up to length).
for i := int64(0); i < remainLen; i++ {
if s.buf[s.cursor+i] == nul {
break
}
remainNotNulCharNum++
}
s.length = s.cursor + remainNotNulCharNum
return s.buf[s.cursor+remainNotNulCharNum:]
return s.buf[s.length:]
}

// read reads bytes into buf.
// It returns false when the underlying reader returned an error, or when it is called after allRead has been set.
func (s *Stream) read() bool {
if s.allRead {
return false
}
buf := s.readBuf()
last := len(buf) - 1
buf[last] = nul
n, err := s.r.Read(buf[:last])
buf[n] = nul
s.length += int64(n)
if n == last {
s.filledBuffer = true
Expand All @@ -229,6 +242,31 @@ func (s *Stream) read() bool {
return true
}

// requires keeps calling read until buf holds at least n valid bytes starting at the given cursor.
// It returns the number of times read was called; a negative value is returned if a read failed.
func (s *Stream) requires(cursor, n int64) (read int) {
RETRY:
// MEMO: remain is not used here so that this function can be inlined
if s.length-cursor < n {
if !s.read() {
return -1
}
read++
goto RETRY
}
return
}

// syncBufptr is meant to be used together with requires to keep a cached
// buffer pointer in sync.
// r must be the value returned by requires. If at least one read succeeded
// (r > 0), *p is refreshed from s.bufptr(). r is returned unchanged in all
// cases.
func (s *Stream) syncBufptr(r int, p *unsafe.Pointer) int {
	if r <= 0 {
		// No successful read happened, so the pointer is left alone.
		return r
	}
	*p = s.bufptr()
	return r
}

func (s *Stream) skipWhiteSpace() byte {
p := s.bufptr()
LOOP:
Expand Down Expand Up @@ -457,100 +495,71 @@ func (s *Stream) skipValue(depth int64) error {
}

// nullBytes consumes the literal "null" from the stream. On entry the cursor
// is expected to be on the leading 'n'; on success it is left just past the
// final 'l'.
//
// It first asks requires for 4 bytes starting at the cursor. If that fails,
// the cursor is moved to the end of the buffered data and an
// unexpected-end-of-JSON error is returned. A byte that does not match the
// literal is reported as an invalid-character error.
func nullBytes(s *Stream) error {
	if s.requires(s.cursor, 4) < 0 {
		s.cursor = s.length
		// Report the position within the whole stream rather than within buf,
		// so the offset stays correct once reset has advanced s.offset.
		return errors.ErrUnexpectedEndOfJSON("null", s.totalOffset())
	}
	// current cursor's character is 'n'
	s.cursor++
	if s.char() != 'u' {
		if err := retryReadNull(s); err != nil {
			return err
		}
		return errors.ErrInvalidCharacter(s.char(), "null", s.totalOffset())
	}
	s.cursor++
	if s.char() != 'l' {
		if err := retryReadNull(s); err != nil {
			return err
		}
		return errors.ErrInvalidCharacter(s.char(), "null", s.totalOffset())
	}
	s.cursor++
	if s.char() != 'l' {
		if err := retryReadNull(s); err != nil {
			return err
		}
		return errors.ErrInvalidCharacter(s.char(), "null", s.totalOffset())
	}
	s.cursor++
	return nil
}

// retryReadNull is used by nullBytes when a byte of the "null" literal did not
// match. It returns nil only when the byte under the cursor is nul and read
// reports success; in every other case it returns an invalid-character error
// for the byte currently under the cursor.
func retryReadNull(s *Stream) error {
	if s.char() != nul || !s.read() {
		return errors.ErrInvalidCharacter(s.char(), "null", s.totalOffset())
	}
	return nil
}

// trueBytes consumes the literal "true" from the stream. On entry the cursor
// is expected to be on the leading 't'; on success it is left just past the
// final 'e'.
//
// It first asks requires for 4 bytes starting at the cursor. If that fails,
// the cursor is moved to the end of the buffered data and an
// unexpected-end-of-JSON error is returned. A byte that does not match the
// literal is reported as an invalid-character error.
func trueBytes(s *Stream) error {
	if s.requires(s.cursor, 4) < 0 {
		s.cursor = s.length
		// Report the position within the whole stream rather than within buf,
		// so the offset stays correct once reset has advanced s.offset.
		return errors.ErrUnexpectedEndOfJSON("bool(true)", s.totalOffset())
	}
	// current cursor's character is 't'
	s.cursor++
	if s.char() != 'r' {
		if err := retryReadTrue(s); err != nil {
			return err
		}
		return errors.ErrInvalidCharacter(s.char(), "bool(true)", s.totalOffset())
	}
	s.cursor++
	if s.char() != 'u' {
		if err := retryReadTrue(s); err != nil {
			return err
		}
		return errors.ErrInvalidCharacter(s.char(), "bool(true)", s.totalOffset())
	}
	s.cursor++
	if s.char() != 'e' {
		if err := retryReadTrue(s); err != nil {
			return err
		}
		return errors.ErrInvalidCharacter(s.char(), "bool(true)", s.totalOffset())
	}
	s.cursor++
	return nil
}

// retryReadTrue is used by trueBytes when a byte of the "true" literal did not
// match. It returns nil only when the byte under the cursor is nul and read
// reports success; in every other case it returns an invalid-character error
// for the byte currently under the cursor.
func retryReadTrue(s *Stream) error {
	if s.char() != nul || !s.read() {
		return errors.ErrInvalidCharacter(s.char(), "bool(true)", s.totalOffset())
	}
	return nil
}

// falseBytes consumes the literal "false" from the stream. On entry the
// cursor is expected to be on the leading 'f'; on success it is left just
// past the final 'e'.
//
// It first asks requires for 5 bytes starting at the cursor. If that fails,
// the cursor is moved to the end of the buffered data and an
// unexpected-end-of-JSON error is returned. A byte that does not match the
// literal is reported as an invalid-character error.
func falseBytes(s *Stream) error {
	if s.requires(s.cursor, 5) < 0 {
		s.cursor = s.length
		// Report the position within the whole stream rather than within buf,
		// so the offset stays correct once reset has advanced s.offset.
		return errors.ErrUnexpectedEndOfJSON("bool(false)", s.totalOffset())
	}
	// current cursor's character is 'f'
	s.cursor++
	if s.char() != 'a' {
		if err := retryReadFalse(s); err != nil {
			return err
		}
		return errors.ErrInvalidCharacter(s.char(), "bool(false)", s.totalOffset())
	}
	s.cursor++
	if s.char() != 'l' {
		if err := retryReadFalse(s); err != nil {
			return err
		}
		return errors.ErrInvalidCharacter(s.char(), "bool(false)", s.totalOffset())
	}
	s.cursor++
	if s.char() != 's' {
		if err := retryReadFalse(s); err != nil {
			return err
		}
		return errors.ErrInvalidCharacter(s.char(), "bool(false)", s.totalOffset())
	}
	s.cursor++
	if s.char() != 'e' {
		if err := retryReadFalse(s); err != nil {
			return err
		}
		return errors.ErrInvalidCharacter(s.char(), "bool(false)", s.totalOffset())
	}
	s.cursor++
	return nil
}

// retryReadFalse is used by falseBytes when a byte of the "false" literal did
// not match. It returns nil only when the byte under the cursor is nul and
// read reports success; in every other case it returns an invalid-character
// error for the byte currently under the cursor.
func retryReadFalse(s *Stream) error {
	if s.char() != nul || !s.read() {
		return errors.ErrInvalidCharacter(s.char(), "bool(false)", s.totalOffset())
	}
	return nil
}

0 comments on commit 02e2b6b

Please sign in to comment.