From 22edcf6a68a057ed04368d5f78c8ba3ddfee8d57 Mon Sep 17 00:00:00 2001 From: Luke Shumaker Date: Sat, 25 Feb 2023 11:11:36 -0700 Subject: reencode: Improve the error messages for trailing partial-UTF-8 --- ReleaseNotes.md | 4 +++ .../json/testdata/fuzz/FuzzEquiv/95640f7d88708118 | 2 ++ reencode.go | 18 ++++++++-- reencode_test.go | 39 +++++++++++++++++++++- 4 files changed, 59 insertions(+), 4 deletions(-) create mode 100644 compat/json/testdata/fuzz/FuzzEquiv/95640f7d88708118 diff --git a/ReleaseNotes.md b/ReleaseNotes.md index c9d1233..71973aa 100644 --- a/ReleaseNotes.md +++ b/ReleaseNotes.md @@ -33,6 +33,10 @@ `.WriteString`. This only comes up if there is an I/O causing a partial write. + - Bugfix: ReEncoder: The error messages for trailing partial UTF-8 + now reflect the `InvalidUTF8` setting, rather than simply saying + "unflushed unicode garbage". + # v0.3.7 (2023-02-20) Theme: Fixes from fuzzing (part 1?) diff --git a/compat/json/testdata/fuzz/FuzzEquiv/95640f7d88708118 b/compat/json/testdata/fuzz/FuzzEquiv/95640f7d88708118 new file mode 100644 index 0000000..77924f3 --- /dev/null +++ b/compat/json/testdata/fuzz/FuzzEquiv/95640f7d88708118 @@ -0,0 +1,2 @@ +go test fuzz v1 +[]byte("\xf0") diff --git a/reencode.go b/reencode.go index fd36875..c19e296 100644 --- a/reencode.go +++ b/reencode.go @@ -329,9 +329,21 @@ func (enc *ReEncoder) WriteRune(c rune) (n int, err error) { // if enc.AllowMultipleValues is set. func (enc *ReEncoder) Close() error { if enc.bufLen > 0 { - return &ReEncodeSyntaxError{ - Offset: enc.inputPos, - Err: fmt.Errorf("%w: unflushed unicode garbage: %q", io.ErrUnexpectedEOF, enc.buf[:enc.bufLen]), + if enc.utf == InvalidUTF8Error { + return &ReEncodeSyntaxError{ + Offset: enc.inputPos, + Err: fmt.Errorf("truncated UTF-8: %q", enc.buf[:enc.bufLen]), + } + } + for i := 0; i < enc.bufLen; i++ { + if enc.utf == InvalidUTF8Replace { + enc.handleRune(utf8.RuneError, 1, true) + } else { + enc.handleRune(rune(enc.buf[i]), 1, false) + } + if enc.err != nil { + return enc.err + } } } if _, err := enc.par.HandleEOF(); err != nil { diff --git a/reencode_test.go b/reencode_test.go index feabde5..60180c8 100644 --- a/reencode_test.go +++ b/reencode_test.go @@ -15,7 +15,7 @@ import ( "git.lukeshu.com/go/lowmemjson/internal/fastio" ) -func TestReEncode(t *testing.T) { +func TestEncodeReEncode(t *testing.T) { t.Parallel() type testcase struct { enc ReEncoderConfig @@ -168,6 +168,43 @@ func TestReEncode(t *testing.T) { } } +func TestReEncode(t *testing.T) { + t.Parallel() + type testcase struct { + Cfg ReEncoderConfig + In string + ExpOut string + ExpWriteErr string + ExpCloseErr string + } + testcases := map[string]testcase{ + "partial-utf8-replace": {Cfg: ReEncoderConfig{InvalidUTF8: InvalidUTF8Replace}, In: "\xf0\xbf", ExpOut: ``, ExpCloseErr: "json: syntax error at input byte 0: invalid character '\uFFFD' looking for beginning of value"}, + "partial-utf8-preserve": {Cfg: ReEncoderConfig{InvalidUTF8: InvalidUTF8Preserve}, In: "\xf0\xbf", ExpOut: ``, ExpCloseErr: `json: syntax error at input byte 0: invalid character '\xf0' looking for beginning of value`}, + "partial-utf8-error": {Cfg: ReEncoderConfig{InvalidUTF8: InvalidUTF8Error}, In: "\xf0\xbf", ExpOut: ``, ExpCloseErr: `json: syntax error at input byte 0: truncated UTF-8: "\xf0\xbf"`}, + } + for tcName, tc := range testcases { + tc := tc + t.Run(tcName, func(t *testing.T) { + t.Parallel() + var out strings.Builder + enc := NewReEncoder(&out, tc.Cfg) + _, err := enc.WriteString(tc.In) + assert.Equal(t, tc.ExpOut, out.String()) + if tc.ExpWriteErr == "" { + assert.NoError(t, err) + } else { + assert.EqualError(t, err, tc.ExpWriteErr) + } + err = enc.Close() + if tc.ExpCloseErr == "" { + assert.NoError(t, err) + } else { + assert.EqualError(t, err, tc.ExpCloseErr) + } + }) + } +} + func TestReEncodeWriteSize(t *testing.T) { t.Parallel() -- cgit v1.2.3