From 051f966039028d257f27fc3a42c10cbff9f7c738 Mon Sep 17 00:00:00 2001 From: Luke Shumaker Date: Thu, 23 Feb 2023 21:30:12 -0700 Subject: decode: Include the invalid UTF-8 byte in error messages --- ReleaseNotes.md | 4 ++ compat/json/compat.go | 33 ++++++++++++++-- compat/json/compat_test.go | 46 ++++++++++++---------- .../json/testdata/fuzz/FuzzEquiv/9e35149f0eb0866b | 2 + decode_scan.go | 15 +++++++ 5 files changed, 75 insertions(+), 25 deletions(-) create mode 100644 compat/json/testdata/fuzz/FuzzEquiv/9e35149f0eb0866b diff --git a/ReleaseNotes.md b/ReleaseNotes.md index 48982e4..af2adcc 100644 --- a/ReleaseNotes.md +++ b/ReleaseNotes.md @@ -14,6 +14,10 @@ then the first type error encountered is returned. This is consistent with the behavior of `encoding/json`. + - Bugfix: Decoder: If there is a syntax error in a byte that is + invalid UTF-8, include that byte value in the error message + rather than including the U+FFFD Unicode replacement character. + # v0.3.7 (2023-02-20) Theme: Fixes from fuzzing (part 1?) 
diff --git a/compat/json/compat.go b/compat/json/compat.go index c2d47c0..6f13fbb 100644 --- a/compat/json/compat.go +++ b/compat/json/compat.go @@ -329,7 +329,10 @@ func Unmarshal(data []byte, ptr any) error { } type teeRuneScanner struct { - src io.RuneScanner + src interface { + io.RuneScanner + io.ByteScanner + } dst *bytes.Buffer lastSize int } @@ -337,11 +340,14 @@ type teeRuneScanner struct { func (tee *teeRuneScanner) ReadRune() (r rune, size int, err error) { r, size, err = tee.src.ReadRune() if err == nil { - if _, err := tee.dst.WriteRune(r); err != nil { - return 0, 0, err + if r == utf8.RuneError && size == 1 { + _ = tee.src.UnreadRune() + b, _ := tee.src.ReadByte() + _ = tee.dst.WriteByte(b) + } else { + _, _ = tee.dst.WriteRune(r) } } - tee.lastSize = size return } @@ -356,6 +362,25 @@ func (tee *teeRuneScanner) UnreadRune() error { return nil } +func (tee *teeRuneScanner) ReadByte() (b byte, err error) { + b, err = tee.src.ReadByte() + if err == nil { + _ = tee.dst.WriteByte(b) + tee.lastSize = 1 + } + return +} + +func (tee *teeRuneScanner) UnreadByte() error { + if tee.lastSize != 1 { + return lowmemjson.ErrInvalidUnreadRune + } + _ = tee.src.UnreadByte() + tee.dst.Truncate(tee.dst.Len() - tee.lastSize) + tee.lastSize = 0 + return nil +} + type Decoder struct { validatorBuf *bufio.Reader validator *lowmemjson.Decoder diff --git a/compat/json/compat_test.go b/compat/json/compat_test.go index 098ac85..6aab103 100644 --- a/compat/json/compat_test.go +++ b/compat/json/compat_test.go @@ -72,13 +72,14 @@ func TestCompatCompact(t *testing.T) { Err string } testcases := map[string]testcase{ - "trunc": {In: `{`, Out: ``, Err: `unexpected end of JSON input`}, - "object": {In: `{}`, Out: `{}`}, - "non-utf8": {In: "\"\x85\xcd\"", Out: "\"\x85\xcd\""}, - "float": {In: `1.200e003`, Out: `1.200e003`}, - "hex-lower": {In: `"\uabcd"`, Out: `"\uabcd"`}, - "hex-upper": {In: `"\uABCD"`, Out: `"\uABCD"`}, - "hex-mixed": {In: `"\uAbCd"`, Out: `"\uAbCd"`}, + "trunc": 
{In: `{`, Out: ``, Err: `unexpected end of JSON input`}, + "object": {In: `{}`, Out: `{}`}, + "non-utf8": {In: "\"\x85\xcd\"", Out: "\"\x85\xcd\""}, + "float": {In: `1.200e003`, Out: `1.200e003`}, + "hex-lower": {In: `"\uabcd"`, Out: `"\uabcd"`}, + "hex-upper": {In: `"\uABCD"`, Out: `"\uABCD"`}, + "hex-mixed": {In: `"\uAbCd"`, Out: `"\uAbCd"`}, + "invalid-utf8": {In: "\x85", Err: `invalid character '\u0085' looking for beginning of value`}, } for tcName, tc := range testcases { tc := tc @@ -105,20 +106,21 @@ func TestCompatIndent(t *testing.T) { Err string } testcases := map[string]testcase{ - "trunc": {In: `{`, Out: ``, Err: `unexpected end of JSON input`}, - "object": {In: `{}`, Out: `{}`}, - "non-utf8": {In: "\"\x85\xcd\"", Out: "\"\x85\xcd\""}, - "float": {In: `1.200e003`, Out: `1.200e003`}, - "tailws0": {In: `0`, Out: `0`}, - "tailws1": {In: `0 `, Out: `0 `}, - "tailws2": {In: `0 `, Out: `0 `}, - "tailws3": {In: "0\n", Out: "0\n"}, - "headws1": {In: ` 0`, Out: `0`}, - "objws1": {In: `{"a" : 1}`, Out: "{\n>.\"a\": 1\n>}"}, - "objws2": {In: "{\"a\"\n:\n1}", Out: "{\n>.\"a\": 1\n>}"}, - "hex-lower": {In: `"\uabcd"`, Out: `"\uabcd"`}, - "hex-upper": {In: `"\uABCD"`, Out: `"\uABCD"`}, - "hex-mixed": {In: `"\uAbCd"`, Out: `"\uAbCd"`}, + "trunc": {In: `{`, Out: ``, Err: `unexpected end of JSON input`}, + "object": {In: `{}`, Out: `{}`}, + "non-utf8": {In: "\"\x85\xcd\"", Out: "\"\x85\xcd\""}, + "float": {In: `1.200e003`, Out: `1.200e003`}, + "tailws0": {In: `0`, Out: `0`}, + "tailws1": {In: `0 `, Out: `0 `}, + "tailws2": {In: `0 `, Out: `0 `}, + "tailws3": {In: "0\n", Out: "0\n"}, + "headws1": {In: ` 0`, Out: `0`}, + "objws1": {In: `{"a" : 1}`, Out: "{\n>.\"a\": 1\n>}"}, + "objws2": {In: "{\"a\"\n:\n1}", Out: "{\n>.\"a\": 1\n>}"}, + "hex-lower": {In: `"\uabcd"`, Out: `"\uabcd"`}, + "hex-upper": {In: `"\uABCD"`, Out: `"\uABCD"`}, + "hex-mixed": {In: `"\uAbCd"`, Out: `"\uAbCd"`}, + "invalid-utf8": {In: "\x85", Err: `invalid character '\u0085' looking for beginning of 
value`}, } for tcName, tc := range testcases { tc := tc @@ -181,6 +183,7 @@ func TestCompatUnmarshal(t *testing.T) { "two-objs": {In: `{} {}`, ExpOut: nil, ExpErr: `invalid character '{' after top-level value`}, "two-numbers1": {In: `00`, ExpOut: nil, ExpErr: `invalid character '0' after top-level value`}, "two-numbers2": {In: `1 2`, ExpOut: nil, ExpErr: `invalid character '2' after top-level value`}, + "invalid-utf8": {In: "\x85", ExpErr: `invalid character '\u0085' looking for beginning of value`}, // 2e308 is slightly more than math.MaxFloat64 (~1.79e308) "obj-overflow": {In: `{"foo":"bar", "baz":2e308, "qux": "orb"}`, ExpOut: map[string]any{"foo": "bar", "baz": nil, "qux": "orb"}, ExpErr: `json: cannot unmarshal number 2e308 into Go value of type float64`}, "ary-overflow": {In: `["foo",2e308,"bar",3e308]`, ExpOut: []any{"foo", nil, "bar", nil}, ExpErr: `json: cannot unmarshal number 2e308 into Go value of type float64`}, @@ -223,6 +226,7 @@ func TestCompatDecode(t *testing.T) { "two-objs": {In: `{} {}`, ExpOut: map[string]any{}}, "two-numbers1": {In: `00`, ExpOut: float64(0)}, "two-numbers2": {In: `1 2`, ExpOut: float64(1)}, + "invalid-utf8": {In: "\x85", ExpErr: `invalid character '\u0085' looking for beginning of value`}, // 2e308 is slightly more than math.MaxFloat64 (~1.79e308) "obj-overflow": {In: `{"foo":"bar", "baz":2e308, "qux": "orb"}`, ExpOut: map[string]any{"foo": "bar", "baz": nil, "qux": "orb"}, ExpErr: `json: cannot unmarshal number 2e308 into Go value of type float64`}, "ary-overflow": {In: `["foo",2e308,"bar",3e308]`, ExpOut: []any{"foo", nil, "bar", nil}, ExpErr: `json: cannot unmarshal number 2e308 into Go value of type float64`}, diff --git a/compat/json/testdata/fuzz/FuzzEquiv/9e35149f0eb0866b b/compat/json/testdata/fuzz/FuzzEquiv/9e35149f0eb0866b new file mode 100644 index 0000000..bb8752b --- /dev/null +++ b/compat/json/testdata/fuzz/FuzzEquiv/9e35149f0eb0866b @@ -0,0 +1,2 @@ +go test fuzz v1 +[]byte("\x85") diff --git a/decode_scan.go 
b/decode_scan.go index 63694c4..940de49 100644 --- a/decode_scan.go +++ b/decode_scan.go @@ -6,6 +6,7 @@ package lowmemjson import ( "io" + "unicode/utf8" "git.lukeshu.com/go/lowmemjson/internal/jsonparse" ) @@ -55,6 +56,17 @@ func (sc *runeTypeScanner) ReadRuneType() (rune, int, jsonparse.RuneType, error) sc.offset += int64(sc.rSize) switch err { case nil: + invalidUTF8 := false + if sc.rRune == utf8.RuneError && sc.rSize == 1 { + if bs, ok := sc.inner.(io.ByteScanner); ok { + _ = bs.UnreadByte() // UnreadRune doesn't back up the ReadByte-pos + b, _ := bs.ReadByte() + _ = bs.UnreadByte() + _, _, _ = sc.inner.ReadRune() + sc.rRune = rune(b) + invalidUTF8 = true + } + } sc.rType, err = sc.parser.HandleRune(sc.rRune) if err != nil { sc.rErr = &DecodeSyntaxError{ @@ -62,6 +74,9 @@ func (sc *runeTypeScanner) ReadRuneType() (rune, int, jsonparse.RuneType, error) Err: err, } } else { + if invalidUTF8 { + sc.rRune = utf8.RuneError + } sc.rErr = nil } switch sc.rType { -- cgit v1.2.3