From a4ffa83919375653415b670fdbf136e29c1b1c2c Mon Sep 17 00:00:00 2001 From: Luke Shumaker Date: Thu, 23 Feb 2023 22:08:27 -0700 Subject: compat/json: Unify the error conversion --- compat/json/compat.go | 168 ++++++++++++++++++++++++++++---------------------- 1 file changed, 93 insertions(+), 75 deletions(-) diff --git a/compat/json/compat.go b/compat/json/compat.go index 695c1a8..c2d47c0 100644 --- a/compat/json/compat.go +++ b/compat/json/compat.go @@ -40,22 +40,100 @@ type ( // MarshalerError = json.MarshalerError // Duplicated to access a private field. ) -// Encode wrappers /////////////////////////////////////////////////// +// Error conversion ////////////////////////////////////////////////// -func convertEncodeError(err error) error { - if me, ok := err.(*lowmemjson.EncodeMethodError); ok { - err = &MarshalerError{ - Type: me.Type, - Err: me.Err, - sourceFunc: me.SourceFunc, +func convertError(err error, isUnmarshal bool) error { + switch err := err.(type) { + case nil: + return nil + case *lowmemjson.DecodeArgumentError: + return err + case *lowmemjson.DecodeError: + switch suberr := err.Err.(type) { + case *lowmemjson.DecodeReadError: + return err + case *lowmemjson.DecodeSyntaxError: + switch { + case errors.Is(err, io.EOF): + return io.EOF + case errors.Is(err, io.ErrUnexpectedEOF) && isUnmarshal: + return &SyntaxError{ + msg: "unexpected end of JSON input", + Offset: suberr.Offset, + } + default: + return &SyntaxError{ + msg: suberr.Err.Error(), + Offset: suberr.Offset + 1, + } + } + case *lowmemjson.DecodeTypeError: + switch subsuberr := suberr.Err.(type) { + case *UnmarshalTypeError: + // Populate the .Struct and .Field members. + subsuberr.Struct = err.FieldParent + subsuberr.Field = err.FieldName + return subsuberr + default: + switch { + case errors.Is(err, lowmemjson.ErrDecodeNonEmptyInterface), + errors.Is(err, strconv.ErrSyntax), + errors.Is(err, strconv.ErrRange): + return &UnmarshalTypeError{ + Value: suberr.JSONType, + Type: suberr.GoType, + Offset: suberr.Offset, + Struct: err.FieldParent, + Field: err.FieldName, + } + default: + return subsuberr + } + case nil, *lowmemjson.DecodeArgumentError: + return &UnmarshalTypeError{ + Value: suberr.JSONType, + Type: suberr.GoType, + Offset: suberr.Offset, + Struct: err.FieldParent, + Field: err.FieldName, + } + } + default: + panic(fmt.Errorf("should not happen: unexpected lowmemjson.DecodeError sub-type: %T: %w", suberr, err)) + } + case *lowmemjson.EncodeWriteError: + return err + case *lowmemjson.EncodeTypeError: + return err + case *lowmemjson.EncodeValueError: + return err + case *lowmemjson.EncodeMethodError: + return &MarshalerError{ + Type: err.Type, + Err: err.Err, + sourceFunc: err.SourceFunc, + } + case *lowmemjson.ReEncodeWriteError: + return err + case *lowmemjson.ReEncodeSyntaxError: + ret := &SyntaxError{ + msg: err.Err.Error(), + Offset: err.Offset + 1, } + if errors.Is(err, io.ErrUnexpectedEOF) { + ret.msg = "unexpected end of JSON input" + } + return ret + default: + panic(fmt.Errorf("should not happen: unexpected lowmemjson error type: %T: %w", err, err)) } - return err } +// Encode wrappers /////////////////////////////////////////////////// + func marshal(v any, cfg lowmemjson.ReEncoderConfig) ([]byte, error) { var buf bytes.Buffer - if err := convertEncodeError(lowmemjson.NewEncoder(lowmemjson.NewReEncoder(&buf, cfg)).Encode(v)); err != nil { + if err := convertError(lowmemjson.NewEncoder(lowmemjson.NewReEncoder(&buf, cfg)).Encode(v), false); err != nil { return nil, err } return buf.Bytes(), nil @@ -105,7 +183,7 @@ func (enc *Encoder) refreshConfig() { } func (enc *Encoder) Encode(v any) error { - if err := convertEncodeError(enc.encoder.Encode(v)); err != nil { + if err := convertError(enc.encoder.Encode(v), false); err != nil { enc.buf.Reset() return err } @@ -133,19 +211,6 @@ func (enc *Encoder) SetIndent(prefix, indent string) { // ReEncode wrappers ///////////////////////////////////////////////// -func convertReEncodeError(err error) error { - if se, ok := err.(*lowmemjson.ReEncodeSyntaxError); ok { - err = &SyntaxError{ - msg: se.Err.Error(), - Offset: se.Offset + 1, - } - if errors.Is(se.Err, io.ErrUnexpectedEOF) { - err.(*SyntaxError).msg = "unexpected end of JSON input" - } - } - return err -} - func HTMLEscape(dst *bytes.Buffer, src []byte) { for n := 0; n < len(src); { c, size := utf8.DecodeRune(src[n:]) @@ -172,7 +237,7 @@ func reencode(dst io.Writer, src []byte, cfg lowmemjson.ReEncoderConfig) error { if err == nil { err = formatter.Close() } - return convertReEncodeError(err) + return convertError(err, false) } func Compact(dst *bytes.Buffer, src []byte) error { @@ -237,53 +302,6 @@ func Valid(data []byte) bool { // Decode wrappers /////////////////////////////////////////////////// -func convertDecodeError(err error, isUnmarshal bool) error { - if derr, ok := err.(*lowmemjson.DecodeError); ok { - switch terr := derr.Err.(type) { - case *lowmemjson.DecodeSyntaxError: - switch { - case errors.Is(terr.Err, io.EOF): - err = io.EOF - case errors.Is(terr.Err, io.ErrUnexpectedEOF) && isUnmarshal: - err = &SyntaxError{ - msg: "unexpected end of JSON input", - Offset: terr.Offset, - } - default: - err = &SyntaxError{ - msg: terr.Err.Error(), - Offset: terr.Offset + 1, - } - } - case *lowmemjson.DecodeTypeError: - if typeErr, ok := terr.Err.(*json.UnmarshalTypeError); ok { - err = &UnmarshalTypeError{ - Value: typeErr.Value, - Type: typeErr.Type, - Offset: typeErr.Offset, - Struct: derr.FieldParent, - Field: derr.FieldName, - } - } else if _, isArgErr := terr.Err.(*lowmemjson.DecodeArgumentError); terr.Err != nil && - !isArgErr && - !errors.Is(terr.Err, lowmemjson.ErrDecodeNonEmptyInterface) && - !errors.Is(terr.Err, strconv.ErrSyntax) && - !errors.Is(terr.Err, strconv.ErrRange) { - err = terr.Err - } else { - err = &UnmarshalTypeError{ - Value: terr.JSONType, - Type: terr.GoType, - Offset: terr.Offset, - Struct: derr.FieldParent, - Field: derr.FieldName, - } - } - } - } - return err -} - type decodeValidator struct{} func (*decodeValidator) DecodeJSON(r io.RuneScanner) error { @@ -301,10 +319,10 @@ func (*decodeValidator) DecodeJSON(r io.RuneScanner) error { var _ lowmemjson.Decodable = (*decodeValidator)(nil) func Unmarshal(data []byte, ptr any) error { - if err := convertDecodeError(lowmemjson.NewDecoder(bytes.NewReader(data)).DecodeThenEOF(&decodeValidator{}), true); err != nil { + if err := convertError(lowmemjson.NewDecoder(bytes.NewReader(data)).DecodeThenEOF(&decodeValidator{}), true); err != nil { return err } - if err := convertDecodeError(lowmemjson.NewDecoder(bytes.NewReader(data)).DecodeThenEOF(ptr), true); err != nil { + if err := convertError(lowmemjson.NewDecoder(bytes.NewReader(data)).DecodeThenEOF(ptr), true); err != nil { return err } return nil @@ -363,10 +381,10 @@ func NewDecoder(r io.Reader) *Decoder { } func (dec *Decoder) Decode(ptr any) error { - if err := convertDecodeError(dec.validator.Decode(&decodeValidator{}), false); err != nil { + if err := convertError(dec.validator.Decode(&decodeValidator{}), false); err != nil { return err } - if err := convertDecodeError(dec.Decoder.Decode(ptr), false); err != nil { + if err := convertError(dec.Decoder.Decode(ptr), false); err != nil { return err } return nil -- cgit v1.2.3 From 5c33db4970fcb35ae3823d61f9576fd65b5fcbdf Mon Sep 17 00:00:00 2001 From: Luke Shumaker Date: Fri, 24 Feb 2023 20:51:49 -0700 Subject: decode: Touch up a panic error message --- decode.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/decode.go b/decode.go index d4ecd4f..44eaba4 100644 --- a/decode.go +++ b/decode.go @@ -316,7 +316,7 @@ func (dec *Decoder) expectRuneOrPanic(ec rune, et jsonparse.RuneType) *DecodeErr return err } if ac != ec || at != et { - panic(fmt.Errorf("should not happen: expected %q/%v but got %q/%v", ec, et, ac, at)) + panic(fmt.Errorf("should not happen: expected r=%q t=%#v but got r=%q t=%#v", ec, et, ac, at)) } return nil } -- cgit v1.2.3 From d35495540df2b6d3ba16c84ce21627d9dbae000c Mon Sep 17 00:00:00 2001 From: Luke Shumaker Date: Fri, 10 Feb 2023 23:38:26 -0700 Subject: Fuzz for equivalence between stdlib and lowmemjson --- compat/json/equiv_test.go | 160 +++++++++++++++++++++ .../json/testdata/fuzz/FuzzEquiv/0064ebc3507e959b | 2 + .../json/testdata/fuzz/FuzzEquiv/19981bffc2abbaf1 | 2 + .../json/testdata/fuzz/FuzzEquiv/57365320c0968611 | 2 + .../json/testdata/fuzz/FuzzEquiv/5cd6893f25481dae | 2 + .../json/testdata/fuzz/FuzzEquiv/6a6612e05e0f9e32 | 2 + .../json/testdata/fuzz/FuzzEquiv/77e6e971d8684f84 | 2 + .../json/testdata/fuzz/FuzzEquiv/8727b16d337d7b81 | 2 + .../json/testdata/fuzz/FuzzEquiv/96aac43014471adc | 2 + .../json/testdata/fuzz/FuzzEquiv/9cc52906ed53ef5f | 2 + .../json/testdata/fuzz/FuzzEquiv/a0b9ecf4e99fd85d | 2 + .../json/testdata/fuzz/FuzzEquiv/a5775dd298b90a6c | 2 + .../json/testdata/fuzz/FuzzEquiv/af9bedcb9e0a31e8 | 2 + .../json/testdata/fuzz/FuzzEquiv/f6b0960dd3331a00 | 2 + .../json/testdata/fuzz/FuzzEquiv/fbbce5ea61559cc6 | 2 + .../json/testdata/fuzz/FuzzEquiv/fd29ccbb2af92d4f | 2 + 16 files changed, 190 insertions(+) create mode 100644 compat/json/equiv_test.go create mode 100644 compat/json/testdata/fuzz/FuzzEquiv/0064ebc3507e959b create mode 100644 compat/json/testdata/fuzz/FuzzEquiv/19981bffc2abbaf1 create mode 100644 compat/json/testdata/fuzz/FuzzEquiv/57365320c0968611 create mode 100644 compat/json/testdata/fuzz/FuzzEquiv/5cd6893f25481dae create mode 100644 compat/json/testdata/fuzz/FuzzEquiv/6a6612e05e0f9e32 create mode 100644 compat/json/testdata/fuzz/FuzzEquiv/77e6e971d8684f84 create mode 100644 compat/json/testdata/fuzz/FuzzEquiv/8727b16d337d7b81 create mode 100644 compat/json/testdata/fuzz/FuzzEquiv/96aac43014471adc create mode 100644 compat/json/testdata/fuzz/FuzzEquiv/9cc52906ed53ef5f create mode 100644 compat/json/testdata/fuzz/FuzzEquiv/a0b9ecf4e99fd85d create mode 100644 compat/json/testdata/fuzz/FuzzEquiv/a5775dd298b90a6c create mode 100644 compat/json/testdata/fuzz/FuzzEquiv/af9bedcb9e0a31e8 create mode 100644 compat/json/testdata/fuzz/FuzzEquiv/f6b0960dd3331a00 create mode 100644 compat/json/testdata/fuzz/FuzzEquiv/fbbce5ea61559cc6 create mode 100644 compat/json/testdata/fuzz/FuzzEquiv/fd29ccbb2af92d4f diff --git a/compat/json/equiv_test.go b/compat/json/equiv_test.go new file mode 100644 index 0000000..246e4b3 --- /dev/null +++ b/compat/json/equiv_test.go @@ -0,0 +1,160 @@ +// Copyright (C) 2023 Luke Shumaker +// +// SPDX-License-Identifier: GPL-2.0-or-later + +package json_test + +import ( + "bytes" + std "encoding/json" + "errors" + "io" + "strconv" + "strings" + "testing" + "unicode/utf8" + + "github.com/stretchr/testify/assert" + + low "git.lukeshu.com/go/lowmemjson/compat/json" +) + +func assertEquivErr(t *testing.T, stdErr, lowErr error) { + if (stdErr == nil) || (lowErr == nil) { + // Nil-equal. + assert.Equal(t, stdErr, lowErr) + return + } + switch stdErr.(type) { + case *std.SyntaxError: + if lowErr != nil { + stdMsg := stdErr.Error() + lowMsg := lowErr.Error() + + // https://github.com/golang/go/issues/58680 + if strings.HasPrefix(stdMsg, `invalid character ' ' `) && + (errors.Is(lowErr, io.ErrUnexpectedEOF) || lowMsg == "unexpected end of JSON input") { + return + } + + // https://github.com/golang/go/issues/58713 + prefix := `invalid character '` + if stdMsg != lowMsg && strings.HasPrefix(stdMsg, prefix) && strings.HasPrefix(lowMsg, prefix) { + stdRune, stdRuneSize := utf8.DecodeRuneInString(stdMsg[len(prefix):]) + lowByte := lowMsg[len(prefix)] + if lowByte == '\\' { + switch lowMsg[len(prefix)+1] { + case 'u': + lowRune, _ := strconv.ParseUint(lowMsg[len(prefix)+2:][:4], 16, 32) + var buf [4]byte + utf8.EncodeRune(buf[:], rune(lowRune)) + lowByte = buf[0] + case 'U': + lowRune, _ := strconv.ParseUint(lowMsg[len(prefix)+2:][:8], 16, 32) + var buf [4]byte + utf8.EncodeRune(buf[:], rune(lowRune)) + lowByte = buf[0] + } + } + if stdRune == rune(lowByte) { + lowRuneStr := lowMsg[len(prefix):] + lowRuneStr = lowRuneStr[:strings.IndexByte(lowRuneStr, '\'')] + stdMsg = prefix + lowRuneStr + stdMsg[len(prefix)+stdRuneSize:] + stdErr = errors.New(stdMsg) + } + } + } + // Text-equal. + assert.Equal(t, stdErr.Error(), lowErr.Error()) + // TODO: Assert that they are deep-equal (but be permissive of these not being type aliases). + case *std.MarshalerError: + // Text-equal. + assert.Equal(t, stdErr.Error(), lowErr.Error()) + // TODO: Assert that they are deep-equal (but be permissive of these not being type aliases). + default: + // Text-equal. + assert.Equal(t, stdErr.Error(), lowErr.Error()) + // TODO: Assert that they are deep-equal. + } +} + +func FuzzEquiv(f *testing.F) { + f.Fuzz(func(t *testing.T, str []byte) { + t.Logf("str=%q", str) + t.Run("HTMLEscape", func(t *testing.T) { + var stdOut bytes.Buffer + std.HTMLEscape(&stdOut, str) + + var lowOut bytes.Buffer + low.HTMLEscape(&lowOut, str) + + assert.Equal(t, stdOut.String(), lowOut.String()) + }) + t.Run("Compact", func(t *testing.T) { + var stdOut bytes.Buffer + stdErr := std.Compact(&stdOut, str) + + var lowOut bytes.Buffer + lowErr := low.Compact(&lowOut, str) + + assert.Equal(t, stdOut.String(), lowOut.String()) + assertEquivErr(t, stdErr, lowErr) + }) + t.Run("Indent", func(t *testing.T) { + var stdOut bytes.Buffer + stdErr := std.Indent(&stdOut, str, "ยป", "\t") + + var lowOut bytes.Buffer + lowErr := low.Indent(&lowOut, str, "ยป", "\t") + + assert.Equal(t, stdOut.String(), lowOut.String()) + assertEquivErr(t, stdErr, lowErr) + }) + t.Run("Valid", func(t *testing.T) { + stdValid := std.Valid(str) && utf8.Valid(str) // https://github.com/golang/go/issues/58517 + lowValid := low.Valid(str) + assert.Equal(t, stdValid, lowValid) + }) + t.Run("Decode-Encode", func(t *testing.T) { + var stdObj any + stdErr := std.NewDecoder(bytes.NewReader(str)).Decode(&stdObj) + + var lowObj any + lowErr := low.NewDecoder(bytes.NewReader(str)).Decode(&lowObj) + + assert.Equal(t, stdObj, lowObj) + assertEquivErr(t, stdErr, lowErr) + if t.Failed() { + return + } + + var stdOut bytes.Buffer + stdErr = std.NewEncoder(&stdOut).Encode(stdObj) + + var lowOut bytes.Buffer + lowErr = low.NewEncoder(&lowOut).Encode(lowObj) + + assert.Equal(t, stdOut.String(), lowOut.String()) + assertEquivErr(t, stdErr, lowErr) + }) + t.Run("Unmarshal-Marshal", func(t *testing.T) { + var stdObj any + stdErr := std.Unmarshal(str, &stdObj) + + var lowObj any + lowErr := low.Unmarshal(str, &lowObj) + + assert.Equal(t, stdObj, lowObj) + assertEquivErr(t, stdErr, lowErr) + if t.Failed() { + return + } + + stdOut, stdErr := std.Marshal(stdObj) + lowOut, lowErr := low.Marshal(lowObj) + + assert.Equal(t, string(stdOut), string(lowOut)) + assertEquivErr(t, stdErr, lowErr) + }) + }) +} diff --git a/compat/json/testdata/fuzz/FuzzEquiv/0064ebc3507e959b b/compat/json/testdata/fuzz/FuzzEquiv/0064ebc3507e959b new file mode 100644 index 0000000..96e9e53 --- /dev/null +++ b/compat/json/testdata/fuzz/FuzzEquiv/0064ebc3507e959b @@ -0,0 +1,2 @@ +go test fuzz v1 +[]byte("๐ ") diff --git a/compat/json/testdata/fuzz/FuzzEquiv/19981bffc2abbaf1 b/compat/json/testdata/fuzz/FuzzEquiv/19981bffc2abbaf1 new file mode 100644 index 0000000..ecbe8af --- /dev/null +++ b/compat/json/testdata/fuzz/FuzzEquiv/19981bffc2abbaf1 @@ -0,0 +1,2 @@ +go test fuzz v1 +[]byte("A") diff --git a/compat/json/testdata/fuzz/FuzzEquiv/57365320c0968611 b/compat/json/testdata/fuzz/FuzzEquiv/57365320c0968611 new file mode 100644 index 0000000..5aace7f --- /dev/null +++ b/compat/json/testdata/fuzz/FuzzEquiv/57365320c0968611 @@ -0,0 +1,2 @@ +go test fuzz v1 +[]byte("[200000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000]") diff --git a/compat/json/testdata/fuzz/FuzzEquiv/5cd6893f25481dae b/compat/json/testdata/fuzz/FuzzEquiv/5cd6893f25481dae new file mode 100644 index 0000000..a51778b --- /dev/null +++ b/compat/json/testdata/fuzz/FuzzEquiv/5cd6893f25481dae @@ -0,0 +1,2 @@ +go test fuzz v1 +[]byte("0E00") diff --git a/compat/json/testdata/fuzz/FuzzEquiv/6a6612e05e0f9e32 b/compat/json/testdata/fuzz/FuzzEquiv/6a6612e05e0f9e32 new file mode 100644 index 0000000..fe2e128 --- /dev/null +++ b/compat/json/testdata/fuzz/FuzzEquiv/6a6612e05e0f9e32 @@ -0,0 +1,2 @@ +go test fuzz v1 +[]byte("\"\\uD800\"") diff --git a/compat/json/testdata/fuzz/FuzzEquiv/77e6e971d8684f84 b/compat/json/testdata/fuzz/FuzzEquiv/77e6e971d8684f84 new file mode 100644 index 0000000..e3c530f --- /dev/null +++ b/compat/json/testdata/fuzz/FuzzEquiv/77e6e971d8684f84 @@ -0,0 +1,2 @@ +go test fuzz v1 +[]byte("\uebae") diff --git a/compat/json/testdata/fuzz/FuzzEquiv/8727b16d337d7b81 b/compat/json/testdata/fuzz/FuzzEquiv/8727b16d337d7b81 new file mode 100644 index 0000000..e8000f3 --- /dev/null +++ b/compat/json/testdata/fuzz/FuzzEquiv/8727b16d337d7b81 @@ -0,0 +1,2 @@ +go test fuzz v1 +[]byte("00") diff --git a/compat/json/testdata/fuzz/FuzzEquiv/96aac43014471adc b/compat/json/testdata/fuzz/FuzzEquiv/96aac43014471adc new file mode 100644 index 0000000..9461c7a --- /dev/null +++ b/compat/json/testdata/fuzz/FuzzEquiv/96aac43014471adc @@ -0,0 +1,2 @@ +go test fuzz v1 +[]byte("\"\\") diff --git a/compat/json/testdata/fuzz/FuzzEquiv/9cc52906ed53ef5f b/compat/json/testdata/fuzz/FuzzEquiv/9cc52906ed53ef5f new file mode 100644 index 0000000..1edfb06 --- /dev/null +++ b/compat/json/testdata/fuzz/FuzzEquiv/9cc52906ed53ef5f @@ -0,0 +1,2 @@ +go test fuzz v1 +[]byte("\"") diff --git a/compat/json/testdata/fuzz/FuzzEquiv/a0b9ecf4e99fd85d b/compat/json/testdata/fuzz/FuzzEquiv/a0b9ecf4e99fd85d new file mode 100644 index 0000000..b3c523c --- /dev/null +++ b/compat/json/testdata/fuzz/FuzzEquiv/a0b9ecf4e99fd85d @@ -0,0 +1,2 @@ +go test fuzz v1 +[]byte("0.") diff --git a/compat/json/testdata/fuzz/FuzzEquiv/a5775dd298b90a6c b/compat/json/testdata/fuzz/FuzzEquiv/a5775dd298b90a6c new file mode 100644 index 0000000..ca6f6f5 --- /dev/null +++ b/compat/json/testdata/fuzz/FuzzEquiv/a5775dd298b90a6c @@ -0,0 +1,2 @@ +go test fuzz v1 +[]byte("\"\\u") diff --git a/compat/json/testdata/fuzz/FuzzEquiv/af9bedcb9e0a31e8 b/compat/json/testdata/fuzz/FuzzEquiv/af9bedcb9e0a31e8 new file mode 100644 index 0000000..778cc61 --- /dev/null +++ b/compat/json/testdata/fuzz/FuzzEquiv/af9bedcb9e0a31e8 @@ -0,0 +1,2 @@ +go test fuzz v1 +[]byte("0 ") diff --git a/compat/json/testdata/fuzz/FuzzEquiv/f6b0960dd3331a00 b/compat/json/testdata/fuzz/FuzzEquiv/f6b0960dd3331a00 new file mode 100644 index 0000000..9644b51 --- /dev/null +++ b/compat/json/testdata/fuzz/FuzzEquiv/f6b0960dd3331a00 @@ -0,0 +1,2 @@ +go test fuzz v1 +[]byte("\"0\x85\xcd\xc0\xf3\xcb\xc1\xb3\xf2\xf5\xa4\xc1\xd40\xba\xe9\"") diff --git a/compat/json/testdata/fuzz/FuzzEquiv/fbbce5ea61559cc6 b/compat/json/testdata/fuzz/FuzzEquiv/fbbce5ea61559cc6 new file mode 100644 index 0000000..712fab9 --- /dev/null +++ b/compat/json/testdata/fuzz/FuzzEquiv/fbbce5ea61559cc6 @@ -0,0 +1,2 @@ +go test fuzz v1 +[]byte("\U00054516") diff --git a/compat/json/testdata/fuzz/FuzzEquiv/fd29ccbb2af92d4f b/compat/json/testdata/fuzz/FuzzEquiv/fd29ccbb2af92d4f new file mode 100644 index 0000000..9dc2675 --- /dev/null +++ b/compat/json/testdata/fuzz/FuzzEquiv/fd29ccbb2af92d4f @@ -0,0 +1,2 @@ +go test fuzz v1 +[]byte("ว‘") -- cgit v1.2.3 From 051f966039028d257f27fc3a42c10cbff9f7c738 Mon Sep 17 00:00:00 2001 From: Luke Shumaker Date: Thu, 23 Feb 2023 21:30:12 -0700 Subject: decode: Include the invalid UTF-8 byte in error messages --- ReleaseNotes.md | 4 ++ compat/json/compat.go | 33 ++++++++++++++-- compat/json/compat_test.go | 46 ++++++++++++---------- .../json/testdata/fuzz/FuzzEquiv/9e35149f0eb0866b | 2 + decode_scan.go | 15 +++++++ 5 files changed, 75 insertions(+), 25 deletions(-) create mode 100644 compat/json/testdata/fuzz/FuzzEquiv/9e35149f0eb0866b diff --git a/ReleaseNotes.md b/ReleaseNotes.md index 48982e4..af2adcc 100644 --- a/ReleaseNotes.md +++ b/ReleaseNotes.md @@ -14,6 +14,10 @@ then the first type error encountered is returned. This is consistent with the behavior of `encoding/json`. + - Bugfix: Decoder: If there is a syntax error in a byte that + invalid UTF-8, include that byte value in the error message + rather than including the U+FFFD Unicode replacement character. + # v0.3.7 (2023-02-20) Theme: Fixes from fuzzing (part 1?) diff --git a/compat/json/compat.go b/compat/json/compat.go index c2d47c0..6f13fbb 100644 --- a/compat/json/compat.go +++ b/compat/json/compat.go @@ -329,7 +329,10 @@ func Unmarshal(data []byte, ptr any) error { } type teeRuneScanner struct { - src io.RuneScanner + src interface { + io.RuneScanner + io.ByteScanner + } dst *bytes.Buffer lastSize int } @@ -337,11 +340,14 @@ type teeRuneScanner struct { func (tee *teeRuneScanner) ReadRune() (r rune, size int, err error) { r, size, err = tee.src.ReadRune() if err == nil { - if _, err := tee.dst.WriteRune(r); err != nil { - return 0, 0, err + if r == utf8.RuneError && size == 1 { + _ = tee.src.UnreadRune() + b, _ := tee.src.ReadByte() + _ = tee.dst.WriteByte(b) + } else { + _, _ = tee.dst.WriteRune(r) } } - tee.lastSize = size return } @@ -356,6 +362,25 @@ func (tee *teeRuneScanner) UnreadRune() error { return nil } +func (tee *teeRuneScanner) ReadByte() (b byte, err error) { + b, err = tee.src.ReadByte() + if err == nil { + _ = tee.dst.WriteByte(b) + tee.lastSize = 1 + } + return +} + +func (tee *teeRuneScanner) UnreadByte() error { + if tee.lastSize != 1 { + return lowmemjson.ErrInvalidUnreadRune + } + _ = tee.src.UnreadByte() + tee.dst.Truncate(tee.dst.Len() - tee.lastSize) + tee.lastSize = 0 + return nil +} + type Decoder struct { validatorBuf *bufio.Reader validator *lowmemjson.Decoder diff --git a/compat/json/compat_test.go b/compat/json/compat_test.go index 098ac85..6aab103 100644 --- a/compat/json/compat_test.go +++ b/compat/json/compat_test.go @@ -72,13 +72,14 @@ func TestCompatCompact(t *testing.T) { Err string } testcases := map[string]testcase{ - "trunc": {In: `{`, Out: ``, Err: `unexpected end of JSON input`}, - "object": {In: `{}`, Out: `{}`}, - "non-utf8": {In: "\"\x85\xcd\"", Out: "\"\x85\xcd\""}, - "float": {In: `1.200e003`, Out: `1.200e003`}, - "hex-lower": {In: `"\uabcd"`, Out: `"\uabcd"`}, - "hex-upper": {In: `"\uABCD"`, Out: `"\uABCD"`}, - "hex-mixed": {In: `"\uAbCd"`, Out: `"\uAbCd"`}, + "trunc": {In: `{`, Out: ``, Err: `unexpected end of JSON input`}, + "object": {In: `{}`, Out: `{}`}, + "non-utf8": {In: "\"\x85\xcd\"", Out: "\"\x85\xcd\""}, + "float": {In: `1.200e003`, Out: `1.200e003`}, + "hex-lower": {In: `"\uabcd"`, Out: `"\uabcd"`}, + "hex-upper": {In: `"\uABCD"`, Out: `"\uABCD"`}, + "hex-mixed": {In: `"\uAbCd"`, Out: `"\uAbCd"`}, + "invalid-utf8": {In: "\x85", Err: `invalid character '\u0085' looking for beginning of value`}, } for tcName, tc := range testcases { tc := tc @@ -105,20 +106,21 @@ func TestCompatIndent(t *testing.T) { Err string } testcases := map[string]testcase{ - "trunc": {In: `{`, Out: ``, Err: `unexpected end of JSON input`}, - "object": {In: `{}`, Out: `{}`}, - "non-utf8": {In: "\"\x85\xcd\"", Out: "\"\x85\xcd\""}, - "float": {In: `1.200e003`, Out: `1.200e003`}, - "tailws0": {In: `0`, Out: `0`}, - "tailws1": {In: `0 `, Out: `0 `}, - "tailws2": {In: `0 `, Out: `0 `}, - "tailws3": {In: "0\n", Out: "0\n"}, - "headws1": {In: ` 0`, Out: `0`}, - "objws1": {In: `{"a" : 1}`, Out: "{\n>.\"a\": 1\n>}"}, - "objws2": {In: "{\"a\"\n:\n1}", Out: "{\n>.\"a\": 1\n>}"}, - "hex-lower": {In: `"\uabcd"`, Out: `"\uabcd"`}, - "hex-upper": {In: `"\uABCD"`, Out: `"\uABCD"`}, - "hex-mixed": {In: `"\uAbCd"`, Out: `"\uAbCd"`}, + "trunc": {In: `{`, Out: ``, Err: `unexpected end of JSON input`}, + "object": {In: `{}`, Out: `{}`}, + "non-utf8": {In: "\"\x85\xcd\"", Out: "\"\x85\xcd\""}, + "float": {In: `1.200e003`, Out: `1.200e003`}, + "tailws0": {In: `0`, Out: `0`}, + "tailws1": {In: `0 `, Out: `0 `}, + "tailws2": {In: `0 `, Out: `0 `}, + "tailws3": {In: "0\n", Out: "0\n"}, + "headws1": {In: ` 0`, Out: `0`}, + "objws1": {In: `{"a" : 1}`, Out: "{\n>.\"a\": 1\n>}"}, + "objws2": {In: "{\"a\"\n:\n1}", Out: "{\n>.\"a\": 1\n>}"}, + "hex-lower": {In: `"\uabcd"`, Out: `"\uabcd"`}, + "hex-upper": {In: `"\uABCD"`, Out: `"\uABCD"`}, + "hex-mixed": {In: `"\uAbCd"`, Out: `"\uAbCd"`}, + "invalid-utf8": {In: "\x85", Err: `invalid character '\u0085' looking for beginning of value`}, } for tcName, tc := range testcases { tc := tc @@ -181,6 +183,7 @@ func TestCompatUnmarshal(t *testing.T) { "two-objs": {In: `{} {}`, ExpOut: nil, ExpErr: `invalid character '{' after top-level value`}, "two-numbers1": {In: `00`, ExpOut: nil, ExpErr: `invalid character '0' after top-level value`}, "two-numbers2": {In: `1 2`, ExpOut: nil, ExpErr: `invalid character '2' after top-level value`}, + "invalid-utf8": {In: "\x85", ExpErr: `invalid character '\u0085' looking for beginning of value`}, // 2e308 is slightly more than math.MaxFloat64 (~1.79e308) "obj-overflow": {In: `{"foo":"bar", "baz":2e308, "qux": "orb"}`, ExpOut: map[string]any{"foo": "bar", "baz": nil, "qux": "orb"}, ExpErr: `json: cannot unmarshal number 2e308 into Go value of type float64`}, "ary-overflow": {In: `["foo",2e308,"bar",3e308]`, ExpOut: []any{"foo", nil, "bar", nil}, ExpErr: `json: cannot unmarshal number 2e308 into Go value of type float64`}, @@ -223,6 +226,7 @@ func TestCompatDecode(t *testing.T) { "two-objs": {In: `{} {}`, ExpOut: map[string]any{}}, "two-numbers1": {In: `00`, ExpOut: float64(0)}, "two-numbers2": {In: `1 2`, ExpOut: float64(1)}, + "invalid-utf8": {In: "\x85", ExpErr: `invalid character '\u0085' looking for beginning of value`}, // 2e308 is slightly more than math.MaxFloat64 (~1.79e308) "obj-overflow": {In: `{"foo":"bar", "baz":2e308, "qux": "orb"}`, ExpOut: map[string]any{"foo": "bar", "baz": nil, "qux": "orb"}, ExpErr: `json: cannot unmarshal number 2e308 into Go value of type float64`}, "ary-overflow": {In: `["foo",2e308,"bar",3e308]`, ExpOut: []any{"foo", nil, "bar", nil}, ExpErr: `json: cannot unmarshal number 2e308 into Go value of type float64`}, diff --git a/compat/json/testdata/fuzz/FuzzEquiv/9e35149f0eb0866b b/compat/json/testdata/fuzz/FuzzEquiv/9e35149f0eb0866b new file mode 100644 index 0000000..bb8752b --- /dev/null +++ b/compat/json/testdata/fuzz/FuzzEquiv/9e35149f0eb0866b @@ -0,0 +1,2 @@ +go test fuzz v1 +[]byte("\x85") diff --git a/decode_scan.go b/decode_scan.go index 63694c4..940de49 100644 --- a/decode_scan.go +++ b/decode_scan.go @@ -6,6 +6,7 @@ package lowmemjson import ( "io" + "unicode/utf8" "git.lukeshu.com/go/lowmemjson/internal/jsonparse" ) @@ -55,6 +56,17 @@ func (sc *runeTypeScanner) ReadRuneType() (rune, int, jsonparse.RuneType, error) sc.offset += int64(sc.rSize) switch err { case nil: + invalidUTF8 := false + if sc.rRune == utf8.RuneError && sc.rSize == 1 { + if bs, ok := sc.inner.(io.ByteScanner); ok { + _ = bs.UnreadByte() // UnreadRune doesn't back up the ReadByte-pos + b, _ := bs.ReadByte() + _ = bs.UnreadByte() + _, _, _ = sc.inner.ReadRune() + sc.rRune = rune(b) + invalidUTF8 = true + } + } sc.rType, err = sc.parser.HandleRune(sc.rRune) if err != nil { sc.rErr = &DecodeSyntaxError{ @@ -62,6 +74,9 @@ func (sc *runeTypeScanner) ReadRuneType() (rune, int, jsonparse.RuneType, error) Err: err, } } else { + if invalidUTF8 { + sc.rRune = utf8.RuneError + } sc.rErr = nil } switch sc.rType { -- cgit v1.2.3 From e38edfa53173c054ff97a5c51f90df0da60f16f5 Mon Sep 17 00:00:00 2001 From: Luke Shumaker Date: Tue, 14 Feb 2023 18:55:57 -0700 Subject: jsonparse: Reword error messages to match encoding/json --- ReleaseNotes.md | 3 +++ compat/json/compat_test.go | 18 ++++++++++++++++++ compat/json/testdata/fuzz/FuzzEquiv/1071d2f6e5b5f7d3 | 2 ++ compat/json/testdata/fuzz/FuzzEquiv/6bced2300496f15c | 2 ++ compat/json/testdata/fuzz/FuzzEquiv/6daf246742074967 | 2 ++ compat/json/testdata/fuzz/FuzzEquiv/7c3168c77fc059cb | 2 ++ compat/json/testdata/fuzz/FuzzEquiv/a955c588d78b5c3a | 2 ++ compat/json/testdata/fuzz/FuzzEquiv/cf667c6f1f3282c1 | 2 ++ compat/json/testdata/fuzz/FuzzEquiv/ef2c8755a89034da | 2 ++ internal/jsonparse/parse.go | 20 ++++++++++---------- methods_test.go | 4 ++-- 11 files changed, 47 insertions(+), 12 deletions(-) create mode 100644 compat/json/testdata/fuzz/FuzzEquiv/1071d2f6e5b5f7d3 create mode 100644 compat/json/testdata/fuzz/FuzzEquiv/6bced2300496f15c create mode 100644 compat/json/testdata/fuzz/FuzzEquiv/6daf246742074967 create mode 100644 compat/json/testdata/fuzz/FuzzEquiv/7c3168c77fc059cb create mode 100644 compat/json/testdata/fuzz/FuzzEquiv/a955c588d78b5c3a create mode 100644 compat/json/testdata/fuzz/FuzzEquiv/cf667c6f1f3282c1 create mode 100644 compat/json/testdata/fuzz/FuzzEquiv/ef2c8755a89034da diff --git a/ReleaseNotes.md b/ReleaseNotes.md index af2adcc..1981678 100644 --- a/ReleaseNotes.md +++ b/ReleaseNotes.md @@ -14,6 +14,9 @@ then the first type error encountered is returned. This is consistent with the behavior of `encoding/json`. + - Change: Several error strings have been reworded to match + `encoding/json`. + - Bugfix: Decoder: If there is a syntax error in a byte that invalid UTF-8, include that byte value in the error message rather than including the U+FFFD Unicode replacement character. diff --git a/compat/json/compat_test.go b/compat/json/compat_test.go index 6aab103..cf9e359 100644 --- a/compat/json/compat_test.go +++ b/compat/json/compat_test.go @@ -188,6 +188,24 @@ func TestCompatUnmarshal(t *testing.T) { "obj-overflow": {In: `{"foo":"bar", "baz":2e308, "qux": "orb"}`, ExpOut: map[string]any{"foo": "bar", "baz": nil, "qux": "orb"}, ExpErr: `json: cannot unmarshal number 2e308 into Go value of type float64`}, "ary-overflow": {In: `["foo",2e308,"bar",3e308]`, ExpOut: []any{"foo", nil, "bar", nil}, ExpErr: `json: cannot unmarshal number 2e308 into Go value of type float64`}, "existing-overflow": {In: `2e308`, InPtr: func() any { x := 4; return &x }(), ExpOut: 4, ExpErr: `json: cannot unmarshal number 2e308 into Go value of type int`}, + // syntax error messages + "syntax-01": {In: `{}x`, ExpErr: `invalid character 'x' after top-level value`}, + "syntax-02": {In: `x`, ExpErr: `invalid character 'x' looking for beginning of value`}, + "syntax-03": {In: `{x`, ExpErr: `invalid character 'x' looking for beginning of object key string`}, + "syntax-04": {In: `{""x`, ExpErr: `invalid character 'x' after object key`}, + "syntax-05": {In: `{"":0x`, ExpErr: `invalid character 'x' after object key:value pair`}, + "syntax-06": {In: `[0x`, ExpErr: `invalid character 'x' after array element`}, + "syntax-07": {In: "\"\x01\"", ExpErr: `invalid character '\x01' in string literal`}, + "syntax-08": {In: `"\x`, ExpErr: `invalid character 'x' in string escape code`}, + "syntax-09": {In: `"\ux`, ExpErr: `invalid character 'x' in \u hexadecimal character escape`}, + "syntax-10": {In: `"\u0x`, ExpErr: `invalid character 'x' in \u hexadecimal character escape`}, + "syntax-11": {In: `"\u00x`, ExpErr: `invalid character 'x' in \u hexadecimal character escape`}, + "syntax-12": {In: `"\u000x`, ExpErr: `invalid character 'x' in \u hexadecimal character escape`}, + "syntax-13": {In: `-x`, ExpErr: `invalid character 'x' in numeric literal`}, + "syntax-14": {In: `0.x`, ExpErr: `invalid character 'x' after decimal point in numeric literal`}, + "syntax-15": {In: `1ex`, ExpErr: `invalid character 'x' in exponent of numeric literal`}, + "syntax-16": {In: `1e+x`, ExpErr: `invalid character 'x' in exponent of numeric literal`}, + "syntax-17": {In: `fx`, ExpErr: `invalid character 'x' in literal false (expecting 'a')`}, } for tcName, tc := range testcases { tc := tc diff --git a/compat/json/testdata/fuzz/FuzzEquiv/1071d2f6e5b5f7d3 b/compat/json/testdata/fuzz/FuzzEquiv/1071d2f6e5b5f7d3 new file mode 100644 index 0000000..1095817 --- /dev/null +++ b/compat/json/testdata/fuzz/FuzzEquiv/1071d2f6e5b5f7d3 @@ -0,0 +1,2 @@ +go test fuzz v1 +[]byte("0EA") diff --git a/compat/json/testdata/fuzz/FuzzEquiv/6bced2300496f15c b/compat/json/testdata/fuzz/FuzzEquiv/6bced2300496f15c new file mode 100644 index 0000000..4bc9c61 --- /dev/null +++ b/compat/json/testdata/fuzz/FuzzEquiv/6bced2300496f15c @@ -0,0 +1,2 @@ +go test fuzz v1 +[]byte("{0") diff --git a/compat/json/testdata/fuzz/FuzzEquiv/6daf246742074967 b/compat/json/testdata/fuzz/FuzzEquiv/6daf246742074967 new file mode 100644 index 0000000..b1c3453 --- /dev/null +++ b/compat/json/testdata/fuzz/FuzzEquiv/6daf246742074967 @@ -0,0 +1,2 @@ +go test fuzz v1 +[]byte("\"\\uX") diff --git a/compat/json/testdata/fuzz/FuzzEquiv/7c3168c77fc059cb b/compat/json/testdata/fuzz/FuzzEquiv/7c3168c77fc059cb new file mode 100644 index 0000000..b95f079 --- /dev/null +++ b/compat/json/testdata/fuzz/FuzzEquiv/7c3168c77fc059cb @@ -0,0 +1,2 @@ +go test fuzz v1 +[]byte("\"\x1e") diff --git a/compat/json/testdata/fuzz/FuzzEquiv/a955c588d78b5c3a b/compat/json/testdata/fuzz/FuzzEquiv/a955c588d78b5c3a new file mode 100644 index 0000000..b135daa --- /dev/null +++ b/compat/json/testdata/fuzz/FuzzEquiv/a955c588d78b5c3a @@ -0,0 +1,2 @@ +go test fuzz v1 +[]byte("0.A") diff --git a/compat/json/testdata/fuzz/FuzzEquiv/cf667c6f1f3282c1 b/compat/json/testdata/fuzz/FuzzEquiv/cf667c6f1f3282c1 new file mode 100644 index 0000000..f6ab571 --- /dev/null +++ b/compat/json/testdata/fuzz/FuzzEquiv/cf667c6f1f3282c1 @@ -0,0 +1,2 @@ +go test fuzz v1 +[]byte("\"\\0") diff --git a/compat/json/testdata/fuzz/FuzzEquiv/ef2c8755a89034da b/compat/json/testdata/fuzz/FuzzEquiv/ef2c8755a89034da new file mode 100644 index 0000000..7d9478d --- /dev/null +++ b/compat/json/testdata/fuzz/FuzzEquiv/ef2c8755a89034da @@ -0,0 +1,2 @@ +go test fuzz v1 +[]byte("0E+A") diff --git a/internal/jsonparse/parse.go b/internal/jsonparse/parse.go index 6432d75..c641f1b 100644 --- a/internal/jsonparse/parse.go +++ b/internal/jsonparse/parse.go @@ -619,7 +619,7 @@ func (par *Parser) HandleRune(c rune) (RuneType, error) { par.popState() return RuneTypeObjectEnd, nil default: - return RuneTypeError, fmt.Errorf("object: unexpected character: %q", c) + return RuneTypeError, fmt.Errorf("invalid character %q looking for beginning of object key string", c) } case RuneTypeStringEnd: // waiting for ':' switch c { @@ -682,7 +682,7 @@ func (par *Parser) HandleRune(c rune) (RuneType, error) { case 0x0020 <= c && c <= 0x10FFFF: return RuneTypeStringChar, nil default: - return RuneTypeError, fmt.Errorf("string: unexpected character: %q", c) + return RuneTypeError, fmt.Errorf("invalid character %q in string literal", c) } case RuneTypeStringEsc: // waiting for escape char switch c { @@ -692,26 +692,26 @@ func (par *Parser) HandleRune(c rune) (RuneType, error) { case 'u': return par.replaceState(RuneTypeStringEscU), nil default: - return RuneTypeError, fmt.Errorf("string backslash sequence: unexpected character: %q", c) + return RuneTypeError, fmt.Errorf("invalid character %q in string escape code", c) } case RuneTypeStringEscU: if !isHex(c) { - return RuneTypeError, fmt.Errorf("string unicode sequence: unexpected character: %q", c) + return RuneTypeError, fmt.Errorf(`invalid character %q in \u hexadecimal character escape`, c) } return par.replaceState(RuneTypeStringEscUA), nil case RuneTypeStringEscUA: if !isHex(c) { - return RuneTypeError, fmt.Errorf("string unicode sequence: unexpected character: %q", c) + return RuneTypeError, fmt.Errorf(`invalid character %q in \u hexadecimal character escape`, c) } return par.replaceState(RuneTypeStringEscUB), nil case RuneTypeStringEscUB: if !isHex(c) { - return RuneTypeError, fmt.Errorf("string unicode sequence: unexpected character: %q", c) + return RuneTypeError, fmt.Errorf(`invalid character %q in \u hexadecimal character escape`, c) } return par.replaceState(RuneTypeStringEscUC), nil case RuneTypeStringEscUC: if !isHex(c) { - return RuneTypeError, fmt.Errorf("string unicode sequence: unexpected character: %q", c) + return RuneTypeError, fmt.Errorf(`invalid character %q in \u hexadecimal character escape`, c) } par.replaceState(RuneTypeStringBeg) return RuneTypeStringEscUD, nil @@ -791,7 +791,7 @@ func (par *Parser) HandleRune(c rune) (RuneType, error) { case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': return par.replaceState(RuneTypeNumberFracDig), nil default: - return RuneTypeError, fmt.Errorf("invalid character %q in numeric literal", c) + return RuneTypeError, fmt.Errorf("invalid character %q after decimal point in numeric literal", c) } case RuneTypeNumberFracDig: // F switch c { @@ -810,14 +810,14 @@ func (par *Parser) HandleRune(c rune) (RuneType, error) { case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': return par.replaceState(RuneTypeNumberExpDig), nil default: - return RuneTypeError, fmt.Errorf("invalid character %q in numeric literal", c) + return RuneTypeError, fmt.Errorf("invalid character %q in exponent of numeric literal", c) } case RuneTypeNumberExpSign: // H switch c { case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': return par.replaceState(RuneTypeNumberExpDig), nil default: - return RuneTypeError, fmt.Errorf("invalid character %q in numeric literal", c) + return RuneTypeError, fmt.Errorf("invalid character %q in exponent of numeric literal", c) } case RuneTypeNumberExpDig: // I switch c { diff --git a/methods_test.go b/methods_test.go index 64af28c..23a6c4f 100644 --- a/methods_test.go +++ b/methods_test.go @@ -304,8 +304,8 @@ func TestMethodsDecode(t *testing.T) { testcases := map[string]testcase{ "decode-basic": {In: `{}`, Obj: &tstDecoder{n: 2}}, "decode-basic-eof": {In: `{}`, Obj: &tstDecoder{n: 5}}, - "decode-syntax-error": {In: `{x}`, Obj: &tstDecoder{n: 5}, ExpectedErr: `json: v: syntax error at input byte 1: object: unexpected character: 'x'`}, - "unmarshal-syntax-error": {In: `{x}`, Obj: &strUnmarshaler{}, ExpectedErr: `json: v: syntax error at input byte 1: object: unexpected character: 'x'`}, + "decode-syntax-error": {In: `{x}`, Obj: &tstDecoder{n: 5}, ExpectedErr: `json: v: syntax error at input byte 1: invalid character 'x' looking for beginning of object key string`}, + "unmarshal-syntax-error": {In: `{x}`, Obj: &strUnmarshaler{}, ExpectedErr: `json: v: syntax error at input byte 1: invalid character 'x' looking for beginning of object key string`}, "decode-short": {In: `{}`, Obj: &tstDecoder{n: 1}, ExpectedErr: `json: v: cannot decode JSON object at input byte 0 into Go *lowmemjson_test.tstDecoder: did not consume entire object`}, "decode-err": {In: `{}`, Obj: &tstDecoder{err: "xxx"}, ExpectedErr: `json: v: cannot decode JSON object at input byte 0 into Go *lowmemjson_test.tstDecoder: xxx`}, "decode-err2": {In: `{}`, Obj: &tstDecoder{n: 1, err: "yyy"}, ExpectedErr: `json: v: cannot decode JSON object at input byte 0 into Go *lowmemjson_test.tstDecoder: yyy`}, -- cgit v1.2.3 From 7301cd6c4e97b272bd708d4c87d26609510e6ca7 Mon Sep 17 00:00:00 2001 From: Luke Shumaker Date: Sat, 25 Feb 2023 17:12:39 -0700 Subject: jsonparse: Define an InvalidCharacterError type instead of using fmt.Errorf --- internal/jsonparse/parse.go | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/internal/jsonparse/parse.go b/internal/jsonparse/parse.go index c641f1b..214e3ba 100644 --- a/internal/jsonparse/parse.go +++ b/internal/jsonparse/parse.go @@ -14,6 +14,15 @@ import ( var ErrParserExceededMaxDepth = errors.New("exceeded max depth") +type InvalidCharacterError struct { + Char rune + Where string +} + +func (e *InvalidCharacterError) Error() string { + return fmt.Sprintf("invalid character %q %s", e.Char, e.Where) +} + func isHex(c rune) bool { return ('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || @@ -571,7 +580,7 @@ func (par *Parser) HandleRune(c rune) (RuneType, error) { if len(par.barriers) > 0 { return RuneTypeEOF, nil } else { - return RuneTypeError, fmt.Errorf("invalid character %q after top-level value", c) + return RuneTypeError, &InvalidCharacterError{c, "after top-level value"} } } switch par.stack[len(par.stack)-1] { @@ -605,7 +614,7 @@ func (par *Parser) HandleRune(c rune) (RuneType, error) { case 'n': return par.replaceState(RuneTypeNullN), nil default: - return RuneTypeError, fmt.Errorf("invalid character %q looking for beginning of value", c) + return RuneTypeError, &InvalidCharacterError{c, "looking for beginning of value"} } // object ////////////////////////////////////////////////////////////////////////////////// case RuneTypeObjectBeg: // waiting for key to start or '}' @@ -619,7 +628,7 @@ func (par *Parser) HandleRune(c rune) (RuneType, error) { par.popState() return RuneTypeObjectEnd, nil default: - return RuneTypeError, fmt.Errorf("invalid character %q looking for beginning of object key string", c) + return RuneTypeError, &InvalidCharacterError{c, "looking for beginning of object key string"} } case RuneTypeStringEnd: // waiting for ':' switch c { @@ -630,7 +639,7 @@ func (par *Parser) HandleRune(c rune) (RuneType, error) { par.pushState(runeTypeAny) return RuneTypeObjectColon, nil default: - return RuneTypeError, fmt.Errorf("invalid character %q after object key", c) + return RuneTypeError, &InvalidCharacterError{c, "after object key"} } case RuneTypeObjectComma: // waiting for ',' or '}' switch c { @@ -643,7 +652,7 @@ func (par *Parser) HandleRune(c rune) (RuneType, error) { par.popState() return RuneTypeObjectEnd, nil default: - return RuneTypeError, fmt.Errorf("invalid character %q after object key:value pair", c) + return RuneTypeError, &InvalidCharacterError{c, "after object key:value pair"} } // array /////////////////////////////////////////////////////////////////////////////////// case RuneTypeArrayBeg: // waiting for item to start or ']' @@ -669,7 +678,7 @@ func (par *Parser) HandleRune(c rune) (RuneType, error) { par.popState() return RuneTypeArrayEnd, nil default: - return RuneTypeError, fmt.Errorf("invalid character %q after array element", c) + return RuneTypeError, &InvalidCharacterError{c, "after array element"} } // string ////////////////////////////////////////////////////////////////////////////////// case RuneTypeStringBeg: // waiting for char or '"' @@ -682,7 +691,7 @@ func (par *Parser) HandleRune(c rune) (RuneType, error) { case 0x0020 <= c && c <= 0x10FFFF: return RuneTypeStringChar, nil default: - return RuneTypeError, fmt.Errorf("invalid character %q in string literal", c) + return RuneTypeError, &InvalidCharacterError{c, "in string literal"} } case RuneTypeStringEsc: // waiting for escape char switch c { @@ -692,7 +701,7 @@ func (par *Parser) HandleRune(c rune) (RuneType, error) { case 'u': return par.replaceState(RuneTypeStringEscU), nil default: - return RuneTypeError, fmt.Errorf("invalid character %q in string escape code", c) + return RuneTypeError, &InvalidCharacterError{c, "in string escape code"} } case RuneTypeStringEscU: if !isHex(c) { @@ -762,7 +771,7 @@ func (par *Parser) HandleRune(c rune) (RuneType, error) { case '1', '2', '3', '4', '5', '6', '7', '8', '9': return par.replaceState(RuneTypeNumberIntDig), nil default: - return RuneTypeError, fmt.Errorf("invalid character %q in numeric literal", c) + return RuneTypeError, &InvalidCharacterError{c, "in numeric literal"} } case RuneTypeNumberIntZero: // C switch c { @@ -791,7 +800,7 @@ func (par *Parser) HandleRune(c rune) (RuneType, error) { case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': return par.replaceState(RuneTypeNumberFracDig), nil default: - return RuneTypeError, fmt.Errorf("invalid character %q after decimal point in numeric literal", c) + return RuneTypeError, &InvalidCharacterError{c, "after decimal point in numeric literal"} } case RuneTypeNumberFracDig: // F switch c { @@ -810,14 +819,14 @@ func (par *Parser) HandleRune(c rune) (RuneType, error) { case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': return par.replaceState(RuneTypeNumberExpDig), nil default: - return RuneTypeError, fmt.Errorf("invalid character %q in exponent of numeric literal", c) + return RuneTypeError, &InvalidCharacterError{c, "in exponent of numeric literal"} } case RuneTypeNumberExpSign: // H switch c { case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': return par.replaceState(RuneTypeNumberExpDig), nil default: - return RuneTypeError, fmt.Errorf("invalid character %q in exponent of numeric literal", c) + return RuneTypeError, &InvalidCharacterError{c, "in exponent of numeric literal"} } case RuneTypeNumberExpDig: // I switch c { @@ -858,7 +867,7 @@ func (par *Parser) HandleRune(c rune) (RuneType, error) { func (par *Parser) expectRune(c, exp rune, typ RuneType, context string, pop bool) (RuneType, error) { if c != exp { - return RuneTypeError, fmt.Errorf("invalid character %q in literal %s (expecting %q)", c, context, exp) + return RuneTypeError, &InvalidCharacterError{c, fmt.Sprintf("in literal %s (expecting %q)", context, exp)} } if pop { par.popState() -- cgit v1.2.3 From f68498a6fdb421483d9aebb45527452f6255bb68 Mon Sep 17 00:00:00 2001 From: Luke Shumaker Date: Sat, 25 Feb 2023 16:17:01 -0700 Subject: jsonparse: Don't show raw bytes as Unicode --- ReleaseNotes.md | 4 +++ compat/json/compat_test.go | 8 ++--- compat/json/equiv_test.go | 29 +++++++++++++++- compat/json/testcompat_test.go | 2 +- decode_scan.go | 23 +++++++------ internal/jsonparse/parse.go | 73 +++++++++++++++++++++------------------- internal/jsonparse/parse_test.go | 2 +- reencode.go | 2 +- 8 files changed, 90 insertions(+), 53 deletions(-) diff --git a/ReleaseNotes.md b/ReleaseNotes.md index 1981678..e047f7d 100644 --- a/ReleaseNotes.md +++ b/ReleaseNotes.md @@ -21,6 +21,10 @@ invalid UTF-8, include that byte value in the error message rather than including the U+FFFD Unicode replacement character. + - Bugfix: Syntax errors on raw-bytes (for invalid UTF-8) no longer + show the raw byte as a `\u00XX` Unicode codepoint, but now as a + `\xXX` byte. + # v0.3.7 (2023-02-20) Theme: Fixes from fuzzing (part 1?) diff --git a/compat/json/compat_test.go b/compat/json/compat_test.go index cf9e359..2380c53 100644 --- a/compat/json/compat_test.go +++ b/compat/json/compat_test.go @@ -79,7 +79,7 @@ func TestCompatCompact(t *testing.T) { "hex-lower": {In: `"\uabcd"`, Out: `"\uabcd"`}, "hex-upper": {In: `"\uABCD"`, Out: `"\uABCD"`}, "hex-mixed": {In: `"\uAbCd"`, Out: `"\uAbCd"`}, - "invalid-utf8": {In: "\x85", Err: `invalid character '\u0085' looking for beginning of value`}, + "invalid-utf8": {In: "\x85", Err: `invalid character '\x85' looking for beginning of value`}, } for tcName, tc := range testcases { tc := tc @@ -120,7 +120,7 @@ func TestCompatIndent(t *testing.T) { "hex-lower": {In: `"\uabcd"`, Out: `"\uabcd"`}, "hex-upper": {In: `"\uABCD"`, Out: `"\uABCD"`}, "hex-mixed": {In: `"\uAbCd"`, Out: `"\uAbCd"`}, - "invalid-utf8": {In: "\x85", Err: `invalid character '\u0085' looking for beginning of value`}, + "invalid-utf8": {In: "\x85", Err: `invalid character '\x85' looking for beginning of value`}, } for tcName, tc := range testcases { tc := tc @@ -183,7 +183,7 @@ func TestCompatUnmarshal(t *testing.T) { "two-objs": {In: `{} {}`, ExpOut: nil, ExpErr: `invalid character '{' after top-level value`}, "two-numbers1": {In: `00`, ExpOut: nil, ExpErr: `invalid character '0' after top-level value`}, "two-numbers2": {In: `1 2`, ExpOut: nil, ExpErr: `invalid character '2' after top-level value`}, - "invalid-utf8": {In: "\x85", ExpErr: `invalid character '\u0085' looking for beginning of value`}, + "invalid-utf8": {In: "\x85", ExpErr: `invalid character '\x85' looking for beginning of value`}, // 2e308 is slightly more than math.MaxFloat64 (~1.79e308) "obj-overflow": {In: `{"foo":"bar", "baz":2e308, "qux": "orb"}`, ExpOut: map[string]any{"foo": "bar", "baz": nil, "qux": "orb"}, ExpErr: `json: cannot unmarshal number 2e308 into Go value of type float64`}, "ary-overflow": {In: `["foo",2e308,"bar",3e308]`, ExpOut: []any{"foo", nil, "bar", nil}, ExpErr: `json: cannot unmarshal number 2e308 into Go value of type float64`}, @@ -244,7 +244,7 @@ func TestCompatDecode(t *testing.T) { "two-objs": {In: `{} {}`, ExpOut: map[string]any{}}, "two-numbers1": {In: `00`, ExpOut: float64(0)}, "two-numbers2": {In: `1 2`, ExpOut: float64(1)}, - "invalid-utf8": {In: "\x85", ExpErr: `invalid character '\u0085' looking for beginning of value`}, + "invalid-utf8": {In: "\x85", ExpErr: `invalid character '\x85' looking for beginning of value`}, // 2e308 is slightly more than math.MaxFloat64 (~1.79e308) "obj-overflow": {In: `{"foo":"bar", "baz":2e308, "qux": "orb"}`, ExpOut: map[string]any{"foo": "bar", "baz": nil, "qux": "orb"}, ExpErr: `json: cannot unmarshal number 2e308 into Go value of type float64`}, "ary-overflow": {In: `["foo",2e308,"bar",3e308]`, ExpOut: []any{"foo", nil, "bar", nil}, ExpErr: `json: cannot unmarshal number 2e308 into Go value of type float64`}, diff --git a/compat/json/equiv_test.go b/compat/json/equiv_test.go index 246e4b3..cb02f43 100644 --- a/compat/json/equiv_test.go +++ b/compat/json/equiv_test.go @@ -44,8 +44,27 @@ func assertEquivErr(t *testing.T, stdErr, lowErr error) { lowByte := lowMsg[len(prefix)] if lowByte == '\\' { switch lowMsg[len(prefix)+1] { + case 'a': + lowByte = '\a' + case 'b': + lowByte = '\b' + case 'f': + lowByte = '\f' + case 'n': + lowByte = '\n' + case 'r': + lowByte = '\r' + case 't': + lowByte = '\t' + case 'v': + lowByte = '\v' + case '\\', '\'': + lowByte = lowMsg[len(prefix)+1] + case 'x': + lowByte64, _ := strconv.ParseUint(lowMsg[len(prefix)+2:][:2], 16, 8) + lowByte = byte(lowByte64) case 'u': - lowRune, _ := strconv.ParseUint(lowMsg[len(prefix)+2:][:4], 16, 32) + lowRune, _ := strconv.ParseUint(lowMsg[len(prefix)+2:][:4], 16, 16) var buf [4]byte utf8.EncodeRune(buf[:], rune(lowRune)) lowByte = buf[0] @@ -63,6 +82,14 @@ func assertEquivErr(t *testing.T, stdErr, lowErr error) { stdErr = errors.New(stdMsg) } } + + // I'd file a ticket for this, but @dsnet (one of the encoding/json maintainers) says that he's + // working on a parser-rewrite that would fix a bunch of this type of issue. + // https://github.com/golang/go/issues/58680#issuecomment-1444224084 + if strings.HasPrefix(stdMsg, `invalid character '\u00`) && strings.HasPrefix(lowMsg, `invalid character '\x`) { + stdMsg = `invalid character '\x` + strings.TrimPrefix(stdMsg, `invalid character '\u00`) + stdErr = errors.New(stdMsg) + } } // Text-equal. assert.Equal(t, stdErr.Error(), lowErr.Error()) diff --git a/compat/json/testcompat_test.go b/compat/json/testcompat_test.go index 73153d9..affcd7c 100644 --- a/compat/json/testcompat_test.go +++ b/compat/json/testcompat_test.go @@ -32,7 +32,7 @@ func checkValid(in []byte, scan *lowmemjson.ReEncoderConfig) error { func isValidNumber(s string) bool { var parser jsonparse.Parser for _, r := range s { - if t, _ := parser.HandleRune(r); !t.IsNumber() { + if t, _ := parser.HandleRune(r, true); !t.IsNumber() { return false } } diff --git a/decode_scan.go b/decode_scan.go index 940de49..7ef3e71 100644 --- a/decode_scan.go +++ b/decode_scan.go @@ -23,10 +23,11 @@ type runeTypeScanner struct { rTypeOK bool repeat bool - rRune rune - rSize int - rType jsonparse.RuneType - rErr error + rRune rune + rSize int + rIsRune bool + rType jsonparse.RuneType + rErr error } // The returned error is a *ReadError, a *SyntaxError, or nil. @@ -56,7 +57,7 @@ func (sc *runeTypeScanner) ReadRuneType() (rune, int, jsonparse.RuneType, error) sc.offset += int64(sc.rSize) switch err { case nil: - invalidUTF8 := false + sc.rIsRune = true if sc.rRune == utf8.RuneError && sc.rSize == 1 { if bs, ok := sc.inner.(io.ByteScanner); ok { _ = bs.UnreadByte() // UnreadRune doesn't back up the ReadByte-pos @@ -64,19 +65,16 @@ func (sc *runeTypeScanner) ReadRuneType() (rune, int, jsonparse.RuneType, error) _ = bs.UnreadByte() _, _, _ = sc.inner.ReadRune() sc.rRune = rune(b) - invalidUTF8 = true + sc.rIsRune = false } } - sc.rType, err = sc.parser.HandleRune(sc.rRune) + sc.rType, err = sc.parser.HandleRune(sc.rRune, sc.rIsRune) if err != nil { sc.rErr = &DecodeSyntaxError{ Offset: sc.offset - int64(sc.rSize), Err: err, } } else { - if invalidUTF8 { - sc.rRune = utf8.RuneError - } sc.rErr = nil } switch sc.rType { @@ -107,6 +105,9 @@ func (sc *runeTypeScanner) ReadRuneType() (rune, int, jsonparse.RuneType, error) } } sc.repeat = false + if sc.rSize > 0 && !sc.rIsRune { + return utf8.RuneError, sc.rSize, sc.rType, sc.rErr + } return sc.rRune, sc.rSize, sc.rType, sc.rErr } @@ -139,7 +140,7 @@ func (sc *runeTypeScanner) PopReadBarrier() { case sc.repeat: // re-figure the rType and rErr var err error - sc.rType, err = sc.parser.HandleRune(sc.rRune) + sc.rType, err = sc.parser.HandleRune(sc.rRune, sc.rIsRune) if err != nil { sc.rErr = &DecodeSyntaxError{ Offset: sc.offset - int64(sc.rSize), diff --git a/internal/jsonparse/parse.go b/internal/jsonparse/parse.go index 214e3ba..5547df4 100644 --- a/internal/jsonparse/parse.go +++ b/internal/jsonparse/parse.go @@ -15,12 +15,17 @@ import ( var ErrParserExceededMaxDepth = errors.New("exceeded max depth") type InvalidCharacterError struct { - Char rune - Where string + Char rune + IsRune bool + Where string } func (e *InvalidCharacterError) Error() string { - return fmt.Sprintf("invalid character %q %s", e.Char, e.Where) + if e.IsRune { + return fmt.Sprintf("invalid character %q %s", e.Char, e.Where) + } else { + return fmt.Sprintf("invalid character '\\x%02x' %s", e.Char, e.Where) + } } func isHex(c rune) bool { @@ -520,7 +525,7 @@ func (par *Parser) HandleEOF() (RuneType, error) { case 1: switch { case par.stack[0].IsNumber(): - if _, err := par.HandleRune('\n'); err == nil { + if _, err := par.HandleRune('\n', true); err == nil { return RuneTypeEOF, nil } case par.stack[0] == runeTypeAny: @@ -562,7 +567,7 @@ func (par *Parser) IsAtBarrier() bool { // RuneTypeEOF indicates that the rune cannot be appended to the JSON // document; a new JSON document must be started in order to process // that rune. -func (par *Parser) HandleRune(c rune) (RuneType, error) { +func (par *Parser) HandleRune(c rune, isRune bool) (RuneType, error) { if par.closed { return RuneTypeError, iofs.ErrClosed } @@ -580,7 +585,7 @@ func (par *Parser) HandleRune(c rune) (RuneType, error) { if len(par.barriers) > 0 { return RuneTypeEOF, nil } else { - return RuneTypeError, &InvalidCharacterError{c, "after top-level value"} + return RuneTypeError, &InvalidCharacterError{c, isRune, "after top-level value"} } } switch par.stack[len(par.stack)-1] { @@ -614,7 +619,7 @@ func (par *Parser) HandleRune(c rune) (RuneType, error) { case 'n': return par.replaceState(RuneTypeNullN), nil default: - return RuneTypeError, &InvalidCharacterError{c, "looking for beginning of value"} + return RuneTypeError, &InvalidCharacterError{c, isRune, "looking for beginning of value"} } // object ////////////////////////////////////////////////////////////////////////////////// case RuneTypeObjectBeg: // waiting for key to start or '}' @@ -628,7 +633,7 @@ func (par *Parser) HandleRune(c rune) (RuneType, error) { par.popState() return RuneTypeObjectEnd, nil default: - return RuneTypeError, &InvalidCharacterError{c, "looking for beginning of object key string"} + return RuneTypeError, &InvalidCharacterError{c, isRune, "looking for beginning of object key string"} } case RuneTypeStringEnd: // waiting for ':' switch c { @@ -639,7 +644,7 @@ func (par *Parser) HandleRune(c rune) (RuneType, error) { par.pushState(runeTypeAny) return RuneTypeObjectColon, nil default: - return RuneTypeError, &InvalidCharacterError{c, "after object key"} + return RuneTypeError, &InvalidCharacterError{c, isRune, "after object key"} } case RuneTypeObjectComma: // waiting for ',' or '}' switch c { @@ -652,7 +657,7 @@ func (par *Parser) HandleRune(c rune) (RuneType, error) { par.popState() return RuneTypeObjectEnd, nil default: - return RuneTypeError, &InvalidCharacterError{c, "after object key:value pair"} + return RuneTypeError, &InvalidCharacterError{c, isRune, "after object key:value pair"} } // array /////////////////////////////////////////////////////////////////////////////////// case RuneTypeArrayBeg: // waiting for item to start or ']' @@ -665,7 +670,7 @@ func (par *Parser) HandleRune(c rune) (RuneType, error) { default: par.replaceState(RuneTypeArrayComma) par.pushState(runeTypeAny) - return par.HandleRune(c) + return par.HandleRune(c, isRune) } case RuneTypeArrayComma: // waiting for ',' or ']' switch c { @@ -678,7 +683,7 @@ func (par *Parser) HandleRune(c rune) (RuneType, error) { par.popState() return RuneTypeArrayEnd, nil default: - return RuneTypeError, &InvalidCharacterError{c, "after array element"} + return RuneTypeError, &InvalidCharacterError{c, isRune, "after array element"} } // string ////////////////////////////////////////////////////////////////////////////////// case RuneTypeStringBeg: // waiting for char or '"' @@ -691,7 +696,7 @@ func (par *Parser) HandleRune(c rune) (RuneType, error) { case 0x0020 <= c && c <= 0x10FFFF: return RuneTypeStringChar, nil default: - return RuneTypeError, &InvalidCharacterError{c, "in string literal"} + return RuneTypeError, &InvalidCharacterError{c, isRune, "in string literal"} } case RuneTypeStringEsc: // waiting for escape char switch c { @@ -701,7 +706,7 @@ func (par *Parser) HandleRune(c rune) (RuneType, error) { case 'u': return par.replaceState(RuneTypeStringEscU), nil default: - return RuneTypeError, &InvalidCharacterError{c, "in string escape code"} + return RuneTypeError, &InvalidCharacterError{c, isRune, "in string escape code"} } case RuneTypeStringEscU: if !isHex(c) { @@ -771,7 +776,7 @@ func (par *Parser) HandleRune(c rune) (RuneType, error) { case '1', '2', '3', '4', '5', '6', '7', '8', '9': return par.replaceState(RuneTypeNumberIntDig), nil default: - return RuneTypeError, &InvalidCharacterError{c, "in numeric literal"} + return RuneTypeError, &InvalidCharacterError{c, isRune, "in numeric literal"} } case RuneTypeNumberIntZero: // C switch c { @@ -781,7 +786,7 @@ func (par *Parser) HandleRune(c rune) (RuneType, error) { return par.replaceState(RuneTypeNumberExpE), nil default: par.popState() - return par.HandleRune(c) + return par.HandleRune(c, isRune) } case RuneTypeNumberIntDig: // D switch c { @@ -793,14 +798,14 @@ func (par *Parser) HandleRune(c rune) (RuneType, error) { return par.replaceState(RuneTypeNumberExpE), nil default: par.popState() - return par.HandleRune(c) + return par.HandleRune(c, isRune) } case RuneTypeNumberFracDot: // E switch c { case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': return par.replaceState(RuneTypeNumberFracDig), nil default: - return RuneTypeError, &InvalidCharacterError{c, "after decimal point in numeric literal"} + return RuneTypeError, &InvalidCharacterError{c, isRune, "after decimal point in numeric literal"} } case RuneTypeNumberFracDig: // F switch c { @@ -810,7 +815,7 @@ func (par *Parser) HandleRune(c rune) (RuneType, error) { return par.replaceState(RuneTypeNumberExpE), nil default: par.popState() - return par.HandleRune(c) + return par.HandleRune(c, isRune) } case RuneTypeNumberExpE: // G switch c { @@ -819,14 +824,14 @@ func (par *Parser) HandleRune(c rune) (RuneType, error) { case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': return par.replaceState(RuneTypeNumberExpDig), nil default: - return RuneTypeError, &InvalidCharacterError{c, "in exponent of numeric literal"} + return RuneTypeError, &InvalidCharacterError{c, isRune, "in exponent of numeric literal"} } case RuneTypeNumberExpSign: // H switch c { case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': return par.replaceState(RuneTypeNumberExpDig), nil default: - return RuneTypeError, &InvalidCharacterError{c, "in exponent of numeric literal"} + return RuneTypeError, &InvalidCharacterError{c, isRune, "in exponent of numeric literal"} } case RuneTypeNumberExpDig: // I switch c { @@ -834,40 +839,40 @@ func (par *Parser) HandleRune(c rune) (RuneType, error) { return par.replaceState(RuneTypeNumberExpDig), nil default: par.popState() - return par.HandleRune(c) + return par.HandleRune(c, isRune) } // literals //////////////////////////////////////////////////////////////////////////////// // true case RuneTypeTrueT: - return par.expectRune(c, 'r', RuneTypeTrueR, "true", false) + return par.expectRune(c, isRune, 'r', RuneTypeTrueR, "true", false) case RuneTypeTrueR: - return par.expectRune(c, 'u', RuneTypeTrueU, "true", false) + return par.expectRune(c, isRune, 'u', RuneTypeTrueU, "true", false) case RuneTypeTrueU: - return par.expectRune(c, 'e', RuneTypeTrueE, "true", true) + return par.expectRune(c, isRune, 'e', RuneTypeTrueE, "true", true) // false case RuneTypeFalseF: - return par.expectRune(c, 'a', RuneTypeFalseA, "false", false) + return par.expectRune(c, isRune, 'a', RuneTypeFalseA, "false", false) case RuneTypeFalseA: - return par.expectRune(c, 'l', RuneTypeFalseL, "false", false) + return par.expectRune(c, isRune, 'l', RuneTypeFalseL, "false", false) case RuneTypeFalseL: - return par.expectRune(c, 's', RuneTypeFalseS, "false", false) + return par.expectRune(c, isRune, 's', RuneTypeFalseS, "false", false) case RuneTypeFalseS: - return par.expectRune(c, 'e', RuneTypeFalseE, "false", true) + return par.expectRune(c, isRune, 'e', RuneTypeFalseE, "false", true) // null case RuneTypeNullN: - return par.expectRune(c, 'u', RuneTypeNullU, "null", false) + return par.expectRune(c, isRune, 'u', RuneTypeNullU, "null", false) case RuneTypeNullU: - return par.expectRune(c, 'l', RuneTypeNullL1, "null", false) + return par.expectRune(c, isRune, 'l', RuneTypeNullL1, "null", false) case RuneTypeNullL1: - return par.expectRune(c, 'l', RuneTypeNullL2, "null", true) + return par.expectRune(c, isRune, 'l', RuneTypeNullL2, "null", true) default: panic(fmt.Errorf(`should not happen: invalid stack: "%s"`, par.stackString())) } } -func (par *Parser) expectRune(c, exp rune, typ RuneType, context string, pop bool) (RuneType, error) { +func (par *Parser) expectRune(c rune, isRune bool, exp rune, typ RuneType, context string, pop bool) (RuneType, error) { if c != exp { - return RuneTypeError, &InvalidCharacterError{c, fmt.Sprintf("in literal %s (expecting %q)", context, exp)} + return RuneTypeError, &InvalidCharacterError{c, isRune, fmt.Sprintf("in literal %s (expecting %q)", context, exp)} } if pop { par.popState() diff --git a/internal/jsonparse/parse_test.go b/internal/jsonparse/parse_test.go index e531daf..acb43e8 100644 --- a/internal/jsonparse/parse_test.go +++ b/internal/jsonparse/parse_test.go @@ -69,7 +69,7 @@ func TestParserHandleRune(t *testing.T) { } for i, r := range tc.Input { assert.Equal(t, tc.ExpStack[i], par.stackString()) - _, err := par.HandleRune(r) + _, err := par.HandleRune(r, true) assert.NoError(t, err) assert.Equal(t, tc.ExpStack[i+1], par.stackString()) } diff --git a/reencode.go b/reencode.go index 7439bf0..8b08aad 100644 --- a/reencode.go +++ b/reencode.go @@ -352,7 +352,7 @@ func (enc *ReEncoder) Close() error { // isRune=false indicates that 'c' is a raw byte from invalid UTF-8. func (enc *ReEncoder) handleRune(c rune, size int, isRune bool) { - t, err := enc.par.HandleRune(c) + t, err := enc.par.HandleRune(c, isRune) if err != nil { enc.err = &ReEncodeSyntaxError{ Err: err, -- cgit v1.2.3 From 2a41777072f48467bef02bb3bd670d95c2b02102 Mon Sep 17 00:00:00 2001 From: Luke Shumaker Date: Tue, 14 Feb 2023 18:55:57 -0700 Subject: compat/json: Handle io.EOF and io.ErrUnexpectedEOF the same --- ReleaseNotes.md | 3 +++ compat/json/compat.go | 25 +++++++++++----------- compat/json/compat_test.go | 2 ++ .../json/testdata/fuzz/FuzzEquiv/930f49fab2367014 | 2 ++ .../json/testdata/fuzz/FuzzEquiv/caf81e9797b19c76 | 2 ++ 5 files changed, 21 insertions(+), 13 deletions(-) create mode 100644 compat/json/testdata/fuzz/FuzzEquiv/930f49fab2367014 create mode 100644 compat/json/testdata/fuzz/FuzzEquiv/caf81e9797b19c76 diff --git a/ReleaseNotes.md b/ReleaseNotes.md index e047f7d..b7a8f76 100644 --- a/ReleaseNotes.md +++ b/ReleaseNotes.md @@ -25,6 +25,9 @@ show the raw byte as a `\u00XX` Unicode codepoint, but now as a `\xXX` byte. + - Bugfix: compat/json: `io.EOF` is now correctly converted to + "unexpected end of JSON input", same as `io.ErrUnexpectedEOF`. + # v0.3.7 (2023-02-20) Theme: Fixes from fuzzing (part 1?) diff --git a/compat/json/compat.go b/compat/json/compat.go index 6f13fbb..4dc15ab 100644 --- a/compat/json/compat.go +++ b/compat/json/compat.go @@ -53,19 +53,18 @@ func convertError(err error, isUnmarshal bool) error { case *lowmemjson.DecodeReadError: return err case *lowmemjson.DecodeSyntaxError: - switch { - case errors.Is(err, io.EOF): - return io.EOF - case errors.Is(err, io.ErrUnexpectedEOF) && isUnmarshal: - return &SyntaxError{ - msg: "unexpected end of JSON input", - Offset: suberr.Offset, - } - default: - return &SyntaxError{ - msg: suberr.Err.Error(), - Offset: suberr.Offset + 1, + if errors.Is(err, io.EOF) || errors.Is(err, io.ErrUnexpectedEOF) { + if isUnmarshal { + return &SyntaxError{ + msg: "unexpected end of JSON input", + Offset: suberr.Offset, + } } + return suberr.Err + } + return &SyntaxError{ + msg: suberr.Err.Error(), + Offset: suberr.Offset + 1, } case *lowmemjson.DecodeTypeError: switch subsuberr := suberr.Err.(type) { @@ -120,7 +119,7 @@ func convertError(err error, isUnmarshal bool) error { msg: err.Err.Error(), Offset: err.Offset + 1, } - if errors.Is(err, io.ErrUnexpectedEOF) { + if errors.Is(err, io.EOF) || errors.Is(err, io.ErrUnexpectedEOF) { ret.msg = "unexpected end of JSON input" } return ret diff --git a/compat/json/compat_test.go b/compat/json/compat_test.go index 2380c53..43f17f1 100644 --- a/compat/json/compat_test.go +++ b/compat/json/compat_test.go @@ -72,6 +72,7 @@ func TestCompatCompact(t *testing.T) { Err string } testcases := map[string]testcase{ + "empty": {In: ``, Out: ``, Err: `unexpected end of JSON input`}, "trunc": {In: `{`, Out: ``, Err: `unexpected end of JSON input`}, "object": {In: `{}`, Out: `{}`}, "non-utf8": {In: "\"\x85\xcd\"", Out: "\"\x85\xcd\""}, @@ -106,6 +107,7 @@ func TestCompatIndent(t *testing.T) { Err string } testcases := map[string]testcase{ + "empty": {In: ``, Out: ``, Err: `unexpected end of JSON input`}, "trunc": {In: `{`, Out: ``, Err: `unexpected end of JSON input`}, "object": {In: `{}`, Out: `{}`}, "non-utf8": {In: "\"\x85\xcd\"", Out: "\"\x85\xcd\""}, diff --git a/compat/json/testdata/fuzz/FuzzEquiv/930f49fab2367014 b/compat/json/testdata/fuzz/FuzzEquiv/930f49fab2367014 new file mode 100644 index 0000000..7390d06 --- /dev/null +++ b/compat/json/testdata/fuzz/FuzzEquiv/930f49fab2367014 @@ -0,0 +1,2 @@ +go test fuzz v1 +[]byte(" ") diff --git a/compat/json/testdata/fuzz/FuzzEquiv/caf81e9797b19c76 b/compat/json/testdata/fuzz/FuzzEquiv/caf81e9797b19c76 new file mode 100644 index 0000000..67322c7 --- /dev/null +++ b/compat/json/testdata/fuzz/FuzzEquiv/caf81e9797b19c76 @@ -0,0 +1,2 @@ +go test fuzz v1 +[]byte("") -- cgit v1.2.3 From 4233e5012ece6d5a7fee3b5a518c41d916e1cf52 Mon Sep 17 00:00:00 2001 From: Luke Shumaker Date: Sat, 25 Feb 2023 18:12:15 -0700 Subject: compat/json: compat_test.go: Change the package to "json_test" --- compat/json/compat_test.go | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/compat/json/compat_test.go b/compat/json/compat_test.go index 43f17f1..3de48f7 100644 --- a/compat/json/compat_test.go +++ b/compat/json/compat_test.go @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: GPL-2.0-or-later -package json +package json_test import ( "bytes" @@ -11,6 +11,8 @@ import ( "testing" "github.com/stretchr/testify/assert" + + "git.lukeshu.com/go/lowmemjson/compat/json" ) func TestCompatHTMLEscape(t *testing.T) { @@ -31,7 +33,7 @@ func TestCompatHTMLEscape(t *testing.T) { t.Parallel() t.Logf("in=%q", tc.In) var dst bytes.Buffer - HTMLEscape(&dst, []byte(tc.In)) + json.HTMLEscape(&dst, []byte(tc.In)) assert.Equal(t, tc.Out, dst.String()) }) } @@ -58,7 +60,7 @@ func TestCompatValid(t *testing.T) { t.Run(tcName, func(t *testing.T) { t.Parallel() t.Logf("in=%q", tc.In) - act := Valid([]byte(tc.In)) + act := json.Valid([]byte(tc.In)) assert.Equal(t, tc.Exp, act) }) } @@ -88,7 +90,7 @@ func TestCompatCompact(t *testing.T) { t.Parallel() t.Logf("in=%q", tc.In) var out bytes.Buffer - err := Compact(&out, []byte(tc.In)) + err := json.Compact(&out, []byte(tc.In)) assert.Equal(t, tc.Out, out.String()) if tc.Err == "" { assert.NoError(t, err) @@ -130,7 +132,7 @@ func TestCompatIndent(t *testing.T) { t.Parallel() t.Logf("in=%q", tc.In) var out bytes.Buffer - err := Indent(&out, []byte(tc.In), ">", ".") + err := json.Indent(&out, []byte(tc.In), ">", ".") assert.Equal(t, tc.Out, out.String()) if tc.Err == "" { assert.NoError(t, err) @@ -157,7 +159,7 @@ func TestCompatMarshal(t *testing.T) { tc := tc t.Run(tcName, func(t *testing.T) { t.Parallel() - out, err := Marshal(tc.In) + out, err := json.Marshal(tc.In) assert.Equal(t, tc.Out, string(out)) if tc.Err == "" { assert.NoError(t, err) @@ -218,7 +220,7 @@ func TestCompatUnmarshal(t *testing.T) { var out any ptr = &out } - err := Unmarshal([]byte(tc.In), ptr) + err := json.Unmarshal([]byte(tc.In), ptr) assert.Equal(t, tc.ExpOut, reflect.ValueOf(ptr).Elem().Interface()) if tc.ExpErr == "" { assert.NoError(t, err) @@ -261,7 +263,7 @@ func TestCompatDecode(t *testing.T) { var out any ptr = &out } - err := NewDecoder(strings.NewReader(tc.In)).Decode(ptr) + err := json.NewDecoder(strings.NewReader(tc.In)).Decode(ptr) assert.Equal(t, tc.ExpOut, reflect.ValueOf(ptr).Elem().Interface()) if tc.ExpErr == "" { assert.NoError(t, err) -- cgit v1.2.3 From 7bd0072b5896bfc4172b6bda778cf149dd6282fa Mon Sep 17 00:00:00 2001 From: Luke Shumaker Date: Sat, 25 Feb 2023 16:17:06 -0700 Subject: reencode: Fix the byte count for partial writes --- ReleaseNotes.md | 5 ++++ reencode.go | 14 +++++++--- reencode_test.go | 83 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 98 insertions(+), 4 deletions(-) diff --git a/ReleaseNotes.md b/ReleaseNotes.md index b7a8f76..c9d1233 100644 --- a/ReleaseNotes.md +++ b/ReleaseNotes.md @@ -28,6 +28,11 @@ - Bugfix: compat/json: `io.EOF` is now correctly converted to "unexpected end of JSON input", same as `io.ErrUnexpectedEOF`. + - Bugfix: ReEncoder: Don't count bytes already in the UTF-8 decode + buffer toward the number of bytes returned from `.Write` and + `.WriteString`. This only comes up if there is an I/O causing a + partial write. + # v0.3.7 (2023-02-20) Theme: Fixes from fuzzing (part 1?) diff --git a/reencode.go b/reencode.go index 8b08aad..fd36875 100644 --- a/reencode.go +++ b/reencode.go @@ -243,10 +243,13 @@ func (enc *ReEncoder) getRuneFromString(str string, pos int) (c rune, size int, // but *ReEncoder does because it transforms the data written to it, // and the number of bytes written may be wildly different than the // number of bytes handled. +// +//nolint:dupl // Yes, this is mostly a duplicate of .WriteString(). func (enc *ReEncoder) Write(str []byte) (int, error) { if len(str) == 0 { return 0, nil } + origBufLen := enc.bufLen var n int for { c, size, full, isRune := enc.getRuneFromBytes(str, n) @@ -261,14 +264,14 @@ func (enc *ReEncoder) Write(str []byte) (int, error) { return len(str), nil } if enc.utf == InvalidUTF8Error && !isRune { - return n, &ReEncodeSyntaxError{ + return n - origBufLen, &ReEncodeSyntaxError{ Offset: enc.inputPos, Err: fmt.Errorf("invalid UTF-8: %#02x", c), } } enc.handleRune(c, size, isRune) if enc.err != nil { - return n, enc.err + return n - origBufLen, enc.err } n += size } @@ -276,10 +279,13 @@ func (enc *ReEncoder) Write(str []byte) (int, error) { // WriteString implements io.StringWriter; it does what you'd expect, // but see the notes on the Write method. +// +//nolint:dupl // Yes, this is mostly a duplicate of .Write(). func (enc *ReEncoder) WriteString(str string) (int, error) { if len(str) == 0 { return 0, nil } + origBufLen := enc.bufLen var n int for { c, size, full, isRune := enc.getRuneFromString(str, n) @@ -294,14 +300,14 @@ func (enc *ReEncoder) WriteString(str string) (int, error) { return len(str), nil } if enc.utf == InvalidUTF8Error && !isRune { - return n, &ReEncodeSyntaxError{ + return n - origBufLen, &ReEncodeSyntaxError{ Offset: enc.inputPos, Err: fmt.Errorf("invalid UTF-8: %#02x", c), } } enc.handleRune(c, size, isRune) if enc.err != nil { - return n, enc.err + return n - origBufLen, enc.err } n += size } diff --git a/reencode_test.go b/reencode_test.go index 715e976..feabde5 100644 --- a/reencode_test.go +++ b/reencode_test.go @@ -5,6 +5,8 @@ package lowmemjson import ( + "errors" + "io" "strings" "testing" @@ -240,3 +242,84 @@ func TestReEncoderStackSize(t *testing.T) { assert.Equal(t, i+2, enc.stackSize()) } } + +var errNoSpace = errors.New("no space left on device") + +type limitedWriter struct { + Limit int + Inner io.Writer + + n int +} + +func (w *limitedWriter) Write(p []byte) (int, error) { + switch { + case w.n >= w.Limit: + return 0, errNoSpace + case w.n+len(p) > w.Limit: + n, err := w.Inner.Write(p[:w.Limit-w.n]) + if n > 0 { + w.n += n + } + if err == nil { + err = errNoSpace + } + return n, err + default: + n, err := w.Inner.Write(p) + if n > 0 { + w.n += n + } + return n, err + } +} + +func TestReEncodeIOErr(t *testing.T) { + t.Parallel() + + input := `"๐Ÿ˜€"` + assert.Len(t, input, 6) + + t.Run("bytes", func(t *testing.T) { + t.Parallel() + + var out strings.Builder + enc := NewReEncoder(&limitedWriter{Limit: 5, Inner: &out}, ReEncoderConfig{}) + + n, err := enc.Write([]byte(input[:2])) + assert.NoError(t, err) + assert.Equal(t, 2, n) + // Of the 2 bytes "written", only one should be in + // `out` yet; the other should be in the UTF-8 buffer. + assert.Equal(t, input[:1], out.String()) + + n, err = enc.Write([]byte(input[2:])) + assert.ErrorIs(t, err, errNoSpace) + // Check that the byte in the UTF-8 buffer from the + // first .Write didn't count toward the total for this + // .Write. + assert.Equal(t, 3, n) + assert.Equal(t, input[:5], out.String()) + }) + t.Run("string", func(t *testing.T) { + t.Parallel() + + var out strings.Builder + enc := NewReEncoder(&limitedWriter{Limit: 5, Inner: &out}, ReEncoderConfig{}) + + n, err := enc.WriteString(input[:2]) + assert.NoError(t, err) + assert.Equal(t, 2, n) + // Of the 2 bytes "written", only one should be in + // `out` yet; the other should be in the UTF-8 buffer. + assert.Equal(t, input[:1], out.String()) + + n, err = enc.WriteString(input[2:]) + assert.ErrorIs(t, err, errNoSpace) + // Check that the byte in the UTF-8 buffer from the + // first .Write didn't count toward the total for this + // .Write. + assert.Equal(t, 3, n) + assert.Equal(t, input[:5], out.String()) + }) +} -- cgit v1.2.3 From 22edcf6a68a057ed04368d5f78c8ba3ddfee8d57 Mon Sep 17 00:00:00 2001 From: Luke Shumaker Date: Sat, 25 Feb 2023 11:11:36 -0700 Subject: reencode: Improve the error messages for trailing partial-UTF-8 --- ReleaseNotes.md | 4 +++ .../json/testdata/fuzz/FuzzEquiv/95640f7d88708118 | 2 ++ reencode.go | 18 ++++++++-- reencode_test.go | 39 +++++++++++++++++++++- 4 files changed, 59 insertions(+), 4 deletions(-) create mode 100644 compat/json/testdata/fuzz/FuzzEquiv/95640f7d88708118 diff --git a/ReleaseNotes.md b/ReleaseNotes.md index c9d1233..71973aa 100644 --- a/ReleaseNotes.md +++ b/ReleaseNotes.md @@ -33,6 +33,10 @@ `.WriteString`. This only comes up if there is an I/O causing a partial write. + - Bugfix: ReEncoder: The error messages for trailing partial UTF-8 + now reflect the `InvalidUTF8` setting, rather than simply saying + "unflushed unicode garbage". + # v0.3.7 (2023-02-20) Theme: Fixes from fuzzing (part 1?) diff --git a/compat/json/testdata/fuzz/FuzzEquiv/95640f7d88708118 b/compat/json/testdata/fuzz/FuzzEquiv/95640f7d88708118 new file mode 100644 index 0000000..77924f3 --- /dev/null +++ b/compat/json/testdata/fuzz/FuzzEquiv/95640f7d88708118 @@ -0,0 +1,2 @@ +go test fuzz v1 +[]byte("\xf0") diff --git a/reencode.go b/reencode.go index fd36875..c19e296 100644 --- a/reencode.go +++ b/reencode.go @@ -329,9 +329,21 @@ func (enc *ReEncoder) WriteRune(c rune) (n int, err error) { // if enc.AllowMultipleValues is set. func (enc *ReEncoder) Close() error { if enc.bufLen > 0 { - return &ReEncodeSyntaxError{ - Offset: enc.inputPos, - Err: fmt.Errorf("%w: unflushed unicode garbage: %q", io.ErrUnexpectedEOF, enc.buf[:enc.bufLen]), + if enc.utf == InvalidUTF8Error { + return &ReEncodeSyntaxError{ + Offset: enc.inputPos, + Err: fmt.Errorf("truncated UTF-8: %q", enc.buf[:enc.bufLen]), + } + } + for i := 0; i < enc.bufLen; i++ { + if enc.utf == InvalidUTF8Replace { + enc.handleRune(utf8.RuneError, 1, true) + } else { + enc.handleRune(rune(enc.buf[i]), 1, false) + } + if enc.err != nil { + return enc.err + } } } if _, err := enc.par.HandleEOF(); err != nil { diff --git a/reencode_test.go b/reencode_test.go index feabde5..60180c8 100644 --- a/reencode_test.go +++ b/reencode_test.go @@ -15,7 +15,7 @@ import ( "git.lukeshu.com/go/lowmemjson/internal/fastio" ) -func TestReEncode(t *testing.T) { +func TestEncodeReEncode(t *testing.T) { t.Parallel() type testcase struct { enc ReEncoderConfig @@ -168,6 +168,43 @@ func TestReEncode(t *testing.T) { } } +func TestReEncode(t *testing.T) { + t.Parallel() + type testcase struct { + Cfg ReEncoderConfig + In string + ExpOut string + ExpWriteErr string + ExpCloseErr string + } + testcases := map[string]testcase{ + "partial-utf8-replace": {Cfg: ReEncoderConfig{InvalidUTF8: InvalidUTF8Replace}, In: "\xf0\xbf", ExpOut: ``, ExpCloseErr: "json: syntax error at input byte 0: invalid character '\uFFFD' looking for beginning of value"}, + "partial-utf8-preserve": {Cfg: ReEncoderConfig{InvalidUTF8: InvalidUTF8Preserve}, In: "\xf0\xbf", ExpOut: ``, ExpCloseErr: `json: syntax error at input byte 0: invalid character '\xf0' looking for beginning of value`}, + "partial-utf8-error": {Cfg: ReEncoderConfig{InvalidUTF8: InvalidUTF8Error}, In: "\xf0\xbf", ExpOut: ``, ExpCloseErr: `json: syntax error at input byte 0: truncated UTF-8: "\xf0\xbf"`}, + } + for tcName, tc := range testcases { + tc := tc + t.Run(tcName, func(t *testing.T) { + t.Parallel() + var out strings.Builder + enc := NewReEncoder(&out, tc.Cfg) + _, err := enc.WriteString(tc.In) + assert.Equal(t, tc.ExpOut, out.String()) + if tc.ExpWriteErr == "" { + assert.NoError(t, err) + } else { + assert.EqualError(t, err, tc.ExpWriteErr) + } + err = enc.Close() + if tc.ExpCloseErr == "" { + assert.NoError(t, err) + } else { + assert.EqualError(t, err, tc.ExpCloseErr) + } + }) + } +} + func TestReEncodeWriteSize(t *testing.T) { t.Parallel() -- cgit v1.2.3