From 218edcc3878394a6942d4f72e3be99137c22825a Mon Sep 17 00:00:00 2001 From: Luke Shumaker Date: Tue, 14 Feb 2023 14:27:18 -0700 Subject: reencode: Fix trimming trailing zeros --- ReleaseNotes.md | 11 +++++++++++ reencode_compactnum.go | 12 +++++------- reencode_test.go | 18 ++++++++++++++++++ 3 files changed, 34 insertions(+), 7 deletions(-) diff --git a/ReleaseNotes.md b/ReleaseNotes.md index 081644e..623d5da 100644 --- a/ReleaseNotes.md +++ b/ReleaseNotes.md @@ -1,3 +1,14 @@ +# v0.3.7 (TBD) + + Theme: TBD + + User-facing changes: + + - General bugfixes: + + + Encoder, ReEncoder: Now correctly trims unnecessary the + trailing '0's from the fraction-part when compacting numbers. + # v0.3.6 (2023-02-16) Theme: Architectural improvements diff --git a/reencode_compactnum.go b/reencode_compactnum.go index 5da2c54..bdf1f4e 100644 --- a/reencode_compactnum.go +++ b/reencode_compactnum.go @@ -33,14 +33,12 @@ func (enc *reEncodeCompactNum) HandleRune(c rune, t jsonparse.RuneType, escape B if c == '0' && !enc.fracFirst { enc.fracZeros++ return nil - } - fallthrough - default: - for enc.fracZeros > 0 { - if err := enc.out.HandleRune('0', jsonparse.RuneTypeNumberFracDig, escape, stackSize); err != nil { - return err + } else { + for ; enc.fracZeros > 0; enc.fracZeros-- { + if err := enc.out.HandleRune('0', jsonparse.RuneTypeNumberFracDig, escape, stackSize); err != nil { + return err + } } - enc.fracZeros-- } enc.fracFirst = false } diff --git a/reencode_test.go b/reencode_test.go index 83660ef..f135aa5 100644 --- a/reencode_test.go +++ b/reencode_test.go @@ -131,6 +131,24 @@ func TestReEncode(t *testing.T) { โ€”ยป9 โ€”]`, }, + "numbers": { + enc: ReEncoderConfig{ + Compact: true, + }, + in: []any{ + Number("1.200e003"), + }, + exp: `[1.2e3]`, + }, + "numbers-zero": { + enc: ReEncoderConfig{ + Compact: true, + }, + in: []any{ + Number("1.000e000"), + }, + exp: `[1.0e0]`, + }, } for tcName, tc := range testcases { tc := tc -- cgit v1.2.3 From eaaf7bc29d43b4470623c75e6e409a049b3083af Mon Sep 17 00:00:00 2001 From: Luke Shumaker Date: Tue, 14 Feb 2023 11:02:33 -0700 Subject: compat/json: Valid: Check for EOF --- ReleaseNotes.md | 5 +++++ compat/json/compat.go | 9 +++++++-- compat/json/compat_test.go | 34 ++++++++++++++++++++++++++++++++++ 3 files changed, 46 insertions(+), 2 deletions(-) create mode 100644 compat/json/compat_test.go diff --git a/ReleaseNotes.md b/ReleaseNotes.md index 623d5da..613ea0c 100644 --- a/ReleaseNotes.md +++ b/ReleaseNotes.md @@ -9,6 +9,11 @@ + Encoder, ReEncoder: Now correctly trims unnecessary the trailing '0's from the fraction-part when compacting numbers. + - Compatibility bugfixes: + + + compat/json.Valid: No longer considers truncated JSON + documents to be valid. 
+ # v0.3.6 (2023-02-16) Theme: Architectural improvements diff --git a/compat/json/compat.go b/compat/json/compat.go index c96470d..300ab2f 100644 --- a/compat/json/compat.go +++ b/compat/json/compat.go @@ -175,8 +175,13 @@ func Valid(data []byte) bool { formatter := lowmemjson.NewReEncoder(io.Discard, lowmemjson.ReEncoderConfig{ Compact: true, }) - _, err := formatter.Write(data) - return err == nil + if _, err := formatter.Write(data); err != nil { + return false + } + if err := formatter.Close(); err != nil { + return false + } + return true } // Decode wrappers /////////////////////////////////////////////////// diff --git a/compat/json/compat_test.go b/compat/json/compat_test.go new file mode 100644 index 0000000..5c8f3ee --- /dev/null +++ b/compat/json/compat_test.go @@ -0,0 +1,34 @@ +// Copyright (C) 2023 Luke Shumaker +// +// SPDX-License-Identifier: GPL-2.0-or-later + +package json + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestCompatValid(t *testing.T) { + t.Parallel() + type testcase struct { + In string + Exp bool + } + testcases := map[string]testcase{ + "empty": {In: ``, Exp: false}, + "num": {In: `1`, Exp: true}, + "trunc": {In: `{`, Exp: false}, + "object": {In: `{}`, Exp: true}, + } + for tcName, tc := range testcases { + tc := tc + t.Run(tcName, func(t *testing.T) { + t.Parallel() + t.Logf("in=%q", tc.In) + act := Valid([]byte(tc.In)) + assert.Equal(t, tc.Exp, act) + }) + } +} -- cgit v1.2.3 From 49319198500729fd65bd6d69071f45f2d7ae2aa7 Mon Sep 17 00:00:00 2001 From: Luke Shumaker Date: Tue, 14 Feb 2023 11:02:09 -0700 Subject: compat/json: Compact, Indent: Clear the output if there's an error --- ReleaseNotes.md | 3 +++ compat/json/compat.go | 14 ++++++++++-- compat/json/compat_test.go | 57 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 72 insertions(+), 2 deletions(-) diff --git a/ReleaseNotes.md b/ReleaseNotes.md index 613ea0c..e72a664 100644 --- a/ReleaseNotes.md +++ b/ReleaseNotes.md @@ -14,6 +14,9 @@ + compat/json.Valid: No longer considers truncated JSON documents to be valid. + + compat/json.Compact, compat/json.Indent: Don't write to the + destination buffer if there is a syntax error. 
+ # v0.3.6 (2023-02-16) Theme: Architectural improvements diff --git a/compat/json/compat.go b/compat/json/compat.go index 300ab2f..1cdbf0b 100644 --- a/compat/json/compat.go +++ b/compat/json/compat.go @@ -157,18 +157,28 @@ func reencode(dst io.Writer, src []byte, cfg lowmemjson.ReEncoderConfig) error { } func Compact(dst *bytes.Buffer, src []byte) error { - return reencode(dst, src, lowmemjson.ReEncoderConfig{ + start := dst.Len() + err := reencode(dst, src, lowmemjson.ReEncoderConfig{ Compact: true, BackslashEscape: lowmemjson.EscapePreserve, }) + if err != nil { + dst.Truncate(start) + } + return err } func Indent(dst *bytes.Buffer, src []byte, prefix, indent string) error { - return reencode(dst, src, lowmemjson.ReEncoderConfig{ + start := dst.Len() + err := reencode(dst, src, lowmemjson.ReEncoderConfig{ Indent: indent, Prefix: prefix, BackslashEscape: lowmemjson.EscapePreserve, }) + if err != nil { + dst.Truncate(start) + } + return err } func Valid(data []byte) bool { diff --git a/compat/json/compat_test.go b/compat/json/compat_test.go index 5c8f3ee..d513c27 100644 --- a/compat/json/compat_test.go +++ b/compat/json/compat_test.go @@ -5,6 +5,7 @@ package json import ( + "bytes" "testing" "github.com/stretchr/testify/assert" @@ -32,3 +33,59 @@ func TestCompatValid(t *testing.T) { }) } } + +func TestCompatCompact(t *testing.T) { + t.Parallel() + type testcase struct { + In string + Out string + Err string + } + testcases := map[string]testcase{ + "trunc": {In: `{`, Out: ``, Err: `unexpected end of JSON input`}, + "object": {In: `{}`, Out: `{}`}, + } + for tcName, tc := range testcases { + tc := tc + t.Run(tcName, func(t *testing.T) { + t.Parallel() + t.Logf("in=%q", tc.In) + var out bytes.Buffer + err := Compact(&out, []byte(tc.In)) + assert.Equal(t, tc.Out, out.String()) + if tc.Err == "" { + assert.NoError(t, err) + } else { + assert.EqualError(t, err, tc.Err) + } + }) + } +} + +func TestCompatIndent(t *testing.T) { + t.Parallel() + type testcase struct { + In string + Out string + Err string + } + testcases := map[string]testcase{ + "trunc": {In: `{`, Out: ``, Err: `unexpected end of JSON input`}, + "object": {In: `{}`, Out: `{}`}, + } + for tcName, tc := range testcases { + tc := tc + t.Run(tcName, func(t *testing.T) { + t.Parallel() + t.Logf("in=%q", tc.In) + var out bytes.Buffer + err := Indent(&out, []byte(tc.In), ">", ".") + assert.Equal(t, tc.Out, out.String()) + if tc.Err == "" { + assert.NoError(t, err) + } else { + assert.EqualError(t, err, tc.Err) + } + }) + } +} -- cgit v1.2.3 From 7a938da20e8d243bc254cd821b7cf61b379be4a6 Mon Sep 17 00:00:00 2001 From: Luke Shumaker Date: Wed, 15 Feb 2023 15:10:00 -0700 Subject: reencode: Rethink the UTF-8 buffer --- ReleaseNotes.md | 6 ++++ reencode.go | 95 +++++++++++++++++++++++++++++++++++++++----------------- reencode_test.go | 61 ++++++++++++++++++++++++++++++++++++ 3 files changed, 133 insertions(+), 29 deletions(-) diff --git a/ReleaseNotes.md b/ReleaseNotes.md index e72a664..d9a671a 100644 --- a/ReleaseNotes.md +++ b/ReleaseNotes.md @@ -17,6 +17,12 @@ + compat/json.Compact, compat/json.Indent: Don't write to the destination buffer if there is a syntax error. + - Unicode: + + + Bugfix: Encoder, ReEncoder: Fix an issue with decoding UTF-8 + that when a codepoint straddles a write boundary it is + interpreted as a sequence of U+FFFD runes. 
+ # v0.3.6 (2023-02-16) Theme: Architectural improvements diff --git a/reencode.go b/reencode.go index 0745c43..a33cc8f 100644 --- a/reencode.go +++ b/reencode.go @@ -169,6 +169,46 @@ var ( _ io.Closer = (*ReEncoder)(nil) ) +func (enc *ReEncoder) getRuneFromBytes(str []byte, pos int) (c rune, size int, full bool) { + if pos < enc.bufLen { + var tmp [utf8.UTFMax]byte + n := copy(tmp[:], enc.buf[pos:enc.bufLen]) + n += copy(tmp[n:], str) + c, size := utf8.DecodeRune(tmp[:n]) + if c == utf8.RuneError && size <= 1 { + return c, size, utf8.FullRune(tmp[:n]) + } + return c, size, true + } else { + tmp := str[pos-enc.bufLen:] + c, size := utf8.DecodeRune(tmp) + if c == utf8.RuneError && size <= 1 { + return c, size, utf8.FullRune(tmp) + } + return c, size, true + } +} + +func (enc *ReEncoder) getRuneFromString(str string, pos int) (c rune, size int, full bool) { + if pos < enc.bufLen { + var tmp [utf8.UTFMax]byte + n := copy(tmp[:], enc.buf[pos:enc.bufLen]) + n += copy(tmp[n:], str) + c, size := utf8.DecodeRune(tmp[:n]) + if c == utf8.RuneError && size <= 1 { + return c, size, utf8.FullRune(tmp[:n]) + } + return c, size, true + } else { + tmp := str[pos-enc.bufLen:] + c, size := utf8.DecodeRuneInString(tmp) + if c == utf8.RuneError && size <= 1 { + return c, size, utf8.FullRuneInString(tmp) + } + return c, size, true + } +} + // Write implements io.Writer; it does what you'd expect. // // It is worth noting that Write returns the number of bytes consumed @@ -177,59 +217,56 @@ var ( // but *ReEncoder does because it transforms the data written to it, // and the number of bytes written may be wildly different than the // number of bytes handled. -func (enc *ReEncoder) Write(p []byte) (int, error) { - if len(p) == 0 { +func (enc *ReEncoder) Write(str []byte) (int, error) { + if len(str) == 0 { return 0, nil } var n int - if enc.bufLen > 0 { - copy(enc.buf[enc.bufLen:], p) - c, size := utf8.DecodeRune(enc.buf[:]) - n += size - enc.bufLen - enc.bufLen = 0 - enc.handleRune(c, size) - if enc.err != nil { - return 0, enc.err + for { + c, size, full := enc.getRuneFromBytes(str, n) + if !full { + if n < enc.bufLen { + l := copy(enc.buf[:], enc.buf[n:enc.bufLen]) + l += copy(enc.buf[l:], str) + enc.bufLen = l + } else { + enc.bufLen = copy(enc.buf[:], str[n-enc.bufLen:]) + } + return len(str), nil } - } - for utf8.FullRune(p[n:]) { - c, size := utf8.DecodeRune(p[n:]) enc.handleRune(c, size) if enc.err != nil { return n, enc.err } n += size } - enc.bufLen = copy(enc.buf[:], p[n:]) - return len(p), nil } // WriteString implements io.StringWriter; it does what you'd expect, // but see the notes on the Write method. 
-func (enc *ReEncoder) WriteString(p string) (int, error) { - if len(p) == 0 { +func (enc *ReEncoder) WriteString(str string) (int, error) { + if len(str) == 0 { return 0, nil } var n int - if enc.bufLen > 0 { - copy(enc.buf[enc.bufLen:], p) - c, size := utf8.DecodeRune(enc.buf[:]) - n += size - enc.bufLen - enc.bufLen = 0 - enc.handleRune(c, size) - if enc.err != nil { - return 0, enc.err + for { + c, size, full := enc.getRuneFromString(str, n) + if !full { + if n < enc.bufLen { + l := copy(enc.buf[:], enc.buf[n:enc.bufLen]) + l += copy(enc.buf[l:], str) + enc.bufLen = l + } else { + enc.bufLen = copy(enc.buf[:], str[n-enc.bufLen:]) + } + return len(str), nil } - } - for utf8.FullRuneInString(p[n:]) { - c, size := utf8.DecodeRuneInString(p[n:]) enc.handleRune(c, size) if enc.err != nil { return n, enc.err } n += size } - return len(p), nil } // WriteByte implements io.ByteWriter; it does what you'd expect. diff --git a/reencode_test.go b/reencode_test.go index f135aa5..82a1861 100644 --- a/reencode_test.go +++ b/reencode_test.go @@ -161,3 +161,64 @@ func TestReEncode(t *testing.T) { }) } } + +func TestReEncodeWriteSize(t *testing.T) { + t.Parallel() + + multibyteRune := `๐Ÿ˜‚` + assert.Len(t, multibyteRune, 4) + + input := `"` + multibyteRune + `"` + + t.Run("bytes-bigwrite", func(t *testing.T) { + t.Parallel() + var out strings.Builder + enc := NewReEncoder(&out, ReEncoderConfig{}) + + n, err := enc.Write([]byte(input)) + assert.NoError(t, err) + assert.Equal(t, len(input), n) + + assert.Equal(t, input, out.String()) + }) + t.Run("string-bigwrite", func(t *testing.T) { + t.Parallel() + var out strings.Builder + enc := NewReEncoder(&out, ReEncoderConfig{}) + + n, err := enc.WriteString(input) + assert.NoError(t, err) + assert.Equal(t, len(input), n) + + assert.Equal(t, input, out.String()) + }) + + t.Run("bytes-smallwrites", func(t *testing.T) { + t.Parallel() + var out strings.Builder + enc := NewReEncoder(&out, ReEncoderConfig{}) + + var buf [1]byte + for i := 0; i < len(input); i++ { + buf[0] = input[i] + n, err := enc.Write(buf[:]) + assert.NoError(t, err) + assert.Equal(t, 1, n) + } + + assert.Equal(t, input, out.String()) + }) + t.Run("string-smallwrites", func(t *testing.T) { + t.Parallel() + var out strings.Builder + enc := NewReEncoder(&out, ReEncoderConfig{}) + + for i := 0; i < len(input); i++ { + n, err := enc.WriteString(input[i : i+1]) + assert.NoError(t, err) + assert.Equal(t, 1, n) + } + + assert.Equal(t, input, out.String()) + }) +} -- cgit v1.2.3 From 38989a9c4f69abfe04c3eb4ec3382be88802141c Mon Sep 17 00:00:00 2001 From: Luke Shumaker Date: Wed, 15 Feb 2023 23:53:59 -0700 Subject: reencode: Fix .stackSize --- reencode.go | 4 ++-- reencode_test.go | 16 ++++++++++++++++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/reencode.go b/reencode.go index a33cc8f..fd848f8 100644 --- a/reencode.go +++ b/reencode.go @@ -364,8 +364,8 @@ func (enc *ReEncoder) popWriteBarrier() { func (enc *ReEncoder) stackSize() int { sz := enc.par.StackSize() - for _, barrier := range enc.barriers { - sz += barrier.stackSize + if len(enc.barriers) > 0 { + sz += enc.barriers[len(enc.barriers)-1].stackSize } return sz } diff --git a/reencode_test.go b/reencode_test.go index 82a1861..bc6d246 100644 --- a/reencode_test.go +++ b/reencode_test.go @@ -9,6 +9,8 @@ import ( "testing" "github.com/stretchr/testify/assert" + + "git.lukeshu.com/go/lowmemjson/internal/fastio" ) func TestReEncode(t *testing.T) { @@ -222,3 +224,17 @@ func TestReEncodeWriteSize(t *testing.T) { assert.Equal(t, input, 
out.String()) }) } + +func TestReEncoderStackSize(t *testing.T) { + t.Parallel() + + enc := NewReEncoder(fastio.Discard, ReEncoderConfig{}) + assert.Equal(t, 0, enc.stackSize()) + + for i := 0; i < 5; i++ { + assert.NoError(t, enc.WriteByte('[')) + assert.Equal(t, i+1, enc.stackSize()) + enc.pushWriteBarrier() + assert.Equal(t, i+2, enc.stackSize()) + } +} -- cgit v1.2.3 From dfc67cecbd95344d296c31b537fa3ae8aec8c292 Mon Sep 17 00:00:00 2001 From: Luke Shumaker Date: Tue, 14 Feb 2023 22:36:25 -0700 Subject: encode, reencode: Fix handling of invalid UTF-8 --- ReleaseNotes.md | 24 ++++++++ compat/json/compat.go | 5 +- compat/json/compat_test.go | 45 ++++++++++++--- compat/json/testcompat_test.go | 5 +- encode.go | 34 +++++------ encode_escape.go | 37 +++++++++--- errors.go | 3 +- internal/jsonstring/encode_string.go | 65 +++++++++++++++++++-- reencode.go | 107 +++++++++++++++++++++++------------ 9 files changed, 247 insertions(+), 78 deletions(-) diff --git a/ReleaseNotes.md b/ReleaseNotes.md index d9a671a..b1647da 100644 --- a/ReleaseNotes.md +++ b/ReleaseNotes.md @@ -19,10 +19,34 @@ - Unicode: + + Feature: Encoder, ReEncoder: Add an `InvalidUTF8` + ReEncoderConfig option and `BackslashEscapeRawByte` + BackslashEscapeMode to allow emitted strings to contain + invalid UTF-8. + + + Change: EscapeDefault, EscapeDefaultNonHTMLSafe: No longer + force long Unicode `\uXXXX` sequences for the U+FFFD Unicode + replacement character. + + + Change: Encoder: Unless overridden by the BackslashEscaper, + now by default uses `\uXXXX` sequences when emitting the + U+FFFD Unicode replacement character in place of invalid + UTF-8. + + Bugfix: Encoder, ReEncoder: Fix an issue with decoding UTF-8 that when a codepoint straddles a write boundary it is interpreted as a sequence of U+FFFD runes. + + Bugfix: compat/json.Valid: Do not consider JSON containing + invalid UTF-8 to be valid (this is different than + `encoding/json` at the time of this writing; but I consider + that to be a bug in `encoding/json`; [go#58517][]). + + + Bugfix: compat/json.Compact, compat/json.Indent: Don't munge + invalid UTF-8 in strings; as `encoding/json` doesn't. 
+ + [go#58517]: https://github.com/golang/go/issues/58517 + # v0.3.6 (2023-02-16) Theme: Architectural improvements diff --git a/compat/json/compat.go b/compat/json/compat.go index 1cdbf0b..d326514 100644 --- a/compat/json/compat.go +++ b/compat/json/compat.go @@ -160,6 +160,7 @@ func Compact(dst *bytes.Buffer, src []byte) error { start := dst.Len() err := reencode(dst, src, lowmemjson.ReEncoderConfig{ Compact: true, + InvalidUTF8: lowmemjson.InvalidUTF8Preserve, BackslashEscape: lowmemjson.EscapePreserve, }) if err != nil { @@ -173,6 +174,7 @@ func Indent(dst *bytes.Buffer, src []byte, prefix, indent string) error { err := reencode(dst, src, lowmemjson.ReEncoderConfig{ Indent: indent, Prefix: prefix, + InvalidUTF8: lowmemjson.InvalidUTF8Preserve, BackslashEscape: lowmemjson.EscapePreserve, }) if err != nil { @@ -183,7 +185,8 @@ func Indent(dst *bytes.Buffer, src []byte, prefix, indent string) error { func Valid(data []byte) bool { formatter := lowmemjson.NewReEncoder(io.Discard, lowmemjson.ReEncoderConfig{ - Compact: true, + Compact: true, + InvalidUTF8: lowmemjson.InvalidUTF8Error, }) if _, err := formatter.Write(data); err != nil { return false diff --git a/compat/json/compat_test.go b/compat/json/compat_test.go index d513c27..d989a4d 100644 --- a/compat/json/compat_test.go +++ b/compat/json/compat_test.go @@ -18,10 +18,11 @@ func TestCompatValid(t *testing.T) { Exp bool } testcases := map[string]testcase{ - "empty": {In: ``, Exp: false}, - "num": {In: `1`, Exp: true}, - "trunc": {In: `{`, Exp: false}, - "object": {In: `{}`, Exp: true}, + "empty": {In: ``, Exp: false}, + "num": {In: `1`, Exp: true}, + "trunc": {In: `{`, Exp: false}, + "object": {In: `{}`, Exp: true}, + "non-utf8": {In: "\"\x85\xcd\"", Exp: false}, // https://github.com/golang/go/issues/58517 } for tcName, tc := range testcases { tc := tc @@ -42,8 +43,9 @@ func TestCompatCompact(t *testing.T) { Err string } testcases := map[string]testcase{ - "trunc": {In: `{`, Out: ``, Err: `unexpected end of JSON input`}, - "object": {In: `{}`, Out: `{}`}, + "trunc": {In: `{`, Out: ``, Err: `unexpected end of JSON input`}, + "object": {In: `{}`, Out: `{}`}, + "non-utf8": {In: "\"\x85\xcd\"", Out: "\"\x85\xcd\""}, } for tcName, tc := range testcases { tc := tc @@ -70,8 +72,9 @@ func TestCompatIndent(t *testing.T) { Err string } testcases := map[string]testcase{ - "trunc": {In: `{`, Out: ``, Err: `unexpected end of JSON input`}, - "object": {In: `{}`, Out: `{}`}, + "trunc": {In: `{`, Out: ``, Err: `unexpected end of JSON input`}, + "object": {In: `{}`, Out: `{}`}, + "non-utf8": {In: "\"\x85\xcd\"", Out: "\"\x85\xcd\""}, } for tcName, tc := range testcases { tc := tc @@ -89,3 +92,29 @@ func TestCompatIndent(t *testing.T) { }) } } + +func TestCompatMarshal(t *testing.T) { + t.Parallel() + type testcase struct { + In any + Out string + Err string + } + testcases := map[string]testcase{ + "non-utf8": {In: "\x85\xcd", Out: "\"\\ufffd\\ufffd\""}, + "urc": {In: "\ufffd", Out: "\"\ufffd\""}, + } + for tcName, tc := range testcases { + tc := tc + t.Run(tcName, func(t *testing.T) { + t.Parallel() + out, err := Marshal(tc.In) + assert.Equal(t, tc.Out, string(out)) + if tc.Err == "" { + assert.NoError(t, err) + } else { + assert.EqualError(t, err, tc.Err) + } + }) + } +} diff --git a/compat/json/testcompat_test.go b/compat/json/testcompat_test.go index 42cbf5c..e89b4b4 100644 --- a/compat/json/testcompat_test.go +++ b/compat/json/testcompat_test.go @@ -8,6 +8,7 @@ import ( "bytes" "encoding/json" "io" + "reflect" _ "unsafe" 
"git.lukeshu.com/go/lowmemjson" @@ -59,13 +60,13 @@ type encodeState struct { } func (es *encodeState) string(str string, _ bool) { - if err := jsonstring.EncodeStringFromString(&es.Buffer, lowmemjson.EscapeDefault, str); err != nil { + if err := jsonstring.EncodeStringFromString(&es.Buffer, lowmemjson.EscapeDefault, 0, reflect.Value{}, str); err != nil { panic(err) } } func (es *encodeState) stringBytes(str []byte, _ bool) { - if err := jsonstring.EncodeStringFromBytes(&es.Buffer, lowmemjson.EscapeDefault, str); err != nil { + if err := jsonstring.EncodeStringFromBytes(&es.Buffer, lowmemjson.EscapeDefault, 0, reflect.Value{}, str); err != nil { panic(err) } } diff --git a/encode.go b/encode.go index 00d3dad..684cc75 100644 --- a/encode.go +++ b/encode.go @@ -87,7 +87,7 @@ func (enc *Encoder) Encode(obj any) (err error) { if escaper == nil { escaper = EscapeDefault } - if err := encode(enc.w, reflect.ValueOf(obj), escaper, false, 0, map[any]struct{}{}); err != nil { + if err := encode(enc.w, reflect.ValueOf(obj), escaper, enc.w.utf, false, 0, map[any]struct{}{}); err != nil { if rwe, ok := err.(*ReEncodeWriteError); ok { err = &EncodeWriteError{ Err: rwe.Err, @@ -108,7 +108,7 @@ func discardInt(_ int, err error) error { const startDetectingCyclesAfter = 1000 -func encode(w *ReEncoder, val reflect.Value, escaper BackslashEscaper, quote bool, cycleDepth uint, cycleSeen map[any]struct{}) error { +func encode(w *ReEncoder, val reflect.Value, escaper BackslashEscaper, utf InvalidUTF8Mode, quote bool, cycleDepth uint, cycleSeen map[any]struct{}) error { if !val.IsValid() { return discardInt(w.WriteString("null")) } @@ -197,7 +197,7 @@ func encode(w *ReEncoder, val reflect.Value, escaper BackslashEscaper, quote boo Err: err, } } - if err := jsonstring.EncodeStringFromBytes(w, escaper, text); err != nil { + if err := jsonstring.EncodeStringFromBytes(w, escaper, utf, val, text); err != nil { return err } default: @@ -309,14 +309,14 @@ func encode(w *ReEncoder, val reflect.Value, escaper BackslashEscaper, quote boo } else { if quote { var buf bytes.Buffer - if err := jsonstring.EncodeStringFromString(&buf, escaper, val.String()); err != nil { + if err := jsonstring.EncodeStringFromString(&buf, escaper, utf, val, val.String()); err != nil { return err } - if err := jsonstring.EncodeStringFromBytes(w, escaper, buf.Bytes()); err != nil { + if err := jsonstring.EncodeStringFromBytes(w, escaper, utf, val, buf.Bytes()); err != nil { return err } } else { - if err := jsonstring.EncodeStringFromString(w, escaper, val.String()); err != nil { + if err := jsonstring.EncodeStringFromString(w, escaper, utf, val, val.String()); err != nil { return err } } @@ -327,7 +327,7 @@ func encode(w *ReEncoder, val reflect.Value, escaper BackslashEscaper, quote boo return err } } else { - if err := encode(w, val.Elem(), escaper, quote, cycleDepth, cycleSeen); err != nil { + if err := encode(w, val.Elem(), escaper, utf, quote, cycleDepth, cycleSeen); err != nil { return err } } @@ -350,13 +350,13 @@ func encode(w *ReEncoder, val reflect.Value, escaper BackslashEscaper, quote boo } } empty = false - if err := jsonstring.EncodeStringFromString(w, escaper, field.Name); err != nil { + if err := jsonstring.EncodeStringFromString(w, escaper, utf, val, field.Name); err != nil { return err } if err := w.WriteByte(':'); err != nil { return err } - if err := encode(w, fVal, escaper, field.Quote, cycleDepth, cycleSeen); err != nil { + if err := encode(w, fVal, escaper, utf, field.Quote, cycleDepth, cycleSeen); err != nil { return err } 
} @@ -394,7 +394,7 @@ func encode(w *ReEncoder, val reflect.Value, escaper BackslashEscaper, quote boo for i := 0; iter.Next(); i++ { // TODO: Avoid buffering the map key var k strings.Builder - if err := encode(NewReEncoder(&k, ReEncoderConfig{BackslashEscape: escaper}), iter.Key(), escaper, false, cycleDepth, cycleSeen); err != nil { + if err := encode(NewReEncoder(&k, ReEncoderConfig{BackslashEscape: escaper, InvalidUTF8: utf}), iter.Key(), escaper, utf, false, cycleDepth, cycleSeen); err != nil { return err } kStr := k.String() @@ -403,7 +403,7 @@ func encode(w *ReEncoder, val reflect.Value, escaper BackslashEscaper, quote boo } if !strings.HasPrefix(kStr, `"`) { k.Reset() - if err := jsonstring.EncodeStringFromString(&k, escaper, kStr); err != nil { + if err := jsonstring.EncodeStringFromString(&k, escaper, utf, iter.Key(), kStr); err != nil { return err } kStr = k.String() @@ -427,7 +427,7 @@ func encode(w *ReEncoder, val reflect.Value, escaper BackslashEscaper, quote boo if err := w.WriteByte(':'); err != nil { return err } - if err := encode(w, kv.V, escaper, false, cycleDepth, cycleSeen); err != nil { + if err := encode(w, kv.V, escaper, utf, false, cycleDepth, cycleSeen); err != nil { return err } } @@ -491,12 +491,12 @@ func encode(w *ReEncoder, val reflect.Value, escaper BackslashEscaper, quote boo cycleSeen[ptr] = struct{}{} defer delete(cycleSeen, ptr) } - if err := encodeArray(w, val, escaper, cycleDepth, cycleSeen); err != nil { + if err := encodeArray(w, val, escaper, utf, cycleDepth, cycleSeen); err != nil { return err } } case reflect.Array: - if err := encodeArray(w, val, escaper, cycleDepth, cycleSeen); err != nil { + if err := encodeArray(w, val, escaper, utf, cycleDepth, cycleSeen); err != nil { return err } case reflect.Pointer: @@ -516,7 +516,7 @@ func encode(w *ReEncoder, val reflect.Value, escaper BackslashEscaper, quote boo cycleSeen[ptr] = struct{}{} defer delete(cycleSeen, ptr) } - if err := encode(w, val.Elem(), escaper, quote, cycleDepth, cycleSeen); err != nil { + if err := encode(w, val.Elem(), escaper, utf, quote, cycleDepth, cycleSeen); err != nil { return err } } @@ -529,7 +529,7 @@ func encode(w *ReEncoder, val reflect.Value, escaper BackslashEscaper, quote boo return nil } -func encodeArray(w *ReEncoder, val reflect.Value, escaper BackslashEscaper, cycleDepth uint, cycleSeen map[any]struct{}) error { +func encodeArray(w *ReEncoder, val reflect.Value, escaper BackslashEscaper, utf InvalidUTF8Mode, cycleDepth uint, cycleSeen map[any]struct{}) error { if err := w.WriteByte('['); err != nil { return err } @@ -540,7 +540,7 @@ func encodeArray(w *ReEncoder, val reflect.Value, escaper BackslashEscaper, cycl return err } } - if err := encode(w, val.Index(i), escaper, false, cycleDepth, cycleSeen); err != nil { + if err := encode(w, val.Index(i), escaper, utf, false, cycleDepth, cycleSeen); err != nil { return err } } diff --git a/encode_escape.go b/encode_escape.go index 97da6e9..c9e2bc9 100644 --- a/encode_escape.go +++ b/encode_escape.go @@ -6,12 +6,29 @@ package lowmemjson import ( "fmt" - "unicode/utf8" "git.lukeshu.com/go/lowmemjson/internal/jsonstring" ) -// BackslashEscapeMode identifies one of the three ways that a +// InvalidUTF8Mode identifies one of the 3 ways that an Encoder or +// ReEncoder can behave when encountering invalid UTF-8 in a string +// value: +// +// - Replace the byte with the Unicode replacement character U+FFFD. +// +// - Allow the byte through to the string-encoder, with an +// escape-mode of BackslashEscapeRawByte. 
+// +// - Emit a syntax error. +type InvalidUTF8Mode = jsonstring.InvalidUTF8Mode + +const ( + InvalidUTF8Replace = jsonstring.InvalidUTF8Replace + InvalidUTF8Preserve = jsonstring.InvalidUTF8Preserve + InvalidUTF8Error = jsonstring.InvalidUTF8Error +) + +// BackslashEscapeMode identifies one of the four ways that a // character may be represented in a JSON string: // // - literally (no backslash escaping) @@ -20,12 +37,18 @@ import ( // single-character) // // - as a long Unicode `\uXXXX` backslash sequence +// +// - as a raw byte; this allows you to emit invalid JSON; JSON must +// be valid UTF-8, but this allows you to emit arbitrary binary +// data. If the character does not satisfy `utf8.RuneSelf <= char +// <= 0xFF`, then the encoder will panic. type BackslashEscapeMode = jsonstring.BackslashEscapeMode const ( BackslashEscapeNone = jsonstring.BackslashEscapeNone BackslashEscapeShort = jsonstring.BackslashEscapeShort BackslashEscapeUnicode = jsonstring.BackslashEscapeUnicode + BackslashEscapeRawByte = jsonstring.BackslashEscapeRawByte ) func hexToInt(c byte) rune { @@ -96,14 +119,13 @@ func EscapeHTMLSafe(c rune, wasEscaped BackslashEscapeMode) BackslashEscapeMode // behavior of encoding/json. // // It is like EscapeHTMLSafe, but also uses long Unicode `\uXXXX` -// sequences for `\b`, `\f`, and the `\uFFFD` Unicode replacement -// character. +// sequences for `\b` and `\f` // // A ReEncoder uses EscapeDefault if a BackslashEscaper is not // specified. func EscapeDefault(c rune, wasEscaped BackslashEscapeMode) BackslashEscapeMode { switch c { - case '\b', '\f', utf8.RuneError: + case '\b', '\f': return BackslashEscapeUnicode default: return EscapeHTMLSafe(c, wasEscaped) @@ -115,11 +137,10 @@ func EscapeDefault(c rune, wasEscaped BackslashEscapeMode) BackslashEscapeMode { // SetEscapeHTML(false) called on it. // // It is like EscapeJSSafe, but also uses long Unicode `\uXXXX` -// sequences for `\b`, `\f`, and the `\uFFFD` Unicode replacement -// character. +// sequences for `\b` and `\f`. func EscapeDefaultNonHTMLSafe(c rune, wasEscaped BackslashEscapeMode) BackslashEscapeMode { switch c { - case '\b', '\f', utf8.RuneError: + case '\b', '\f': return BackslashEscapeUnicode default: return EscapeJSSafe(c, wasEscaped) diff --git a/errors.go b/errors.go index 516018c..da9de4d 100644 --- a/errors.go +++ b/errors.go @@ -142,7 +142,8 @@ func (e *EncodeWriteError) Unwrap() error { return e.Err } type EncodeTypeError = json.UnsupportedTypeError // An EncodeValueError is returned by Encode when attempting to encode -// an unsupported value (such as a datastructure with a cycle). +// an unsupported value (such as a datastructure with a cycle, or (if +// InvalidUTF8=InvalidUTF8Error) a string with invalid UTF-8). // // type UnsupportedValueError struct { // Value reflect.Value diff --git a/internal/jsonstring/encode_string.go b/internal/jsonstring/encode_string.go index fec2cc0..76bbb38 100644 --- a/internal/jsonstring/encode_string.go +++ b/internal/jsonstring/encode_string.go @@ -5,14 +5,25 @@ package jsonstring import ( + "encoding/json" "fmt" "io" + "reflect" "unicode/utf8" "git.lukeshu.com/go/lowmemjson/internal/fastio" "git.lukeshu.com/go/lowmemjson/internal/fastio/noescape" ) +// InvalidUTF8Mode is describe in the main lowmemjson package docs. +type InvalidUTF8Mode uint8 + +const ( + InvalidUTF8Replace InvalidUTF8Mode = iota + InvalidUTF8Preserve + InvalidUTF8Error +) + // BackslashEscapeMode is describe in the main lowmemjson package // docs. 
type BackslashEscapeMode uint8 @@ -21,6 +32,7 @@ const ( BackslashEscapeNone BackslashEscapeMode = iota BackslashEscapeShort BackslashEscapeUnicode + BackslashEscapeRawByte ) // BackslashEscaper is describe in the main lowmemjson package docs. @@ -96,19 +108,45 @@ func WriteStringChar(w fastio.AllWriter, c rune, escape BackslashEscapeMode) err default: // obey return writeStringUnicodeEscape(w, c) } + case BackslashEscapeRawByte: + switch { + case c < utf8.RuneSelf: + panic(fmt.Errorf("escaper returned BackslashEscapeRawByte for a character=%q < utf8.RuneSelf", c)) + case c > 0xFF: + panic(fmt.Errorf("escaper returned BackslashEscapeRawByte for a character=%q > 0xFF", c)) + default: + return w.WriteByte(byte(c)) + } default: - panic("escaper returned an invalid escape mode") + panic(fmt.Errorf("escaper returned an invalid escape mode=%d", escape)) } } -func EncodeStringFromString(w fastio.AllWriter, escaper BackslashEscaper, str string) error { +func EncodeStringFromString(w fastio.AllWriter, escaper BackslashEscaper, utf InvalidUTF8Mode, val reflect.Value, str string) error { if err := w.WriteByte('"'); err != nil { return err } - for _, c := range str { - if err := WriteStringChar(w, c, escaper(c, BackslashEscapeNone)); err != nil { + for i := 0; i < len(str); { + escaped := BackslashEscapeNone + c, size := utf8.DecodeRuneInString(str[i:]) + if c == utf8.RuneError && size == 1 { + switch utf { + case InvalidUTF8Replace: + escaped = BackslashEscapeUnicode + case InvalidUTF8Preserve: + escaped = BackslashEscapeRawByte + c = rune(str[i]) + case InvalidUTF8Error: + return &json.UnsupportedValueError{ + Value: val, + Str: fmt.Sprintf("invalid UTF-8 at byte offset %d: %#02x", i, str[i]), + } + } + } + if err := WriteStringChar(w, c, escaper(c, escaped)); err != nil { return err } + i += size } if err := w.WriteByte('"'); err != nil { return err @@ -116,13 +154,28 @@ func EncodeStringFromString(w fastio.AllWriter, escaper BackslashEscaper, str st return nil } -func EncodeStringFromBytes(w fastio.AllWriter, escaper BackslashEscaper, str []byte) error { +func EncodeStringFromBytes(w fastio.AllWriter, escaper BackslashEscaper, utf InvalidUTF8Mode, val reflect.Value, str []byte) error { if err := w.WriteByte('"'); err != nil { return err } for i := 0; i < len(str); { + escaped := BackslashEscapeNone c, size := utf8.DecodeRune(str[i:]) - if err := WriteStringChar(w, c, escaper(c, BackslashEscapeNone)); err != nil { + if c == utf8.RuneError && size == 1 { + switch utf { + case InvalidUTF8Replace: + escaped = BackslashEscapeUnicode + case InvalidUTF8Preserve: + escaped = BackslashEscapeRawByte + c = rune(str[i]) + case InvalidUTF8Error: + return &json.UnsupportedValueError{ + Value: val, + Str: fmt.Sprintf("invalid UTF-8 at byte offset %d: %#02x", i, str[i]), + } + } + } + if err := WriteStringChar(w, c, escaper(c, escaped)); err != nil { return err } i += size diff --git a/reencode.go b/reencode.go index fd848f8..1a9999b 100644 --- a/reencode.go +++ b/reencode.go @@ -54,6 +54,13 @@ type ReEncoderConfig struct { // this is different than the usual behavior. ForceTrailingNewlines bool + // A JSON document is specified to be a sequence of Unicode + // codepoints; InvalidUTF8 controls how the *ReEncoder behaves + // when it encounters invalid UTF-8 bytes in a JSON string + // (i.e. the string is not representable as a sequence of + // Unicode codepoints, and thus the document is invalid JSON). + InvalidUTF8 InvalidUTF8Mode + // Returns whether a given character in a string should be // backslash-escaped. 
The bool argument is whether it was // \u-escaped in the input. This does not affect characters @@ -119,6 +126,7 @@ func NewReEncoder(out io.Writer, cfg ReEncoderConfig) *ReEncoder { return &ReEncoder{ out: module, esc: escaper, + utf: cfg.InvalidUTF8, allowMultipleValues: cfg.AllowMultipleValues, } } @@ -134,6 +142,7 @@ func NewReEncoder(out io.Writer, cfg ReEncoderConfig) *ReEncoder { type ReEncoder struct { out reEncoderModule esc BackslashEscaper + utf InvalidUTF8Mode allowMultipleValues bool // state: .Write's/.WriteString's/.WriteRune's utf8-decoding buffer @@ -169,43 +178,54 @@ var ( _ io.Closer = (*ReEncoder)(nil) ) -func (enc *ReEncoder) getRuneFromBytes(str []byte, pos int) (c rune, size int, full bool) { +func (enc *ReEncoder) getRuneFromBytes(str []byte, pos int) (c rune, size int, full, isRune bool) { + var tmp []byte if pos < enc.bufLen { - var tmp [utf8.UTFMax]byte - n := copy(tmp[:], enc.buf[pos:enc.bufLen]) - n += copy(tmp[n:], str) - c, size := utf8.DecodeRune(tmp[:n]) - if c == utf8.RuneError && size <= 1 { - return c, size, utf8.FullRune(tmp[:n]) - } - return c, size, true + var buf [utf8.UTFMax]byte + n := copy(buf[:], enc.buf[pos:enc.bufLen]) + n += copy(buf[n:], str) + tmp = buf[:n] } else { - tmp := str[pos-enc.bufLen:] - c, size := utf8.DecodeRune(tmp) - if c == utf8.RuneError && size <= 1 { - return c, size, utf8.FullRune(tmp) - } - return c, size, true + tmp = str[pos-enc.bufLen:] + } + c, size = utf8.DecodeRune(tmp) + switch { + case c == utf8.RuneError && size <= 1 && !utf8.FullRune(tmp): + return c, size, false, true + case c == utf8.RuneError && size == 1 && enc.utf != InvalidUTF8Replace: + return rune(tmp[0]), 1, true, false + default: + return c, size, true, true } } -func (enc *ReEncoder) getRuneFromString(str string, pos int) (c rune, size int, full bool) { +func (enc *ReEncoder) getRuneFromString(str string, pos int) (c rune, size int, full, isRune bool) { if pos < enc.bufLen { - var tmp [utf8.UTFMax]byte - n := copy(tmp[:], enc.buf[pos:enc.bufLen]) - n += copy(tmp[n:], str) - c, size := utf8.DecodeRune(tmp[:n]) - if c == utf8.RuneError && size <= 1 { - return c, size, utf8.FullRune(tmp[:n]) + var buf [utf8.UTFMax]byte + var tmp []byte + n := copy(buf[:], enc.buf[pos:enc.bufLen]) + n += copy(buf[n:], str) + tmp = buf[:n] + c, size = utf8.DecodeRune(tmp) + switch { + case c == utf8.RuneError && size <= 1 && !utf8.FullRune(tmp): + return c, size, false, true + case c == utf8.RuneError && size == 1 && enc.utf != InvalidUTF8Replace: + return rune(tmp[0]), 1, true, false + default: + return c, size, true, true } - return c, size, true } else { tmp := str[pos-enc.bufLen:] c, size := utf8.DecodeRuneInString(tmp) - if c == utf8.RuneError && size <= 1 { - return c, size, utf8.FullRuneInString(tmp) + switch { + case c == utf8.RuneError && size <= 1 && !utf8.FullRuneInString(tmp): + return c, size, false, true + case c == utf8.RuneError && size == 1 && enc.utf != InvalidUTF8Replace: + return rune(tmp[0]), 1, true, false + default: + return c, size, true, true } - return c, size, true } } @@ -223,7 +243,7 @@ func (enc *ReEncoder) Write(str []byte) (int, error) { } var n int for { - c, size, full := enc.getRuneFromBytes(str, n) + c, size, full, isRune := enc.getRuneFromBytes(str, n) if !full { if n < enc.bufLen { l := copy(enc.buf[:], enc.buf[n:enc.bufLen]) @@ -234,7 +254,13 @@ func (enc *ReEncoder) Write(str []byte) (int, error) { } return len(str), nil } - enc.handleRune(c, size) + if enc.utf == InvalidUTF8Error && !isRune { + return n, &ReEncodeSyntaxError{ + 
Offset: enc.inputPos, + Err: fmt.Errorf("invalid UTF-8: %#02x", c), + } + } + enc.handleRune(c, size, isRune) if enc.err != nil { return n, enc.err } @@ -250,7 +276,7 @@ func (enc *ReEncoder) WriteString(str string) (int, error) { } var n int for { - c, size, full := enc.getRuneFromString(str, n) + c, size, full, isRune := enc.getRuneFromString(str, n) if !full { if n < enc.bufLen { l := copy(enc.buf[:], enc.buf[n:enc.bufLen]) @@ -261,7 +287,13 @@ func (enc *ReEncoder) WriteString(str string) (int, error) { } return len(str), nil } - enc.handleRune(c, size) + if enc.utf == InvalidUTF8Error && !isRune { + return n, &ReEncodeSyntaxError{ + Offset: enc.inputPos, + Err: fmt.Errorf("invalid UTF-8: %#02x", c), + } + } + enc.handleRune(c, size, isRune) if enc.err != nil { return n, enc.err } @@ -298,7 +330,7 @@ func (enc *ReEncoder) Close() error { return enc.err } if len(enc.barriers) == 0 { - if err := enc.handleRuneType(0, jsonparse.RuneTypeEOF, enc.stackSize()); err != nil { + if err := enc.handleRuneType(0, jsonparse.RuneTypeEOF, enc.stackSize(), true); err != nil { enc.err = &ReEncodeWriteError{ Err: err, Offset: enc.inputPos, @@ -312,7 +344,8 @@ func (enc *ReEncoder) Close() error { return nil } -func (enc *ReEncoder) handleRune(c rune, size int) { +// isRune=false indicates that 'c' is a raw byte from invalid UTF-8. +func (enc *ReEncoder) handleRune(c rune, size int, isRune bool) { t, err := enc.par.HandleRune(c) if err != nil { enc.err = &ReEncodeSyntaxError{ @@ -321,7 +354,7 @@ func (enc *ReEncoder) handleRune(c rune, size int) { } return } - if err := enc.handleRuneType(c, t, enc.stackSize()); err != nil { + if err := enc.handleRuneType(c, t, enc.stackSize(), isRune); err != nil { enc.err = &ReEncodeWriteError{ Err: err, Offset: enc.inputPos, @@ -370,7 +403,7 @@ func (enc *ReEncoder) stackSize() int { return sz } -func (enc *ReEncoder) handleRuneType(c rune, t jsonparse.RuneType, stackSize int) error { +func (enc *ReEncoder) handleRuneType(c rune, t jsonparse.RuneType, stackSize int, isRune bool) error { switch t { case jsonparse.RuneTypeStringEsc, jsonparse.RuneTypeStringEscU: return nil @@ -410,6 +443,10 @@ func (enc *ReEncoder) handleRuneType(c rune, t jsonparse.RuneType, stackSize int if t > jsonparse.RuneTypeEOF { panic(fmt.Errorf("should not happen: handleRune called with %#v", t)) } - return enc.out.HandleRune(c, t, BackslashEscapeNone, stackSize) + esc := BackslashEscapeNone + if !isRune { + esc = BackslashEscapeRawByte + } + return enc.out.HandleRune(c, t, esc, stackSize) } } -- cgit v1.2.3 From 1a5b0561f53441d8a259a5096281699b5af16a6c Mon Sep 17 00:00:00 2001 From: Luke Shumaker Date: Thu, 16 Feb 2023 16:53:53 -0700 Subject: reencode: Add CompactFloats --- ReleaseNotes.md | 9 ++++++++- compat/json/compat_test.go | 3 +++ reencode.go | 10 ++++++++-- reencode_test.go | 6 ++++-- 4 files changed, 23 insertions(+), 5 deletions(-) diff --git a/ReleaseNotes.md b/ReleaseNotes.md index b1647da..ae147b1 100644 --- a/ReleaseNotes.md +++ b/ReleaseNotes.md @@ -4,11 +4,15 @@ User-facing changes: - - General bugfixes: + - General Changes: + Encoder, ReEncoder: Now correctly trims unnecessary the trailing '0's from the fraction-part when compacting numbers. + + ReEncoder: No longer compact floating-point numbers by + default, add a `CompactFloats` ReEncoderConfig option to + control this. 
+ - Compatibility bugfixes: + compat/json.Valid: No longer considers truncated JSON @@ -17,6 +21,9 @@ + compat/json.Compact, compat/json.Indent: Don't write to the destination buffer if there is a syntax error. + + compat/json.Compact, compat/json.Indent: No longer compact + floating-point numbers; as `encoding/json` doesn't. + - Unicode: + Feature: Encoder, ReEncoder: Add an `InvalidUTF8` diff --git a/compat/json/compat_test.go b/compat/json/compat_test.go index d989a4d..128bd1b 100644 --- a/compat/json/compat_test.go +++ b/compat/json/compat_test.go @@ -46,6 +46,7 @@ func TestCompatCompact(t *testing.T) { "trunc": {In: `{`, Out: ``, Err: `unexpected end of JSON input`}, "object": {In: `{}`, Out: `{}`}, "non-utf8": {In: "\"\x85\xcd\"", Out: "\"\x85\xcd\""}, + "float": {In: `1.200e003`, Out: `1.200e003`}, } for tcName, tc := range testcases { tc := tc @@ -75,6 +76,7 @@ func TestCompatIndent(t *testing.T) { "trunc": {In: `{`, Out: ``, Err: `unexpected end of JSON input`}, "object": {In: `{}`, Out: `{}`}, "non-utf8": {In: "\"\x85\xcd\"", Out: "\"\x85\xcd\""}, + "float": {In: `1.200e003`, Out: `1.200e003`}, } for tcName, tc := range testcases { tc := tc @@ -103,6 +105,7 @@ func TestCompatMarshal(t *testing.T) { testcases := map[string]testcase{ "non-utf8": {In: "\x85\xcd", Out: "\"\\ufffd\\ufffd\""}, "urc": {In: "\ufffd", Out: "\"\ufffd\""}, + "float": {In: 1.2e3, Out: `1200`}, } for tcName, tc := range testcases { tc := tc diff --git a/reencode.go b/reencode.go index 1a9999b..1943b9c 100644 --- a/reencode.go +++ b/reencode.go @@ -54,6 +54,10 @@ type ReEncoderConfig struct { // this is different than the usual behavior. ForceTrailingNewlines bool + // CompactFloats causes the *ReEncoder to trim unnecessary '0' + // digits from floating-point number values. + CompactFloats bool + // A JSON document is specified to be a sequence of Unicode // codepoints; InvalidUTF8 controls how the *ReEncoder behaves // when it encounters invalid UTF-8 bytes in a JSON string @@ -109,8 +113,10 @@ func NewReEncoder(out io.Writer, cfg ReEncoderConfig) *ReEncoder { } // Numbers - module = &reEncodeCompactNum{ - out: module, + if cfg.CompactFloats { + module = &reEncodeCompactNum{ + out: module, + } } // Strings diff --git a/reencode_test.go b/reencode_test.go index bc6d246..715e976 100644 --- a/reencode_test.go +++ b/reencode_test.go @@ -135,7 +135,8 @@ func TestReEncode(t *testing.T) { }, "numbers": { enc: ReEncoderConfig{ - Compact: true, + Compact: true, + CompactFloats: true, }, in: []any{ Number("1.200e003"), @@ -144,7 +145,8 @@ func TestReEncode(t *testing.T) { }, "numbers-zero": { enc: ReEncoderConfig{ - Compact: true, + Compact: true, + CompactFloats: true, }, in: []any{ Number("1.000e000"), -- cgit v1.2.3 From 2eb60b8be25a4b0fe3f1c5d5ca302e7e68190bad Mon Sep 17 00:00:00 2001 From: Luke Shumaker Date: Thu, 16 Feb 2023 17:20:41 -0700 Subject: compat/json: Don't do actual JSON parsing in HTMLEscape --- ReleaseNotes.md | 5 +++++ compat/json/compat.go | 21 ++++++++++++++++++++- compat/json/compat_test.go | 21 +++++++++++++++++++++ internal/jsonstring/encode_string.go | 6 +++--- 4 files changed, 49 insertions(+), 4 deletions(-) diff --git a/ReleaseNotes.md b/ReleaseNotes.md index ae147b1..c949fd6 100644 --- a/ReleaseNotes.md +++ b/ReleaseNotes.md @@ -24,6 +24,11 @@ + compat/json.Compact, compat/json.Indent: No longer compact floating-point numbers; as `encoding/json` doesn't. + + compat/json.HTMLEscape: Just look for problematic UTF-8 runes, + don't actually parse as JSON. 
This is consistent with the + function's lack of an `error` return value, and with the + behavior of `encoding/json`. + - Unicode: + Feature: Encoder, ReEncoder: Add an `InvalidUTF8` diff --git a/compat/json/compat.go b/compat/json/compat.go index d326514..edc6908 100644 --- a/compat/json/compat.go +++ b/compat/json/compat.go @@ -11,10 +11,13 @@ import ( "bytes" "encoding/json" "errors" + "fmt" "io" "strconv" + "unicode/utf8" "git.lukeshu.com/go/lowmemjson" + "git.lukeshu.com/go/lowmemjson/internal/jsonstring" ) //nolint:stylecheck // ST1021 False positive; these aren't comments on individual types. @@ -144,7 +147,23 @@ func convertReEncodeError(err error) error { } func HTMLEscape(dst *bytes.Buffer, src []byte) { - _, _ = lowmemjson.NewReEncoder(dst, lowmemjson.ReEncoderConfig{}).Write(src) + for n := 0; n < len(src); { + c, size := utf8.DecodeRune(src[n:]) + if c == utf8.RuneError && size == 1 { + dst.WriteByte(src[n]) + } else { + mode := lowmemjson.EscapeHTMLSafe(c, lowmemjson.BackslashEscapeNone) + switch mode { + case lowmemjson.BackslashEscapeNone: + dst.WriteRune(c) + case lowmemjson.BackslashEscapeUnicode: + _ = jsonstring.WriteStringUnicodeEscape(dst, c) + default: + panic(fmt.Errorf("lowmemjson.EscapeHTMLSafe returned an unexpected escape mode=%d", mode)) + } + } + n += size + } } func reencode(dst io.Writer, src []byte, cfg lowmemjson.ReEncoderConfig) error { diff --git a/compat/json/compat_test.go b/compat/json/compat_test.go index 128bd1b..0c14a60 100644 --- a/compat/json/compat_test.go +++ b/compat/json/compat_test.go @@ -11,6 +11,27 @@ import ( "github.com/stretchr/testify/assert" ) +func TestCompatHTMLEscape(t *testing.T) { + t.Parallel() + type testcase struct { + In string + Out string + } + testcases := map[string]testcase{ + "invalid": {In: `x`, Out: `x`}, + } + for tcName, tc := range testcases { + tc := tc + t.Run(tcName, func(t *testing.T) { + t.Parallel() + t.Logf("in=%q", tc.In) + var dst bytes.Buffer + HTMLEscape(&dst, []byte(tc.In)) + assert.Equal(t, tc.Out, dst.String()) + }) + } +} + func TestCompatValid(t *testing.T) { t.Parallel() type testcase struct { diff --git a/internal/jsonstring/encode_string.go b/internal/jsonstring/encode_string.go index 76bbb38..2488cb2 100644 --- a/internal/jsonstring/encode_string.go +++ b/internal/jsonstring/encode_string.go @@ -38,7 +38,7 @@ const ( // BackslashEscaper is describe in the main lowmemjson package docs. 
type BackslashEscaper = func(rune, BackslashEscapeMode) BackslashEscapeMode -func writeStringUnicodeEscape(w io.Writer, c rune) error { +func WriteStringUnicodeEscape(w io.Writer, c rune) error { const alphabet = "0123456789abcdef" buf := [6]byte{ '\\', @@ -84,7 +84,7 @@ func WriteStringChar(w fastio.AllWriter, c rune, escape BackslashEscapeMode) err case '\b', '\f', '\n', '\r', '\t': // short-escape if possible return writeStringShortEscape(w, c) default: - return writeStringUnicodeEscape(w, c) + return WriteStringUnicodeEscape(w, c) } case c == '"' || c == '\\': // override, gotta escape these return writeStringShortEscape(w, c) @@ -106,7 +106,7 @@ func WriteStringChar(w fastio.AllWriter, c rune, escape BackslashEscapeMode) err _, err := w.WriteRune(c) return err default: // obey - return writeStringUnicodeEscape(w, c) + return WriteStringUnicodeEscape(w, c) } case BackslashEscapeRawByte: switch { -- cgit v1.2.3 From a87d6cbbb51a19071c5c742ef3c91bbb90a727c6 Mon Sep 17 00:00:00 2001 From: Luke Shumaker Date: Thu, 16 Feb 2023 17:32:17 -0700 Subject: compat/json: Indent: Preserve trailing whitespace --- ReleaseNotes.md | 3 +++ compat/json/compat.go | 21 ++++++++++++++++++++- compat/json/compat_test.go | 7 +++++++ compat/json/testcompat_test.go | 9 --------- 4 files changed, 30 insertions(+), 10 deletions(-) diff --git a/ReleaseNotes.md b/ReleaseNotes.md index c949fd6..73df694 100644 --- a/ReleaseNotes.md +++ b/ReleaseNotes.md @@ -29,6 +29,9 @@ function's lack of an `error` return value, and with the behavior of `encoding/json`. + + compat/json.Indent: Preserve trailing whitespace, same as + `encoding/json`. + - Unicode: + Feature: Encoder, ReEncoder: Add an `InvalidUTF8` diff --git a/compat/json/compat.go b/compat/json/compat.go index edc6908..d33f278 100644 --- a/compat/json/compat.go +++ b/compat/json/compat.go @@ -188,6 +188,15 @@ func Compact(dst *bytes.Buffer, src []byte) error { return err } +func isSpace(c byte) bool { + switch c { + case 0x0020, 0x000A, 0x000D, 0x0009: + return true + default: + return false + } +} + func Indent(dst *bytes.Buffer, src []byte, prefix, indent string) error { start := dst.Len() err := reencode(dst, src, lowmemjson.ReEncoderConfig{ @@ -198,8 +207,18 @@ func Indent(dst *bytes.Buffer, src []byte, prefix, indent string) error { }) if err != nil { dst.Truncate(start) + return err } - return err + + // Preserve trailing whitespace. 
+ lastNonWS := len(src) - 1 + for ; lastNonWS >= 0 && isSpace(src[lastNonWS]); lastNonWS-- { + } + if _, err := dst.Write(src[lastNonWS+1:]); err != nil { + return err + } + + return nil } func Valid(data []byte) bool { diff --git a/compat/json/compat_test.go b/compat/json/compat_test.go index 0c14a60..c83ca7e 100644 --- a/compat/json/compat_test.go +++ b/compat/json/compat_test.go @@ -98,6 +98,13 @@ func TestCompatIndent(t *testing.T) { "object": {In: `{}`, Out: `{}`}, "non-utf8": {In: "\"\x85\xcd\"", Out: "\"\x85\xcd\""}, "float": {In: `1.200e003`, Out: `1.200e003`}, + "tailws0": {In: `0`, Out: `0`}, + "tailws1": {In: `0 `, Out: `0 `}, + "tailws2": {In: `0 `, Out: `0 `}, + "tailws3": {In: "0\n", Out: "0\n"}, + "headws1": {In: ` 0`, Out: `0`}, + "objws1": {In: `{"a" : 1}`, Out: "{\n>.\"a\": 1\n>}"}, + "objws2": {In: "{\"a\"\n:\n1}", Out: "{\n>.\"a\": 1\n>}"}, } for tcName, tc := range testcases { tc := tc diff --git a/compat/json/testcompat_test.go b/compat/json/testcompat_test.go index e89b4b4..73153d9 100644 --- a/compat/json/testcompat_test.go +++ b/compat/json/testcompat_test.go @@ -46,15 +46,6 @@ const ( startDetectingCyclesAfter = 1000 ) -func isSpace(c byte) bool { - switch c { - case 0x0020, 0x000A, 0x000D, 0x0009: - return true - default: - return false - } -} - type encodeState struct { bytes.Buffer } -- cgit v1.2.3 From 00187950437a10952b82353405e5ba4b4515fb29 Mon Sep 17 00:00:00 2001 From: Luke Shumaker Date: Thu, 16 Feb 2023 19:06:46 -0700 Subject: reencode: Don't normalize the capitalization of \uXXXX hex escapes --- ReleaseNotes.md | 5 +++ compat/json/compat.go | 2 +- compat/json/compat_test.go | 54 +++++++++++++++++++------------- encode_escape.go | 39 +++++++++++++++++++++-- internal/jsonstring/encode_string.go | 60 ++++++++++++++++++++++++++---------- reencode.go | 3 +- 6 files changed, 121 insertions(+), 42 deletions(-) diff --git a/ReleaseNotes.md b/ReleaseNotes.md index 73df694..a8496e0 100644 --- a/ReleaseNotes.md +++ b/ReleaseNotes.md @@ -39,6 +39,11 @@ BackslashEscapeMode to allow emitted strings to contain invalid UTF-8. + + Feature: ReEncoder: No longer unconditionally normalizes + `\uXXXX` hex characters to lower-case; now this is controlled + by the `BackslashEscaper` (and the default is now to leave the + capitalization alone). + + Change: EscapeDefault, EscapeDefaultNonHTMLSafe: No longer force long Unicode `\uXXXX` sequences for the U+FFFD Unicode replacement character. 
diff --git a/compat/json/compat.go b/compat/json/compat.go index d33f278..3a9bd6c 100644 --- a/compat/json/compat.go +++ b/compat/json/compat.go @@ -157,7 +157,7 @@ func HTMLEscape(dst *bytes.Buffer, src []byte) { case lowmemjson.BackslashEscapeNone: dst.WriteRune(c) case lowmemjson.BackslashEscapeUnicode: - _ = jsonstring.WriteStringUnicodeEscape(dst, c) + _ = jsonstring.WriteStringUnicodeEscape(dst, c, mode) default: panic(fmt.Errorf("lowmemjson.EscapeHTMLSafe returned an unexpected escape mode=%d", mode)) } diff --git a/compat/json/compat_test.go b/compat/json/compat_test.go index c83ca7e..29a8b37 100644 --- a/compat/json/compat_test.go +++ b/compat/json/compat_test.go @@ -18,7 +18,10 @@ func TestCompatHTMLEscape(t *testing.T) { Out string } testcases := map[string]testcase{ - "invalid": {In: `x`, Out: `x`}, + "invalid": {In: `x`, Out: `x`}, + "hex-lower": {In: `"\uabcd"`, Out: `"\uabcd"`}, + "hex-upper": {In: `"\uABCD"`, Out: `"\uABCD"`}, + "hex-mixed": {In: `"\uAbCd"`, Out: `"\uAbCd"`}, } for tcName, tc := range testcases { tc := tc @@ -39,11 +42,14 @@ func TestCompatValid(t *testing.T) { Exp bool } testcases := map[string]testcase{ - "empty": {In: ``, Exp: false}, - "num": {In: `1`, Exp: true}, - "trunc": {In: `{`, Exp: false}, - "object": {In: `{}`, Exp: true}, - "non-utf8": {In: "\"\x85\xcd\"", Exp: false}, // https://github.com/golang/go/issues/58517 + "empty": {In: ``, Exp: false}, + "num": {In: `1`, Exp: true}, + "trunc": {In: `{`, Exp: false}, + "object": {In: `{}`, Exp: true}, + "non-utf8": {In: "\"\x85\xcd\"", Exp: false}, // https://github.com/golang/go/issues/58517 + "hex-lower": {In: `"\uabcd"`, Exp: true}, + "hex-upper": {In: `"\uABCD"`, Exp: true}, + "hex-mixed": {In: `"\uAbCd"`, Exp: true}, } for tcName, tc := range testcases { tc := tc @@ -64,10 +70,13 @@ func TestCompatCompact(t *testing.T) { Err string } testcases := map[string]testcase{ - "trunc": {In: `{`, Out: ``, Err: `unexpected end of JSON input`}, - "object": {In: `{}`, Out: `{}`}, - "non-utf8": {In: "\"\x85\xcd\"", Out: "\"\x85\xcd\""}, - "float": {In: `1.200e003`, Out: `1.200e003`}, + "trunc": {In: `{`, Out: ``, Err: `unexpected end of JSON input`}, + "object": {In: `{}`, Out: `{}`}, + "non-utf8": {In: "\"\x85\xcd\"", Out: "\"\x85\xcd\""}, + "float": {In: `1.200e003`, Out: `1.200e003`}, + "hex-lower": {In: `"\uabcd"`, Out: `"\uabcd"`}, + "hex-upper": {In: `"\uABCD"`, Out: `"\uABCD"`}, + "hex-mixed": {In: `"\uAbCd"`, Out: `"\uAbCd"`}, } for tcName, tc := range testcases { tc := tc @@ -94,17 +103,20 @@ func TestCompatIndent(t *testing.T) { Err string } testcases := map[string]testcase{ - "trunc": {In: `{`, Out: ``, Err: `unexpected end of JSON input`}, - "object": {In: `{}`, Out: `{}`}, - "non-utf8": {In: "\"\x85\xcd\"", Out: "\"\x85\xcd\""}, - "float": {In: `1.200e003`, Out: `1.200e003`}, - "tailws0": {In: `0`, Out: `0`}, - "tailws1": {In: `0 `, Out: `0 `}, - "tailws2": {In: `0 `, Out: `0 `}, - "tailws3": {In: "0\n", Out: "0\n"}, - "headws1": {In: ` 0`, Out: `0`}, - "objws1": {In: `{"a" : 1}`, Out: "{\n>.\"a\": 1\n>}"}, - "objws2": {In: "{\"a\"\n:\n1}", Out: "{\n>.\"a\": 1\n>}"}, + "trunc": {In: `{`, Out: ``, Err: `unexpected end of JSON input`}, + "object": {In: `{}`, Out: `{}`}, + "non-utf8": {In: "\"\x85\xcd\"", Out: "\"\x85\xcd\""}, + "float": {In: `1.200e003`, Out: `1.200e003`}, + "tailws0": {In: `0`, Out: `0`}, + "tailws1": {In: `0 `, Out: `0 `}, + "tailws2": {In: `0 `, Out: `0 `}, + "tailws3": {In: "0\n", Out: "0\n"}, + "headws1": {In: ` 0`, Out: `0`}, + "objws1": {In: `{"a" : 1}`, Out: "{\n>.\"a\": 
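
For reference, a minimal usage sketch of the configuration knobs introduced across this series. It is illustrative only and not part of any patch above; it assumes nothing beyond the identifiers visible in the diffs (ReEncoderConfig's Compact field, the new CompactFloats and InvalidUTF8 fields, the InvalidUTF8Error constant, and the Write-then-Close pattern that the compat/json.Valid fix relies on):

package main

import (
	"fmt"
	"strings"

	"git.lukeshu.com/go/lowmemjson"
)

func main() {
	var out strings.Builder
	// Floating-point values are now only compacted when CompactFloats is
	// set, and the InvalidUTF8 option selects how malformed bytes in
	// strings are handled (replace, preserve, or error).
	enc := lowmemjson.NewReEncoder(&out, lowmemjson.ReEncoderConfig{
		Compact:       true,
		CompactFloats: true,
		InvalidUTF8:   lowmemjson.InvalidUTF8Error,
	})
	if _, err := enc.Write([]byte(`[1.200e003]`)); err != nil {
		fmt.Println("syntax error:", err) // e.g. invalid UTF-8 when using InvalidUTF8Error
		return
	}
	// Close is what reports truncated documents such as `{`, which is
	// what the compat/json.Valid fix depends on.
	if err := enc.Close(); err != nil {
		fmt.Println("syntax error:", err)
		return
	}
	fmt.Println(out.String()) // should print [1.2e3], matching the "numbers" test case
}

With InvalidUTF8Preserve instead (the mode compat/json.Compact and compat/json.Indent now use), malformed bytes in strings pass through to the output unchanged; with InvalidUTF8Replace they are substituted with U+FFFD.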
1\n>}"}, + "objws2": {In: "{\"a\"\n:\n1}", Out: "{\n>.\"a\": 1\n>}"}, + "hex-lower": {In: `"\uabcd"`, Out: `"\uabcd"`}, + "hex-upper": {In: `"\uABCD"`, Out: `"\uABCD"`}, + "hex-mixed": {In: `"\uAbCd"`, Out: `"\uAbCd"`}, } for tcName, tc := range testcases { tc := tc diff --git a/encode_escape.go b/encode_escape.go index c9e2bc9..664c762 100644 --- a/encode_escape.go +++ b/encode_escape.go @@ -36,7 +36,8 @@ const ( // - as a short "well-known" `\X` backslash sequence (where `X` is a // single-character) // -// - as a long Unicode `\uXXXX` backslash sequence +// - as a long Unicode `\uXXXX` backslash sequence (with 16 +// permutations of capitalization) // // - as a raw byte; this allows you to emit invalid JSON; JSON must // be valid UTF-8, but this allows you to emit arbitrary binary @@ -47,8 +48,29 @@ type BackslashEscapeMode = jsonstring.BackslashEscapeMode const ( BackslashEscapeNone = jsonstring.BackslashEscapeNone BackslashEscapeShort = jsonstring.BackslashEscapeShort - BackslashEscapeUnicode = jsonstring.BackslashEscapeUnicode BackslashEscapeRawByte = jsonstring.BackslashEscapeRawByte + + BackslashEscapeUnicodeXXXX = jsonstring.BackslashEscapeUnicodeXXXX + BackslashEscapeUnicodeXXXx = jsonstring.BackslashEscapeUnicodeXXXx + BackslashEscapeUnicodeXXxX = jsonstring.BackslashEscapeUnicodeXXxX + BackslashEscapeUnicodeXXxx = jsonstring.BackslashEscapeUnicodeXXxx + BackslashEscapeUnicodeXxXX = jsonstring.BackslashEscapeUnicodeXxXX + BackslashEscapeUnicodeXxXx = jsonstring.BackslashEscapeUnicodeXxXx + BackslashEscapeUnicodeXxxX = jsonstring.BackslashEscapeUnicodeXxxX + BackslashEscapeUnicodeXxxx = jsonstring.BackslashEscapeUnicodeXxxx + BackslashEscapeUnicodexXXX = jsonstring.BackslashEscapeUnicodexXXX + BackslashEscapeUnicodexXXx = jsonstring.BackslashEscapeUnicodexXXx + BackslashEscapeUnicodexXxX = jsonstring.BackslashEscapeUnicodexXxX + BackslashEscapeUnicodexXxx = jsonstring.BackslashEscapeUnicodexXxx + BackslashEscapeUnicodexxXX = jsonstring.BackslashEscapeUnicodexxXX + BackslashEscapeUnicodexxXx = jsonstring.BackslashEscapeUnicodexxXx + BackslashEscapeUnicodexxxX = jsonstring.BackslashEscapeUnicodexxxX + BackslashEscapeUnicodexxxx = jsonstring.BackslashEscapeUnicodexxxx + + BackslashEscapeUnicodeMin = jsonstring.BackslashEscapeUnicodeMin + BackslashEscapeUnicodeMax = jsonstring.BackslashEscapeUnicodeMax + + BackslashEscapeUnicode = jsonstring.BackslashEscapeUnicode // back-compat ) func hexToInt(c byte) rune { @@ -72,13 +94,24 @@ func hexToRune(a, b, c, d byte) rune { hexToInt(d)<<0 } +func hexToMode(a, b, c, d byte) BackslashEscapeMode { + // The 0b0010_0000 bit is the ASCII "lowercase bit". + return BackslashEscapeUnicodeMin + BackslashEscapeMode(0| + ((a&0b0010_0000)>>2)| + ((b&0b0010_0000)>>3)| + ((c&0b0010_0000)>>4)| + ((d&0b0010_0000)>>5)) +} + // A BackslashEscaper controls how a ReEncoder emits a character in a // JSON string. The `rune` argument is the character being // considered, and the `BackslashEscapeMode` argument is how it was // originally encoded in the input. // // The ReEncoder will panic if a BackslashEscaper returns an unknown -// BackslashEscapeMode. +// BackslashEscapeMode. However, a BackslashEscaper should be +// permissive of BackslashEscapeModes it doesn't recognize; it is safe +// to just return them unmodified. 
type BackslashEscaper = func(rune, BackslashEscapeMode) BackslashEscapeMode // EscapePreserve is a BackslashEscaper that preserves the original diff --git a/internal/jsonstring/encode_string.go b/internal/jsonstring/encode_string.go index 2488cb2..1416b3e 100644 --- a/internal/jsonstring/encode_string.go +++ b/internal/jsonstring/encode_string.go @@ -31,22 +31,49 @@ type BackslashEscapeMode uint8 const ( BackslashEscapeNone BackslashEscapeMode = iota BackslashEscapeShort - BackslashEscapeUnicode BackslashEscapeRawByte + + // It is significant to the implementation that if X=binary-0 + // and x=binary-1, then these "BackslashEscapeUnicode" + // constants are counting in-order from 0 to 15. + + BackslashEscapeUnicodeXXXX + BackslashEscapeUnicodeXXXx + BackslashEscapeUnicodeXXxX + BackslashEscapeUnicodeXXxx + BackslashEscapeUnicodeXxXX + BackslashEscapeUnicodeXxXx + BackslashEscapeUnicodeXxxX + BackslashEscapeUnicodeXxxx + BackslashEscapeUnicodexXXX + BackslashEscapeUnicodexXXx + BackslashEscapeUnicodexXxX + BackslashEscapeUnicodexXxx + BackslashEscapeUnicodexxXX + BackslashEscapeUnicodexxXx + BackslashEscapeUnicodexxxX + BackslashEscapeUnicodexxxx + + BackslashEscapeUnicodeMin = BackslashEscapeUnicodeXXXX + BackslashEscapeUnicodeMax = BackslashEscapeUnicodexxxx + + BackslashEscapeUnicode = BackslashEscapeUnicodexxxx // back-compat ) // BackslashEscaper is describe in the main lowmemjson package docs. type BackslashEscaper = func(rune, BackslashEscapeMode) BackslashEscapeMode -func WriteStringUnicodeEscape(w io.Writer, c rune) error { - const alphabet = "0123456789abcdef" +func WriteStringUnicodeEscape(w io.Writer, c rune, mode BackslashEscapeMode) error { + const alphabet = "0123456789ABCDEF" + _mode := byte(mode - BackslashEscapeUnicodeMin) buf := [6]byte{ '\\', 'u', - alphabet[(c>>12)&0xf], - alphabet[(c>>8)&0xf], - alphabet[(c>>4)&0xf], - alphabet[(c>>0)&0xf], + // The 0b0010_0000 bit is the ASCII "lowercase bit". + alphabet[(c>>12)&0xf] | ((_mode << 2) & 0b0010_0000), + alphabet[(c>>8)&0xf] | ((_mode << 3) & 0b0010_0000), + alphabet[(c>>4)&0xf] | ((_mode << 4) & 0b0010_0000), + alphabet[(c>>0)&0xf] | ((_mode << 5) & 0b0010_0000), } _, err := noescape.Write(w, buf[:]) return err @@ -84,7 +111,7 @@ func WriteStringChar(w fastio.AllWriter, c rune, escape BackslashEscapeMode) err case '\b', '\f', '\n', '\r', '\t': // short-escape if possible return writeStringShortEscape(w, c) default: - return WriteStringUnicodeEscape(w, c) + return WriteStringUnicodeEscape(w, c, BackslashEscapeUnicode) } case c == '"' || c == '\\': // override, gotta escape these return writeStringShortEscape(w, c) @@ -100,14 +127,6 @@ func WriteStringChar(w fastio.AllWriter, c rune, escape BackslashEscapeMode) err _, err := w.WriteRune(c) return err } - case BackslashEscapeUnicode: - switch { - case c > 0xFFFF: // override, can't escape these (TODO: unless we use UTF-16 surrogates?) - _, err := w.WriteRune(c) - return err - default: // obey - return WriteStringUnicodeEscape(w, c) - } case BackslashEscapeRawByte: switch { case c < utf8.RuneSelf: @@ -118,6 +137,15 @@ func WriteStringChar(w fastio.AllWriter, c rune, escape BackslashEscapeMode) err return w.WriteByte(byte(c)) } default: + if BackslashEscapeUnicodeMin <= escape && escape <= BackslashEscapeUnicodeMax { + switch { + case c > 0xFFFF: // override, can't escape these (TODO: unless we use UTF-16 surrogates?) 
+ _, err := w.WriteRune(c) + return err + default: // obey + return WriteStringUnicodeEscape(w, c, escape) + } + } panic(fmt.Errorf("escaper returned an invalid escape mode=%d", escape)) } } diff --git a/reencode.go b/reencode.go index 1943b9c..7439bf0 100644 --- a/reencode.go +++ b/reencode.go @@ -441,8 +441,9 @@ func (enc *ReEncoder) handleRuneType(c rune, t jsonparse.RuneType, stackSize int enc.uhex[2] = byte(c) return nil case jsonparse.RuneTypeStringEscUD: + mode := hexToMode(enc.uhex[0], enc.uhex[1], enc.uhex[2], byte(c)) c = hexToRune(enc.uhex[0], enc.uhex[1], enc.uhex[2], byte(c)) - return enc.out.HandleRune(c, jsonparse.RuneTypeStringChar, BackslashEscapeUnicode, stackSize) + return enc.out.HandleRune(c, jsonparse.RuneTypeStringChar, mode, stackSize) case jsonparse.RuneTypeError: panic(fmt.Errorf("should not happen: handleRune called with %#v", t)) default: -- cgit v1.2.3 From 49ee8be679add0bd3cf08a2669331b3be7a835f8 Mon Sep 17 00:00:00 2001 From: Luke Shumaker Date: Fri, 17 Feb 2023 19:21:37 -0700 Subject: compat/json: Correctly handle syntax-error-in-decode --- ReleaseNotes.md | 14 ++++++++ compat/json/compat.go | 82 ++++++++++++++++++++++++++++++++++++++++----- compat/json/compat_test.go | 78 ++++++++++++++++++++++++++++++++++++++++++ decode.go | 2 ++ decode_scan.go | 6 ++++ internal/jsonparse/parse.go | 15 +++++++++ 6 files changed, 188 insertions(+), 9 deletions(-) diff --git a/ReleaseNotes.md b/ReleaseNotes.md index a8496e0..5e8dab7 100644 --- a/ReleaseNotes.md +++ b/ReleaseNotes.md @@ -13,6 +13,11 @@ default, add a `CompactFloats` ReEncoderConfig option to control this. + + Decoder: Decoding `json.Unmarshaler` or `lowmemjson.Decodable` + as a top-level value no longer needs to read past the closing + `"`/`]`/`}`; this can be significant when reading streaming + input, as that next read may block. + - Compatibility bugfixes: + compat/json.Valid: No longer considers truncated JSON @@ -32,6 +37,15 @@ + compat/json.Indent: Preserve trailing whitespace, same as `encoding/json`. + + compat/json.Decoder: No longer transforms "unexpected EOF" + errors to "unexpected end of JSON input". This makes it + different than `compat/json.Unmarshal`, but the same as + `encoding/json`. + + + compat/json.Decoder, compat/json.Unmarshal: No longer mutate + the target value at all if there is a syntax error in the + input. 
+ - Unicode: + Feature: Encoder, ReEncoder: Add an `InvalidUTF8` diff --git a/compat/json/compat.go b/compat/json/compat.go index 3a9bd6c..695c1a8 100644 --- a/compat/json/compat.go +++ b/compat/json/compat.go @@ -237,14 +237,14 @@ func Valid(data []byte) bool { // Decode wrappers /////////////////////////////////////////////////// -func convertDecodeError(err error) error { +func convertDecodeError(err error, isUnmarshal bool) error { if derr, ok := err.(*lowmemjson.DecodeError); ok { switch terr := derr.Err.(type) { case *lowmemjson.DecodeSyntaxError: switch { case errors.Is(terr.Err, io.EOF): err = io.EOF - case errors.Is(terr.Err, io.ErrUnexpectedEOF): + case errors.Is(terr.Err, io.ErrUnexpectedEOF) && isUnmarshal: err = &SyntaxError{ msg: "unexpected end of JSON input", Offset: terr.Offset, @@ -284,13 +284,66 @@ func convertDecodeError(err error) error { return err } +type decodeValidator struct{} + +func (*decodeValidator) DecodeJSON(r io.RuneScanner) error { + for { + if _, _, err := r.ReadRune(); err != nil { + + if err == io.EOF { + return nil + } + return err + } + } +} + +var _ lowmemjson.Decodable = (*decodeValidator)(nil) + func Unmarshal(data []byte, ptr any) error { - return convertDecodeError(lowmemjson.NewDecoder(bytes.NewReader(data)).DecodeThenEOF(ptr)) + if err := convertDecodeError(lowmemjson.NewDecoder(bytes.NewReader(data)).DecodeThenEOF(&decodeValidator{}), true); err != nil { + return err + } + if err := convertDecodeError(lowmemjson.NewDecoder(bytes.NewReader(data)).DecodeThenEOF(ptr), true); err != nil { + return err + } + return nil +} + +type teeRuneScanner struct { + src io.RuneScanner + dst *bytes.Buffer + lastSize int +} + +func (tee *teeRuneScanner) ReadRune() (r rune, size int, err error) { + r, size, err = tee.src.ReadRune() + if err == nil { + if _, err := tee.dst.WriteRune(r); err != nil { + return 0, 0, err + } + } + + tee.lastSize = size + return +} + +func (tee *teeRuneScanner) UnreadRune() error { + if tee.lastSize == 0 { + return lowmemjson.ErrInvalidUnreadRune + } + _ = tee.src.UnreadRune() + tee.dst.Truncate(tee.dst.Len() - tee.lastSize) + tee.lastSize = 0 + return nil } type Decoder struct { + validatorBuf *bufio.Reader + validator *lowmemjson.Decoder + + decoderBuf bytes.Buffer *lowmemjson.Decoder - buf *bufio.Reader } func NewDecoder(r io.Reader) *Decoder { @@ -298,18 +351,29 @@ func NewDecoder(r io.Reader) *Decoder { if !ok { br = bufio.NewReader(r) } - return &Decoder{ - Decoder: lowmemjson.NewDecoder(br), - buf: br, + ret := &Decoder{ + validatorBuf: br, } + ret.validator = lowmemjson.NewDecoder(&teeRuneScanner{ + src: ret.validatorBuf, + dst: &ret.decoderBuf, + }) + ret.Decoder = lowmemjson.NewDecoder(&ret.decoderBuf) + return ret } func (dec *Decoder) Decode(ptr any) error { - return convertDecodeError(dec.Decoder.Decode(ptr)) + if err := convertDecodeError(dec.validator.Decode(&decodeValidator{}), false); err != nil { + return err + } + if err := convertDecodeError(dec.Decoder.Decode(ptr), false); err != nil { + return err + } + return nil } func (dec *Decoder) Buffered() io.Reader { - dat, _ := dec.buf.Peek(dec.buf.Buffered()) + dat, _ := dec.validatorBuf.Peek(dec.validatorBuf.Buffered()) return bytes.NewReader(dat) } diff --git a/compat/json/compat_test.go b/compat/json/compat_test.go index 29a8b37..df9d387 100644 --- a/compat/json/compat_test.go +++ b/compat/json/compat_test.go @@ -6,6 +6,8 @@ package json import ( "bytes" + "reflect" + "strings" "testing" "github.com/stretchr/testify/assert" @@ -161,3 +163,79 @@ func 
TestCompatMarshal(t *testing.T) { }) } } + +func TestCompatUnmarshal(t *testing.T) { + t.Parallel() + type testcase struct { + In string + InPtr any + ExpOut any + ExpErr string + } + testcases := map[string]testcase{ + "empty-obj": {In: `{}`, ExpOut: map[string]any{}}, + "partial-obj": {In: `{"foo":"bar",`, ExpOut: nil, ExpErr: `unexpected end of JSON input`}, + "existing-obj": {In: `{"baz":"quz"}`, InPtr: &map[string]string{"foo": "bar"}, ExpOut: map[string]string{"foo": "bar", "baz": "quz"}}, + "existing-obj-partial": {In: `{"baz":"quz"`, InPtr: &map[string]string{"foo": "bar"}, ExpOut: map[string]string{"foo": "bar"}, ExpErr: "unexpected end of JSON input"}, + "empty-ary": {In: `[]`, ExpOut: []any{}}, + "two-objs": {In: `{} {}`, ExpOut: nil, ExpErr: `invalid character '{' after top-level value`}, + "two-numbers1": {In: `00`, ExpOut: nil, ExpErr: `invalid character '0' after top-level value`}, + "two-numbers2": {In: `1 2`, ExpOut: nil, ExpErr: `invalid character '2' after top-level value`}, + } + for tcName, tc := range testcases { + tc := tc + t.Run(tcName, func(t *testing.T) { + t.Parallel() + ptr := tc.InPtr + if ptr == nil { + var out any + ptr = &out + } + err := Unmarshal([]byte(tc.In), ptr) + assert.Equal(t, tc.ExpOut, reflect.ValueOf(ptr).Elem().Interface()) + if tc.ExpErr == "" { + assert.NoError(t, err) + } else { + assert.EqualError(t, err, tc.ExpErr) + } + }) + } +} + +func TestCompatDecode(t *testing.T) { + t.Parallel() + type testcase struct { + In string + InPtr any + ExpOut any + ExpErr string + } + testcases := map[string]testcase{ + "empty-obj": {In: `{}`, ExpOut: map[string]any{}}, + "partial-obj": {In: `{"foo":"bar",`, ExpOut: nil, ExpErr: `unexpected EOF`}, + "existing-obj": {In: `{"baz":"quz"}`, InPtr: &map[string]string{"foo": "bar"}, ExpOut: map[string]string{"foo": "bar", "baz": "quz"}}, + "existing-obj-partial": {In: `{"baz":"quz"`, InPtr: &map[string]string{"foo": "bar"}, ExpOut: map[string]string{"foo": "bar"}, ExpErr: "unexpected EOF"}, + "empty-ary": {In: `[]`, ExpOut: []any{}}, + "two-objs": {In: `{} {}`, ExpOut: map[string]any{}}, + "two-numbers1": {In: `00`, ExpOut: float64(0)}, + "two-numbers2": {In: `1 2`, ExpOut: float64(1)}, + } + for tcName, tc := range testcases { + tc := tc + t.Run(tcName, func(t *testing.T) { + t.Parallel() + ptr := tc.InPtr + if ptr == nil { + var out any + ptr = &out + } + err := NewDecoder(strings.NewReader(tc.In)).Decode(ptr) + assert.Equal(t, tc.ExpOut, reflect.ValueOf(ptr).Elem().Interface()) + if tc.ExpErr == "" { + assert.NoError(t, err) + } else { + assert.EqualError(t, err, tc.ExpErr) + } + }) + } +} diff --git a/decode.go b/decode.go index 491971a..a136668 100644 --- a/decode.go +++ b/decode.go @@ -53,6 +53,8 @@ import ( // or another is encountered; if it does not, then the parent Decode // call will return a *DecodeTypeError. // +// DecodeJSON should return nil (not io.EOF) on success. +// // Implementor's note: "withLimitingScanner" is the thing to search // for in decode.go if you want to read up on that io.RuneScanner. 
type Decodable interface { diff --git a/decode_scan.go b/decode_scan.go index fcf47ff..63694c4 100644 --- a/decode_scan.go +++ b/decode_scan.go @@ -41,6 +41,12 @@ func (sc *runeTypeScanner) ReadRuneType() (rune, int, jsonparse.RuneType, error) case sc.repeat: sc.offset += int64(sc.rSize) _, _, _ = sc.inner.ReadRune() + case sc.parser.IsAtBarrier(): + sc.rTypeOK = true + sc.rType = jsonparse.RuneTypeEOF + sc.rRune = 0 + sc.rSize = 0 + sc.rErr = nil default: sc.rTypeOK = true again: diff --git a/internal/jsonparse/parse.go b/internal/jsonparse/parse.go index 1c35533..6432d75 100644 --- a/internal/jsonparse/parse.go +++ b/internal/jsonparse/parse.go @@ -525,6 +525,21 @@ func (par *Parser) HandleEOF() (RuneType, error) { } } +// IsAtBarrier returns whether a read-barrier has been reached and the +// next HandleRune call would definitely return RuneTypeEOF. +func (par *Parser) IsAtBarrier() bool { + return par.initialized && + // HandleRune wouldn't return early with an error. + !par.closed && + par.err == nil && + // The current (sub-)parser has reached its end, and + len(par.stack) == 0 && + // there is a barrier, and + len(par.barriers) > 0 && + // that barrier would definitely return RuneTypeEOF. + !par.barriers[len(par.barriers)-1].allowWS +} + // HandleRune feeds a Unicode rune to the Parser. // // An error is returned if and only if the RuneType is RuneTypeError. -- cgit v1.2.3
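
The following standalone Go sketch is an illustration of the ASCII-lowercase-bit trick that the escape-capitalization patch above relies on (hexToMode on the decode side, WriteStringUnicodeEscape on the encode side); it is not part of the patch series, and the helper names packCase and applyCase are hypothetical stand-ins rather than functions from the library.

    package main

    import "fmt"

    // packCase records the capitalization of the four hex digits of a
    // `\uXXXX` escape as a 4-bit pattern.  In ASCII, the 0b0010_0000 bit
    // distinguishes 'a'-'f' from 'A'-'F'; decimal digits also happen to
    // have that bit set, which is harmless because OR-ing it back into a
    // digit in applyCase leaves the digit unchanged.
    func packCase(a, b, c, d byte) byte {
            return ((a & 0b0010_0000) >> 2) |
                    ((b & 0b0010_0000) >> 3) |
                    ((c & 0b0010_0000) >> 4) |
                    ((d & 0b0010_0000) >> 5)
    }

    // applyCase re-emits the rune c as a `\uXXXX` escape, re-applying the
    // capitalization pattern recorded by packCase.
    func applyCase(c rune, pattern byte) string {
            const alphabet = "0123456789ABCDEF"
            buf := []byte{
                    '\\', 'u',
                    alphabet[(c>>12)&0xf] | ((pattern << 2) & 0b0010_0000),
                    alphabet[(c>>8)&0xf] | ((pattern << 3) & 0b0010_0000),
                    alphabet[(c>>4)&0xf] | ((pattern << 4) & 0b0010_0000),
                    alphabet[(c>>0)&0xf] | ((pattern << 5) & 0b0010_0000),
            }
            return string(buf)
    }

    func main() {
            in := "AbCd" // the hex digits as they appeared in the input escape `\uAbCd`
            pattern := packCase(in[0], in[1], in[2], in[3])
            fmt.Println(applyCase(0xABCD, pattern)) // prints `\uAbCd`
    }

Round-tripping the pattern this way is what lets the new "hex-mixed" test cases (input `"\uAbCd"`, output `"\uAbCd"`) pass for HTMLEscape, Compact, and Indent without having to store the original escape text.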