From 218edcc3878394a6942d4f72e3be99137c22825a Mon Sep 17 00:00:00 2001 From: Luke Shumaker Date: Tue, 14 Feb 2023 14:27:18 -0700 Subject: reencode: Fix trimming trailing zeros --- reencode_test.go | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'reencode_test.go') diff --git a/reencode_test.go b/reencode_test.go index 83660ef..f135aa5 100644 --- a/reencode_test.go +++ b/reencode_test.go @@ -131,6 +131,24 @@ func TestReEncode(t *testing.T) { —»9 —]`, }, + "numbers": { + enc: ReEncoderConfig{ + Compact: true, + }, + in: []any{ + Number("1.200e003"), + }, + exp: `[1.2e3]`, + }, + "numbers-zero": { + enc: ReEncoderConfig{ + Compact: true, + }, + in: []any{ + Number("1.000e000"), + }, + exp: `[1.0e0]`, + }, } for tcName, tc := range testcases { tc := tc -- cgit v1.2.3-54-g00ecf From 7a938da20e8d243bc254cd821b7cf61b379be4a6 Mon Sep 17 00:00:00 2001 From: Luke Shumaker Date: Wed, 15 Feb 2023 15:10:00 -0700 Subject: reencode: Rethink the UTF-8 buffer --- ReleaseNotes.md | 6 ++++ reencode.go | 95 +++++++++++++++++++++++++++++++++++++++----------------- reencode_test.go | 61 ++++++++++++++++++++++++++++++++++++ 3 files changed, 133 insertions(+), 29 deletions(-) (limited to 'reencode_test.go') diff --git a/ReleaseNotes.md b/ReleaseNotes.md index e72a664..d9a671a 100644 --- a/ReleaseNotes.md +++ b/ReleaseNotes.md @@ -17,6 +17,12 @@ + compat/json.Compact, compat/json.Indent: Don't write to the destination buffer if there is a syntax error. + - Unicode: + + + Bugfix: Encoder, ReEncoder: Fix an issue with decoding UTF-8 + where a codepoint that straddles a write boundary is + interpreted as a sequence of U+FFFD runes.
+ # v0.3.6 (2023-02-16) Theme: Architectural improvements diff --git a/reencode.go b/reencode.go index 0745c43..a33cc8f 100644 --- a/reencode.go +++ b/reencode.go @@ -169,6 +169,46 @@ var ( _ io.Closer = (*ReEncoder)(nil) ) +func (enc *ReEncoder) getRuneFromBytes(str []byte, pos int) (c rune, size int, full bool) { + if pos < enc.bufLen { + var tmp [utf8.UTFMax]byte + n := copy(tmp[:], enc.buf[pos:enc.bufLen]) + n += copy(tmp[n:], str) + c, size := utf8.DecodeRune(tmp[:n]) + if c == utf8.RuneError && size <= 1 { + return c, size, utf8.FullRune(tmp[:n]) + } + return c, size, true + } else { + tmp := str[pos-enc.bufLen:] + c, size := utf8.DecodeRune(tmp) + if c == utf8.RuneError && size <= 1 { + return c, size, utf8.FullRune(tmp) + } + return c, size, true + } +} + +func (enc *ReEncoder) getRuneFromString(str string, pos int) (c rune, size int, full bool) { + if pos < enc.bufLen { + var tmp [utf8.UTFMax]byte + n := copy(tmp[:], enc.buf[pos:enc.bufLen]) + n += copy(tmp[n:], str) + c, size := utf8.DecodeRune(tmp[:n]) + if c == utf8.RuneError && size <= 1 { + return c, size, utf8.FullRune(tmp[:n]) + } + return c, size, true + } else { + tmp := str[pos-enc.bufLen:] + c, size := utf8.DecodeRuneInString(tmp) + if c == utf8.RuneError && size <= 1 { + return c, size, utf8.FullRuneInString(tmp) + } + return c, size, true + } +} + // Write implements io.Writer; it does what you'd expect. // // It is worth noting that Write returns the number of bytes consumed @@ -177,59 +217,56 @@ var ( // but *ReEncoder does because it transforms the data written to it, // and the number of bytes written may be wildly different than the // number of bytes handled. 
-func (enc *ReEncoder) Write(p []byte) (int, error) { - if len(p) == 0 { +func (enc *ReEncoder) Write(str []byte) (int, error) { + if len(str) == 0 { return 0, nil } var n int - if enc.bufLen > 0 { - copy(enc.buf[enc.bufLen:], p) - c, size := utf8.DecodeRune(enc.buf[:]) - n += size - enc.bufLen - enc.bufLen = 0 - enc.handleRune(c, size) - if enc.err != nil { - return 0, enc.err + for { + c, size, full := enc.getRuneFromBytes(str, n) + if !full { + if n < enc.bufLen { + l := copy(enc.buf[:], enc.buf[n:enc.bufLen]) + l += copy(enc.buf[l:], str) + enc.bufLen = l + } else { + enc.bufLen = copy(enc.buf[:], str[n-enc.bufLen:]) + } + return len(str), nil } - } - for utf8.FullRune(p[n:]) { - c, size := utf8.DecodeRune(p[n:]) enc.handleRune(c, size) if enc.err != nil { return n, enc.err } n += size } - enc.bufLen = copy(enc.buf[:], p[n:]) - return len(p), nil } // WriteString implements io.StringWriter; it does what you'd expect, // but see the notes on the Write method. -func (enc *ReEncoder) WriteString(p string) (int, error) { - if len(p) == 0 { +func (enc *ReEncoder) WriteString(str string) (int, error) { + if len(str) == 0 { return 0, nil } var n int - if enc.bufLen > 0 { - copy(enc.buf[enc.bufLen:], p) - c, size := utf8.DecodeRune(enc.buf[:]) - n += size - enc.bufLen - enc.bufLen = 0 - enc.handleRune(c, size) - if enc.err != nil { - return 0, enc.err + for { + c, size, full := enc.getRuneFromString(str, n) + if !full { + if n < enc.bufLen { + l := copy(enc.buf[:], enc.buf[n:enc.bufLen]) + l += copy(enc.buf[l:], str) + enc.bufLen = l + } else { + enc.bufLen = copy(enc.buf[:], str[n-enc.bufLen:]) + } + return len(str), nil } - } - for utf8.FullRuneInString(p[n:]) { - c, size := utf8.DecodeRuneInString(p[n:]) enc.handleRune(c, size) if enc.err != nil { return n, enc.err } n += size } - return len(p), nil } // WriteByte implements io.ByteWriter; it does what you'd expect. 
diff --git a/reencode_test.go b/reencode_test.go index f135aa5..82a1861 100644 --- a/reencode_test.go +++ b/reencode_test.go @@ -161,3 +161,64 @@ func TestReEncode(t *testing.T) { }) } } + +func TestReEncodeWriteSize(t *testing.T) { + t.Parallel() + + multibyteRune := `😂` + assert.Len(t, multibyteRune, 4) + + input := `"` + multibyteRune + `"` + + t.Run("bytes-bigwrite", func(t *testing.T) { + t.Parallel() + var out strings.Builder + enc := NewReEncoder(&out, ReEncoderConfig{}) + + n, err := enc.Write([]byte(input)) + assert.NoError(t, err) + assert.Equal(t, len(input), n) + + assert.Equal(t, input, out.String()) + }) + t.Run("string-bigwrite", func(t *testing.T) { + t.Parallel() + var out strings.Builder + enc := NewReEncoder(&out, ReEncoderConfig{}) + + n, err := enc.WriteString(input) + assert.NoError(t, err) + assert.Equal(t, len(input), n) + + assert.Equal(t, input, out.String()) + }) + + t.Run("bytes-smallwrites", func(t *testing.T) { + t.Parallel() + var out strings.Builder + enc := NewReEncoder(&out, ReEncoderConfig{}) + + var buf [1]byte + for i := 0; i < len(input); i++ { + buf[0] = input[i] + n, err := enc.Write(buf[:]) + assert.NoError(t, err) + assert.Equal(t, 1, n) + } + + assert.Equal(t, input, out.String()) + }) + t.Run("string-smallwrites", func(t *testing.T) { + t.Parallel() + var out strings.Builder + enc := NewReEncoder(&out, ReEncoderConfig{}) + + for i := 0; i < len(input); i++ { + n, err := enc.WriteString(input[i : i+1]) + assert.NoError(t, err) + assert.Equal(t, 1, n) + } + + assert.Equal(t, input, out.String()) + }) +} -- cgit v1.2.3-54-g00ecf From 38989a9c4f69abfe04c3eb4ec3382be88802141c Mon Sep 17 00:00:00 2001 From: Luke Shumaker Date: Wed, 15 Feb 2023 23:53:59 -0700 Subject: reencode: Fix .stackSize --- reencode.go | 4 ++-- reencode_test.go | 16 ++++++++++++++++ 2 files changed, 18 insertions(+), 2 deletions(-) (limited to 'reencode_test.go') diff --git a/reencode.go b/reencode.go index a33cc8f..fd848f8 100644 --- a/reencode.go +++ 
b/reencode.go @@ -364,8 +364,8 @@ func (enc *ReEncoder) popWriteBarrier() { func (enc *ReEncoder) stackSize() int { sz := enc.par.StackSize() - for _, barrier := range enc.barriers { - sz += barrier.stackSize + if len(enc.barriers) > 0 { + sz += enc.barriers[len(enc.barriers)-1].stackSize } return sz } diff --git a/reencode_test.go b/reencode_test.go index 82a1861..bc6d246 100644 --- a/reencode_test.go +++ b/reencode_test.go @@ -9,6 +9,8 @@ import ( "testing" "github.com/stretchr/testify/assert" + + "git.lukeshu.com/go/lowmemjson/internal/fastio" ) func TestReEncode(t *testing.T) { @@ -222,3 +224,17 @@ func TestReEncodeWriteSize(t *testing.T) { assert.Equal(t, input, out.String()) }) } + +func TestReEncoderStackSize(t *testing.T) { + t.Parallel() + + enc := NewReEncoder(fastio.Discard, ReEncoderConfig{}) + assert.Equal(t, 0, enc.stackSize()) + + for i := 0; i < 5; i++ { + assert.NoError(t, enc.WriteByte('[')) + assert.Equal(t, i+1, enc.stackSize()) + enc.pushWriteBarrier() + assert.Equal(t, i+2, enc.stackSize()) + } +} -- cgit v1.2.3-54-g00ecf From 1a5b0561f53441d8a259a5096281699b5af16a6c Mon Sep 17 00:00:00 2001 From: Luke Shumaker Date: Thu, 16 Feb 2023 16:53:53 -0700 Subject: reencode: Add CompactFloats --- ReleaseNotes.md | 9 ++++++++- compat/json/compat_test.go | 3 +++ reencode.go | 10 ++++++++-- reencode_test.go | 6 ++++-- 4 files changed, 23 insertions(+), 5 deletions(-) (limited to 'reencode_test.go') diff --git a/ReleaseNotes.md b/ReleaseNotes.md index b1647da..ae147b1 100644 --- a/ReleaseNotes.md +++ b/ReleaseNotes.md @@ -4,11 +4,15 @@ User-facing changes: - - General bugfixes: + - General Changes: + Encoder, ReEncoder: Now correctly trims the unnecessary trailing '0's from the fraction-part when compacting numbers. + + ReEncoder: No longer compact floating-point numbers by + default, add a `CompactFloats` ReEncoderConfig option to + control this.
+ - Compatibility bugfixes: + compat/json.Valid: No longer considers truncated JSON @@ -17,6 +21,9 @@ + compat/json.Compact, compat/json.Indent: Don't write to the destination buffer if there is a syntax error. + + compat/json.Compact, compat/json.Indent: No longer compact + floating-point numbers, as `encoding/json` doesn't. + - Unicode: + Feature: Encoder, ReEncoder: Add an `InvalidUTF8` diff --git a/compat/json/compat_test.go b/compat/json/compat_test.go index d989a4d..128bd1b 100644 --- a/compat/json/compat_test.go +++ b/compat/json/compat_test.go @@ -46,6 +46,7 @@ func TestCompatCompact(t *testing.T) { "trunc": {In: `{`, Out: ``, Err: `unexpected end of JSON input`}, "object": {In: `{}`, Out: `{}`}, "non-utf8": {In: "\"\x85\xcd\"", Out: "\"\x85\xcd\""}, + "float": {In: `1.200e003`, Out: `1.200e003`}, } for tcName, tc := range testcases { tc := tc @@ -75,6 +76,7 @@ func TestCompatIndent(t *testing.T) { "trunc": {In: `{`, Out: ``, Err: `unexpected end of JSON input`}, "object": {In: `{}`, Out: `{}`}, "non-utf8": {In: "\"\x85\xcd\"", Out: "\"\x85\xcd\""}, + "float": {In: `1.200e003`, Out: `1.200e003`}, } for tcName, tc := range testcases { tc := tc @@ -103,6 +105,7 @@ func TestCompatMarshal(t *testing.T) { testcases := map[string]testcase{ "non-utf8": {In: "\x85\xcd", Out: "\"\\ufffd\\ufffd\""}, "urc": {In: "\ufffd", Out: "\"\ufffd\""}, + "float": {In: 1.2e3, Out: `1200`}, } for tcName, tc := range testcases { tc := tc diff --git a/reencode.go b/reencode.go index 1a9999b..1943b9c 100644 --- a/reencode.go +++ b/reencode.go @@ -54,6 +54,10 @@ type ReEncoderConfig struct { // this is different than the usual behavior. ForceTrailingNewlines bool + // CompactFloats causes the *ReEncoder to trim unnecessary '0' + // digits from floating-point number values.
+ CompactFloats bool + // A JSON document is specified to be a sequence of Unicode // codepoints; InvalidUTF8 controls how the *ReEncoder behaves // when it encounters invalid UTF-8 bytes in a JSON string @@ -109,8 +113,10 @@ func NewReEncoder(out io.Writer, cfg ReEncoderConfig) *ReEncoder { } // Numbers - module = &reEncodeCompactNum{ - out: module, + if cfg.CompactFloats { + module = &reEncodeCompactNum{ + out: module, + } } // Strings diff --git a/reencode_test.go b/reencode_test.go index bc6d246..715e976 100644 --- a/reencode_test.go +++ b/reencode_test.go @@ -135,7 +135,8 @@ func TestReEncode(t *testing.T) { }, "numbers": { enc: ReEncoderConfig{ - Compact: true, + Compact: true, + CompactFloats: true, }, in: []any{ Number("1.200e003"), @@ -144,7 +145,8 @@ func TestReEncode(t *testing.T) { }, "numbers-zero": { enc: ReEncoderConfig{ - Compact: true, + Compact: true, + CompactFloats: true, }, in: []any{ Number("1.000e000"), -- cgit v1.2.3-54-g00ecf