From 7a938da20e8d243bc254cd821b7cf61b379be4a6 Mon Sep 17 00:00:00 2001 From: Luke Shumaker Date: Wed, 15 Feb 2023 15:10:00 -0700 Subject: reencode: Rethink the UTF-8 buffer --- ReleaseNotes.md | 6 ++++ reencode.go | 95 +++++++++++++++++++++++++++++++++++++++----------------- reencode_test.go | 61 ++++++++++++++++++++++++++++++++++++ 3 files changed, 133 insertions(+), 29 deletions(-) diff --git a/ReleaseNotes.md b/ReleaseNotes.md index e72a664..d9a671a 100644 --- a/ReleaseNotes.md +++ b/ReleaseNotes.md @@ -17,6 +17,12 @@ + compat/json.Compact, compat/json.Indent: Don't write to the destination buffer if there is a syntax error. + - Unicode: + + + Bugfix: Encoder, ReEncoder: Fix an issue with decoding UTF-8 + where a codepoint that straddles a write boundary is + interpreted as a sequence of U+FFFD runes. + # v0.3.6 (2023-02-16) Theme: Architectural improvements diff --git a/reencode.go b/reencode.go index 0745c43..a33cc8f 100644 --- a/reencode.go +++ b/reencode.go @@ -169,6 +169,46 @@ var ( _ io.Closer = (*ReEncoder)(nil) ) +func (enc *ReEncoder) getRuneFromBytes(str []byte, pos int) (c rune, size int, full bool) { + if pos < enc.bufLen { + var tmp [utf8.UTFMax]byte + n := copy(tmp[:], enc.buf[pos:enc.bufLen]) + n += copy(tmp[n:], str) + c, size := utf8.DecodeRune(tmp[:n]) + if c == utf8.RuneError && size <= 1 { + return c, size, utf8.FullRune(tmp[:n]) + } + return c, size, true + } else { + tmp := str[pos-enc.bufLen:] + c, size := utf8.DecodeRune(tmp) + if c == utf8.RuneError && size <= 1 { + return c, size, utf8.FullRune(tmp) + } + return c, size, true + } +} + +func (enc *ReEncoder) getRuneFromString(str string, pos int) (c rune, size int, full bool) { + if pos < enc.bufLen { + var tmp [utf8.UTFMax]byte + n := copy(tmp[:], enc.buf[pos:enc.bufLen]) + n += copy(tmp[n:], str) + c, size := utf8.DecodeRune(tmp[:n]) + if c == utf8.RuneError && size <= 1 { + return c, size, utf8.FullRune(tmp[:n]) + } + return c, size, true + } else { + tmp := str[pos-enc.bufLen:] + 
c, size := utf8.DecodeRuneInString(tmp) + if c == utf8.RuneError && size <= 1 { + return c, size, utf8.FullRuneInString(tmp) + } + return c, size, true + } +} + // Write implements io.Writer; it does what you'd expect. // // It is worth noting that Write returns the number of bytes consumed @@ -177,59 +217,56 @@ var ( // but *ReEncoder does because it transforms the data written to it, // and the number of bytes written may be wildly different than the // number of bytes handled. -func (enc *ReEncoder) Write(p []byte) (int, error) { - if len(p) == 0 { +func (enc *ReEncoder) Write(str []byte) (int, error) { + if len(str) == 0 { return 0, nil } var n int - if enc.bufLen > 0 { - copy(enc.buf[enc.bufLen:], p) - c, size := utf8.DecodeRune(enc.buf[:]) - n += size - enc.bufLen - enc.bufLen = 0 - enc.handleRune(c, size) - if enc.err != nil { - return 0, enc.err + for { + c, size, full := enc.getRuneFromBytes(str, n) + if !full { + if n < enc.bufLen { + l := copy(enc.buf[:], enc.buf[n:enc.bufLen]) + l += copy(enc.buf[l:], str) + enc.bufLen = l + } else { + enc.bufLen = copy(enc.buf[:], str[n-enc.bufLen:]) + } + return len(str), nil } - } - for utf8.FullRune(p[n:]) { - c, size := utf8.DecodeRune(p[n:]) enc.handleRune(c, size) if enc.err != nil { return n, enc.err } n += size } - enc.bufLen = copy(enc.buf[:], p[n:]) - return len(p), nil } // WriteString implements io.StringWriter; it does what you'd expect, // but see the notes on the Write method. 
-func (enc *ReEncoder) WriteString(p string) (int, error) { - if len(p) == 0 { +func (enc *ReEncoder) WriteString(str string) (int, error) { + if len(str) == 0 { return 0, nil } var n int - if enc.bufLen > 0 { - copy(enc.buf[enc.bufLen:], p) - c, size := utf8.DecodeRune(enc.buf[:]) - n += size - enc.bufLen - enc.bufLen = 0 - enc.handleRune(c, size) - if enc.err != nil { - return 0, enc.err + for { + c, size, full := enc.getRuneFromString(str, n) + if !full { + if n < enc.bufLen { + l := copy(enc.buf[:], enc.buf[n:enc.bufLen]) + l += copy(enc.buf[l:], str) + enc.bufLen = l + } else { + enc.bufLen = copy(enc.buf[:], str[n-enc.bufLen:]) + } + return len(str), nil } - } - for utf8.FullRuneInString(p[n:]) { - c, size := utf8.DecodeRuneInString(p[n:]) enc.handleRune(c, size) if enc.err != nil { return n, enc.err } n += size } - return len(p), nil } // WriteByte implements io.ByteWriter; it does what you'd expect. diff --git a/reencode_test.go b/reencode_test.go index f135aa5..82a1861 100644 --- a/reencode_test.go +++ b/reencode_test.go @@ -161,3 +161,64 @@ func TestReEncode(t *testing.T) { }) } } + +func TestReEncodeWriteSize(t *testing.T) { + t.Parallel() + + multibyteRune := `😂` + assert.Len(t, multibyteRune, 4) + + input := `"` + multibyteRune + `"` + + t.Run("bytes-bigwrite", func(t *testing.T) { + t.Parallel() + var out strings.Builder + enc := NewReEncoder(&out, ReEncoderConfig{}) + + n, err := enc.Write([]byte(input)) + assert.NoError(t, err) + assert.Equal(t, len(input), n) + + assert.Equal(t, input, out.String()) + }) + t.Run("string-bigwrite", func(t *testing.T) { + t.Parallel() + var out strings.Builder + enc := NewReEncoder(&out, ReEncoderConfig{}) + + n, err := enc.WriteString(input) + assert.NoError(t, err) + assert.Equal(t, len(input), n) + + assert.Equal(t, input, out.String()) + }) + + t.Run("bytes-smallwrites", func(t *testing.T) { + t.Parallel() + var out strings.Builder + enc := NewReEncoder(&out, ReEncoderConfig{}) + + var buf [1]byte + for i := 0; i 
< len(input); i++ { + buf[0] = input[i] + n, err := enc.Write(buf[:]) + assert.NoError(t, err) + assert.Equal(t, 1, n) + } + + assert.Equal(t, input, out.String()) + }) + t.Run("string-smallwrites", func(t *testing.T) { + t.Parallel() + var out strings.Builder + enc := NewReEncoder(&out, ReEncoderConfig{}) + + for i := 0; i < len(input); i++ { + n, err := enc.WriteString(input[i : i+1]) + assert.NoError(t, err) + assert.Equal(t, 1, n) + } + + assert.Equal(t, input, out.String()) + }) +} -- cgit v1.2.3-54-g00ecf