summary refs log tree commit diff
diff options
context:
space:
mode:
author Luke Shumaker <lukeshu@lukeshu.com> 2023-02-15 15:10:00 -0700
committer Luke Shumaker <lukeshu@lukeshu.com> 2023-02-18 22:45:54 -0700
commit 7a938da20e8d243bc254cd821b7cf61b379be4a6 (patch)
tree fa4844592ff59b0cdce06fcd758afa306725f2bb
parent 49319198500729fd65bd6d69071f45f2d7ae2aa7 (diff)
reencode: Rethink the UTF-8 buffer
-rw-r--r-- ReleaseNotes.md 6
-rw-r--r-- reencode.go 95
-rw-r--r-- reencode_test.go 61
3 files changed, 133 insertions, 29 deletions
diff --git a/ReleaseNotes.md b/ReleaseNotes.md
index e72a664..d9a671a 100644
--- a/ReleaseNotes.md
+++ b/ReleaseNotes.md
@@ -17,6 +17,12 @@
+ compat/json.Compact, compat/json.Indent: Don't write to the
destination buffer if there is a syntax error.
+ - Unicode:
+
+ + Bugfix: Encoder, ReEncoder: Fix an issue with decoding UTF-8
+ where a codepoint that straddles a write boundary is
+ interpreted as a sequence of U+FFFD runes.
+
# v0.3.6 (2023-02-16)
Theme: Architectural improvements
diff --git a/reencode.go b/reencode.go
index 0745c43..a33cc8f 100644
--- a/reencode.go
+++ b/reencode.go
@@ -169,6 +169,46 @@ var (
_ io.Closer = (*ReEncoder)(nil)
)
+func (enc *ReEncoder) getRuneFromBytes(str []byte, pos int) (c rune, size int, full bool) {
+ if pos < enc.bufLen {
+ var tmp [utf8.UTFMax]byte
+ n := copy(tmp[:], enc.buf[pos:enc.bufLen])
+ n += copy(tmp[n:], str)
+ c, size := utf8.DecodeRune(tmp[:n])
+ if c == utf8.RuneError && size <= 1 {
+ return c, size, utf8.FullRune(tmp[:n])
+ }
+ return c, size, true
+ } else {
+ tmp := str[pos-enc.bufLen:]
+ c, size := utf8.DecodeRune(tmp)
+ if c == utf8.RuneError && size <= 1 {
+ return c, size, utf8.FullRune(tmp)
+ }
+ return c, size, true
+ }
+}
+
+func (enc *ReEncoder) getRuneFromString(str string, pos int) (c rune, size int, full bool) {
+ if pos < enc.bufLen {
+ var tmp [utf8.UTFMax]byte
+ n := copy(tmp[:], enc.buf[pos:enc.bufLen])
+ n += copy(tmp[n:], str)
+ c, size := utf8.DecodeRune(tmp[:n])
+ if c == utf8.RuneError && size <= 1 {
+ return c, size, utf8.FullRune(tmp[:n])
+ }
+ return c, size, true
+ } else {
+ tmp := str[pos-enc.bufLen:]
+ c, size := utf8.DecodeRuneInString(tmp)
+ if c == utf8.RuneError && size <= 1 {
+ return c, size, utf8.FullRuneInString(tmp)
+ }
+ return c, size, true
+ }
+}
+
// Write implements io.Writer; it does what you'd expect.
//
// It is worth noting that Write returns the number of bytes consumed
@@ -177,59 +217,56 @@ var (
// but *ReEncoder does because it transforms the data written to it,
// and the number of bytes written may be wildly different than the
// number of bytes handled.
-func (enc *ReEncoder) Write(p []byte) (int, error) {
- if len(p) == 0 {
+func (enc *ReEncoder) Write(str []byte) (int, error) {
+ if len(str) == 0 {
return 0, nil
}
var n int
- if enc.bufLen > 0 {
- copy(enc.buf[enc.bufLen:], p)
- c, size := utf8.DecodeRune(enc.buf[:])
- n += size - enc.bufLen
- enc.bufLen = 0
- enc.handleRune(c, size)
- if enc.err != nil {
- return 0, enc.err
+ for {
+ c, size, full := enc.getRuneFromBytes(str, n)
+ if !full {
+ if n < enc.bufLen {
+ l := copy(enc.buf[:], enc.buf[n:enc.bufLen])
+ l += copy(enc.buf[l:], str)
+ enc.bufLen = l
+ } else {
+ enc.bufLen = copy(enc.buf[:], str[n-enc.bufLen:])
+ }
+ return len(str), nil
}
- }
- for utf8.FullRune(p[n:]) {
- c, size := utf8.DecodeRune(p[n:])
enc.handleRune(c, size)
if enc.err != nil {
return n, enc.err
}
n += size
}
- enc.bufLen = copy(enc.buf[:], p[n:])
- return len(p), nil
}
// WriteString implements io.StringWriter; it does what you'd expect,
// but see the notes on the Write method.
-func (enc *ReEncoder) WriteString(p string) (int, error) {
- if len(p) == 0 {
+func (enc *ReEncoder) WriteString(str string) (int, error) {
+ if len(str) == 0 {
return 0, nil
}
var n int
- if enc.bufLen > 0 {
- copy(enc.buf[enc.bufLen:], p)
- c, size := utf8.DecodeRune(enc.buf[:])
- n += size - enc.bufLen
- enc.bufLen = 0
- enc.handleRune(c, size)
- if enc.err != nil {
- return 0, enc.err
+ for {
+ c, size, full := enc.getRuneFromString(str, n)
+ if !full {
+ if n < enc.bufLen {
+ l := copy(enc.buf[:], enc.buf[n:enc.bufLen])
+ l += copy(enc.buf[l:], str)
+ enc.bufLen = l
+ } else {
+ enc.bufLen = copy(enc.buf[:], str[n-enc.bufLen:])
+ }
+ return len(str), nil
}
- }
- for utf8.FullRuneInString(p[n:]) {
- c, size := utf8.DecodeRuneInString(p[n:])
enc.handleRune(c, size)
if enc.err != nil {
return n, enc.err
}
n += size
}
- return len(p), nil
}
// WriteByte implements io.ByteWriter; it does what you'd expect.
diff --git a/reencode_test.go b/reencode_test.go
index f135aa5..82a1861 100644
--- a/reencode_test.go
+++ b/reencode_test.go
@@ -161,3 +161,64 @@ func TestReEncode(t *testing.T) {
})
}
}
+
+func TestReEncodeWriteSize(t *testing.T) {
+ t.Parallel()
+
+ multibyteRune := `😂`
+ assert.Len(t, multibyteRune, 4)
+
+ input := `"` + multibyteRune + `"`
+
+ t.Run("bytes-bigwrite", func(t *testing.T) {
+ t.Parallel()
+ var out strings.Builder
+ enc := NewReEncoder(&out, ReEncoderConfig{})
+
+ n, err := enc.Write([]byte(input))
+ assert.NoError(t, err)
+ assert.Equal(t, len(input), n)
+
+ assert.Equal(t, input, out.String())
+ })
+ t.Run("string-bigwrite", func(t *testing.T) {
+ t.Parallel()
+ var out strings.Builder
+ enc := NewReEncoder(&out, ReEncoderConfig{})
+
+ n, err := enc.WriteString(input)
+ assert.NoError(t, err)
+ assert.Equal(t, len(input), n)
+
+ assert.Equal(t, input, out.String())
+ })
+
+ t.Run("bytes-smallwrites", func(t *testing.T) {
+ t.Parallel()
+ var out strings.Builder
+ enc := NewReEncoder(&out, ReEncoderConfig{})
+
+ var buf [1]byte
+ for i := 0; i < len(input); i++ {
+ buf[0] = input[i]
+ n, err := enc.Write(buf[:])
+ assert.NoError(t, err)
+ assert.Equal(t, 1, n)
+ }
+
+ assert.Equal(t, input, out.String())
+ })
+ t.Run("string-smallwrites", func(t *testing.T) {
+ t.Parallel()
+ var out strings.Builder
+ enc := NewReEncoder(&out, ReEncoderConfig{})
+
+ for i := 0; i < len(input); i++ {
+ n, err := enc.WriteString(input[i : i+1])
+ assert.NoError(t, err)
+ assert.Equal(t, 1, n)
+ }
+
+ assert.Equal(t, input, out.String())
+ })
+}