summary | refs | log | tree | commit | diff
path: root/reencode.go
diff options
context:
space:
mode:
author: Luke Shumaker <lukeshu@lukeshu.com> 2023-02-15 15:10:00 -0700
committer: Luke Shumaker <lukeshu@lukeshu.com> 2023-02-18 22:45:54 -0700
commit: 7a938da20e8d243bc254cd821b7cf61b379be4a6 (patch)
tree: fa4844592ff59b0cdce06fcd758afa306725f2bb /reencode.go
parent: 49319198500729fd65bd6d69071f45f2d7ae2aa7 (diff)
reencode: Rethink the UTF-8 buffer
Diffstat (limited to 'reencode.go')
-rw-r--r-- reencode.go | 95
1 file changed, 66 insertions, 29 deletions
diff --git a/reencode.go b/reencode.go
index 0745c43..a33cc8f 100644
--- a/reencode.go
+++ b/reencode.go
@@ -169,6 +169,46 @@ var (
_ io.Closer = (*ReEncoder)(nil)
)
+func (enc *ReEncoder) getRuneFromBytes(str []byte, pos int) (c rune, size int, full bool) {
+ if pos < enc.bufLen {
+ var tmp [utf8.UTFMax]byte
+ n := copy(tmp[:], enc.buf[pos:enc.bufLen])
+ n += copy(tmp[n:], str)
+ c, size := utf8.DecodeRune(tmp[:n])
+ if c == utf8.RuneError && size <= 1 {
+ return c, size, utf8.FullRune(tmp[:n])
+ }
+ return c, size, true
+ } else {
+ tmp := str[pos-enc.bufLen:]
+ c, size := utf8.DecodeRune(tmp)
+ if c == utf8.RuneError && size <= 1 {
+ return c, size, utf8.FullRune(tmp)
+ }
+ return c, size, true
+ }
+}
+
+func (enc *ReEncoder) getRuneFromString(str string, pos int) (c rune, size int, full bool) {
+ if pos < enc.bufLen {
+ var tmp [utf8.UTFMax]byte
+ n := copy(tmp[:], enc.buf[pos:enc.bufLen])
+ n += copy(tmp[n:], str)
+ c, size := utf8.DecodeRune(tmp[:n])
+ if c == utf8.RuneError && size <= 1 {
+ return c, size, utf8.FullRune(tmp[:n])
+ }
+ return c, size, true
+ } else {
+ tmp := str[pos-enc.bufLen:]
+ c, size := utf8.DecodeRuneInString(tmp)
+ if c == utf8.RuneError && size <= 1 {
+ return c, size, utf8.FullRuneInString(tmp)
+ }
+ return c, size, true
+ }
+}
+
// Write implements io.Writer; it does what you'd expect.
//
// It is worth noting that Write returns the number of bytes consumed
@@ -177,59 +217,56 @@ var (
// but *ReEncoder does because it transforms the data written to it,
// and the number of bytes written may be wildly different than the
// number of bytes handled.
-func (enc *ReEncoder) Write(p []byte) (int, error) {
- if len(p) == 0 {
+func (enc *ReEncoder) Write(str []byte) (int, error) {
+ if len(str) == 0 {
return 0, nil
}
var n int
- if enc.bufLen > 0 {
- copy(enc.buf[enc.bufLen:], p)
- c, size := utf8.DecodeRune(enc.buf[:])
- n += size - enc.bufLen
- enc.bufLen = 0
- enc.handleRune(c, size)
- if enc.err != nil {
- return 0, enc.err
+ for {
+ c, size, full := enc.getRuneFromBytes(str, n)
+ if !full {
+ if n < enc.bufLen {
+ l := copy(enc.buf[:], enc.buf[n:enc.bufLen])
+ l += copy(enc.buf[l:], str)
+ enc.bufLen = l
+ } else {
+ enc.bufLen = copy(enc.buf[:], str[n-enc.bufLen:])
+ }
+ return len(str), nil
}
- }
- for utf8.FullRune(p[n:]) {
- c, size := utf8.DecodeRune(p[n:])
enc.handleRune(c, size)
if enc.err != nil {
return n, enc.err
}
n += size
}
- enc.bufLen = copy(enc.buf[:], p[n:])
- return len(p), nil
}
// WriteString implements io.StringWriter; it does what you'd expect,
// but see the notes on the Write method.
-func (enc *ReEncoder) WriteString(p string) (int, error) {
- if len(p) == 0 {
+func (enc *ReEncoder) WriteString(str string) (int, error) {
+ if len(str) == 0 {
return 0, nil
}
var n int
- if enc.bufLen > 0 {
- copy(enc.buf[enc.bufLen:], p)
- c, size := utf8.DecodeRune(enc.buf[:])
- n += size - enc.bufLen
- enc.bufLen = 0
- enc.handleRune(c, size)
- if enc.err != nil {
- return 0, enc.err
+ for {
+ c, size, full := enc.getRuneFromString(str, n)
+ if !full {
+ if n < enc.bufLen {
+ l := copy(enc.buf[:], enc.buf[n:enc.bufLen])
+ l += copy(enc.buf[l:], str)
+ enc.bufLen = l
+ } else {
+ enc.bufLen = copy(enc.buf[:], str[n-enc.bufLen:])
+ }
+ return len(str), nil
}
- }
- for utf8.FullRuneInString(p[n:]) {
- c, size := utf8.DecodeRuneInString(p[n:])
enc.handleRune(c, size)
if enc.err != nil {
return n, enc.err
}
n += size
}
- return len(p), nil
}
// WriteByte implements io.ByteWriter; it does what you'd expect.