From dfc67cecbd95344d296c31b537fa3ae8aec8c292 Mon Sep 17 00:00:00 2001 From: Luke Shumaker Date: Tue, 14 Feb 2023 22:36:25 -0700 Subject: encode, reencode: Fix handling of invalid UTF-8 --- ReleaseNotes.md | 24 ++++++++ compat/json/compat.go | 5 +- compat/json/compat_test.go | 45 ++++++++++++--- compat/json/testcompat_test.go | 5 +- encode.go | 34 +++++------ encode_escape.go | 37 +++++++++--- errors.go | 3 +- internal/jsonstring/encode_string.go | 65 +++++++++++++++++++-- reencode.go | 107 +++++++++++++++++++++++------------ 9 files changed, 247 insertions(+), 78 deletions(-) diff --git a/ReleaseNotes.md b/ReleaseNotes.md index d9a671a..b1647da 100644 --- a/ReleaseNotes.md +++ b/ReleaseNotes.md @@ -19,10 +19,34 @@ - Unicode: + + Feature: Encoder, ReEncoder: Add an `InvalidUTF8` + ReEncoderConfig option and `BackslashEscapeRawByte` + BackslashEscapeMode to allow emitted strings to contain + invalid UTF-8. + + + Change: EscapeDefault, EscapeDefaultNonHTMLSafe: No longer + force long Unicode `\uXXXX` sequences for the U+FFFD Unicode + replacement character. + + + Change: Encoder: Unless overridden by the BackslashEscaper, + now by default uses `\uXXXX` sequences when emitting the + U+FFFD Unicode replacement character in place of invalid + UTF-8. + + Bugfix: Encoder, ReEncoder: Fix an issue with decoding UTF-8 that when a codepoint straddles a write boundary it is interpreted as a sequence of U+FFFD runes. + + Bugfix: compat/json.Valid: Do not consider JSON containing + invalid UTF-8 to be valid (this is different than + `encoding/json` at the time of this writing; but I consider + that to be a bug in `encoding/json`; [go#58517][]). + + + Bugfix: compat/json.Compact, compat/json.Indent: Don't munge + invalid UTF-8 in strings; as `encoding/json` doesn't. 
+ + [go#58517]: https://github.com/golang/go/issues/58517 + # v0.3.6 (2023-02-16) Theme: Architectural improvements diff --git a/compat/json/compat.go b/compat/json/compat.go index 1cdbf0b..d326514 100644 --- a/compat/json/compat.go +++ b/compat/json/compat.go @@ -160,6 +160,7 @@ func Compact(dst *bytes.Buffer, src []byte) error { start := dst.Len() err := reencode(dst, src, lowmemjson.ReEncoderConfig{ Compact: true, + InvalidUTF8: lowmemjson.InvalidUTF8Preserve, BackslashEscape: lowmemjson.EscapePreserve, }) if err != nil { @@ -173,6 +174,7 @@ func Indent(dst *bytes.Buffer, src []byte, prefix, indent string) error { err := reencode(dst, src, lowmemjson.ReEncoderConfig{ Indent: indent, Prefix: prefix, + InvalidUTF8: lowmemjson.InvalidUTF8Preserve, BackslashEscape: lowmemjson.EscapePreserve, }) if err != nil { @@ -183,7 +185,8 @@ func Indent(dst *bytes.Buffer, src []byte, prefix, indent string) error { func Valid(data []byte) bool { formatter := lowmemjson.NewReEncoder(io.Discard, lowmemjson.ReEncoderConfig{ - Compact: true, + Compact: true, + InvalidUTF8: lowmemjson.InvalidUTF8Error, }) if _, err := formatter.Write(data); err != nil { return false diff --git a/compat/json/compat_test.go b/compat/json/compat_test.go index d513c27..d989a4d 100644 --- a/compat/json/compat_test.go +++ b/compat/json/compat_test.go @@ -18,10 +18,11 @@ func TestCompatValid(t *testing.T) { Exp bool } testcases := map[string]testcase{ - "empty": {In: ``, Exp: false}, - "num": {In: `1`, Exp: true}, - "trunc": {In: `{`, Exp: false}, - "object": {In: `{}`, Exp: true}, + "empty": {In: ``, Exp: false}, + "num": {In: `1`, Exp: true}, + "trunc": {In: `{`, Exp: false}, + "object": {In: `{}`, Exp: true}, + "non-utf8": {In: "\"\x85\xcd\"", Exp: false}, // https://github.com/golang/go/issues/58517 } for tcName, tc := range testcases { tc := tc @@ -42,8 +43,9 @@ func TestCompatCompact(t *testing.T) { Err string } testcases := map[string]testcase{ - "trunc": {In: `{`, Out: ``, Err: `unexpected end of 
JSON input`}, - "object": {In: `{}`, Out: `{}`}, + "trunc": {In: `{`, Out: ``, Err: `unexpected end of JSON input`}, + "object": {In: `{}`, Out: `{}`}, + "non-utf8": {In: "\"\x85\xcd\"", Out: "\"\x85\xcd\""}, } for tcName, tc := range testcases { tc := tc @@ -70,8 +72,9 @@ func TestCompatIndent(t *testing.T) { Err string } testcases := map[string]testcase{ - "trunc": {In: `{`, Out: ``, Err: `unexpected end of JSON input`}, - "object": {In: `{}`, Out: `{}`}, + "trunc": {In: `{`, Out: ``, Err: `unexpected end of JSON input`}, + "object": {In: `{}`, Out: `{}`}, + "non-utf8": {In: "\"\x85\xcd\"", Out: "\"\x85\xcd\""}, } for tcName, tc := range testcases { tc := tc @@ -89,3 +92,29 @@ func TestCompatIndent(t *testing.T) { }) } } + +func TestCompatMarshal(t *testing.T) { + t.Parallel() + type testcase struct { + In any + Out string + Err string + } + testcases := map[string]testcase{ + "non-utf8": {In: "\x85\xcd", Out: "\"\\ufffd\\ufffd\""}, + "urc": {In: "\ufffd", Out: "\"\ufffd\""}, + } + for tcName, tc := range testcases { + tc := tc + t.Run(tcName, func(t *testing.T) { + t.Parallel() + out, err := Marshal(tc.In) + assert.Equal(t, tc.Out, string(out)) + if tc.Err == "" { + assert.NoError(t, err) + } else { + assert.EqualError(t, err, tc.Err) + } + }) + } +} diff --git a/compat/json/testcompat_test.go b/compat/json/testcompat_test.go index 42cbf5c..e89b4b4 100644 --- a/compat/json/testcompat_test.go +++ b/compat/json/testcompat_test.go @@ -8,6 +8,7 @@ import ( "bytes" "encoding/json" "io" + "reflect" _ "unsafe" "git.lukeshu.com/go/lowmemjson" @@ -59,13 +60,13 @@ type encodeState struct { } func (es *encodeState) string(str string, _ bool) { - if err := jsonstring.EncodeStringFromString(&es.Buffer, lowmemjson.EscapeDefault, str); err != nil { + if err := jsonstring.EncodeStringFromString(&es.Buffer, lowmemjson.EscapeDefault, 0, reflect.Value{}, str); err != nil { panic(err) } } func (es *encodeState) stringBytes(str []byte, _ bool) { - if err := 
jsonstring.EncodeStringFromBytes(&es.Buffer, lowmemjson.EscapeDefault, str); err != nil { + if err := jsonstring.EncodeStringFromBytes(&es.Buffer, lowmemjson.EscapeDefault, 0, reflect.Value{}, str); err != nil { panic(err) } } diff --git a/encode.go b/encode.go index 00d3dad..684cc75 100644 --- a/encode.go +++ b/encode.go @@ -87,7 +87,7 @@ func (enc *Encoder) Encode(obj any) (err error) { if escaper == nil { escaper = EscapeDefault } - if err := encode(enc.w, reflect.ValueOf(obj), escaper, false, 0, map[any]struct{}{}); err != nil { + if err := encode(enc.w, reflect.ValueOf(obj), escaper, enc.w.utf, false, 0, map[any]struct{}{}); err != nil { if rwe, ok := err.(*ReEncodeWriteError); ok { err = &EncodeWriteError{ Err: rwe.Err, @@ -108,7 +108,7 @@ func discardInt(_ int, err error) error { const startDetectingCyclesAfter = 1000 -func encode(w *ReEncoder, val reflect.Value, escaper BackslashEscaper, quote bool, cycleDepth uint, cycleSeen map[any]struct{}) error { +func encode(w *ReEncoder, val reflect.Value, escaper BackslashEscaper, utf InvalidUTF8Mode, quote bool, cycleDepth uint, cycleSeen map[any]struct{}) error { if !val.IsValid() { return discardInt(w.WriteString("null")) } @@ -197,7 +197,7 @@ func encode(w *ReEncoder, val reflect.Value, escaper BackslashEscaper, quote boo Err: err, } } - if err := jsonstring.EncodeStringFromBytes(w, escaper, text); err != nil { + if err := jsonstring.EncodeStringFromBytes(w, escaper, utf, val, text); err != nil { return err } default: @@ -309,14 +309,14 @@ func encode(w *ReEncoder, val reflect.Value, escaper BackslashEscaper, quote boo } else { if quote { var buf bytes.Buffer - if err := jsonstring.EncodeStringFromString(&buf, escaper, val.String()); err != nil { + if err := jsonstring.EncodeStringFromString(&buf, escaper, utf, val, val.String()); err != nil { return err } - if err := jsonstring.EncodeStringFromBytes(w, escaper, buf.Bytes()); err != nil { + if err := jsonstring.EncodeStringFromBytes(w, escaper, utf, val, 
buf.Bytes()); err != nil { return err } } else { - if err := jsonstring.EncodeStringFromString(w, escaper, val.String()); err != nil { + if err := jsonstring.EncodeStringFromString(w, escaper, utf, val, val.String()); err != nil { return err } } @@ -327,7 +327,7 @@ func encode(w *ReEncoder, val reflect.Value, escaper BackslashEscaper, quote boo return err } } else { - if err := encode(w, val.Elem(), escaper, quote, cycleDepth, cycleSeen); err != nil { + if err := encode(w, val.Elem(), escaper, utf, quote, cycleDepth, cycleSeen); err != nil { return err } } @@ -350,13 +350,13 @@ func encode(w *ReEncoder, val reflect.Value, escaper BackslashEscaper, quote boo } } empty = false - if err := jsonstring.EncodeStringFromString(w, escaper, field.Name); err != nil { + if err := jsonstring.EncodeStringFromString(w, escaper, utf, val, field.Name); err != nil { return err } if err := w.WriteByte(':'); err != nil { return err } - if err := encode(w, fVal, escaper, field.Quote, cycleDepth, cycleSeen); err != nil { + if err := encode(w, fVal, escaper, utf, field.Quote, cycleDepth, cycleSeen); err != nil { return err } } @@ -394,7 +394,7 @@ func encode(w *ReEncoder, val reflect.Value, escaper BackslashEscaper, quote boo for i := 0; iter.Next(); i++ { // TODO: Avoid buffering the map key var k strings.Builder - if err := encode(NewReEncoder(&k, ReEncoderConfig{BackslashEscape: escaper}), iter.Key(), escaper, false, cycleDepth, cycleSeen); err != nil { + if err := encode(NewReEncoder(&k, ReEncoderConfig{BackslashEscape: escaper, InvalidUTF8: utf}), iter.Key(), escaper, utf, false, cycleDepth, cycleSeen); err != nil { return err } kStr := k.String() @@ -403,7 +403,7 @@ func encode(w *ReEncoder, val reflect.Value, escaper BackslashEscaper, quote boo } if !strings.HasPrefix(kStr, `"`) { k.Reset() - if err := jsonstring.EncodeStringFromString(&k, escaper, kStr); err != nil { + if err := jsonstring.EncodeStringFromString(&k, escaper, utf, iter.Key(), kStr); err != nil { return err } kStr 
= k.String() @@ -427,7 +427,7 @@ func encode(w *ReEncoder, val reflect.Value, escaper BackslashEscaper, quote boo if err := w.WriteByte(':'); err != nil { return err } - if err := encode(w, kv.V, escaper, false, cycleDepth, cycleSeen); err != nil { + if err := encode(w, kv.V, escaper, utf, false, cycleDepth, cycleSeen); err != nil { return err } } @@ -491,12 +491,12 @@ func encode(w *ReEncoder, val reflect.Value, escaper BackslashEscaper, quote boo cycleSeen[ptr] = struct{}{} defer delete(cycleSeen, ptr) } - if err := encodeArray(w, val, escaper, cycleDepth, cycleSeen); err != nil { + if err := encodeArray(w, val, escaper, utf, cycleDepth, cycleSeen); err != nil { return err } } case reflect.Array: - if err := encodeArray(w, val, escaper, cycleDepth, cycleSeen); err != nil { + if err := encodeArray(w, val, escaper, utf, cycleDepth, cycleSeen); err != nil { return err } case reflect.Pointer: @@ -516,7 +516,7 @@ func encode(w *ReEncoder, val reflect.Value, escaper BackslashEscaper, quote boo cycleSeen[ptr] = struct{}{} defer delete(cycleSeen, ptr) } - if err := encode(w, val.Elem(), escaper, quote, cycleDepth, cycleSeen); err != nil { + if err := encode(w, val.Elem(), escaper, utf, quote, cycleDepth, cycleSeen); err != nil { return err } } @@ -529,7 +529,7 @@ func encode(w *ReEncoder, val reflect.Value, escaper BackslashEscaper, quote boo return nil } -func encodeArray(w *ReEncoder, val reflect.Value, escaper BackslashEscaper, cycleDepth uint, cycleSeen map[any]struct{}) error { +func encodeArray(w *ReEncoder, val reflect.Value, escaper BackslashEscaper, utf InvalidUTF8Mode, cycleDepth uint, cycleSeen map[any]struct{}) error { if err := w.WriteByte('['); err != nil { return err } @@ -540,7 +540,7 @@ func encodeArray(w *ReEncoder, val reflect.Value, escaper BackslashEscaper, cycl return err } } - if err := encode(w, val.Index(i), escaper, false, cycleDepth, cycleSeen); err != nil { + if err := encode(w, val.Index(i), escaper, utf, false, cycleDepth, cycleSeen); err != 
nil { return err } } diff --git a/encode_escape.go b/encode_escape.go index 97da6e9..c9e2bc9 100644 --- a/encode_escape.go +++ b/encode_escape.go @@ -6,12 +6,29 @@ package lowmemjson import ( "fmt" - "unicode/utf8" "git.lukeshu.com/go/lowmemjson/internal/jsonstring" ) -// BackslashEscapeMode identifies one of the three ways that a +// InvalidUTF8Mode identifies one of the 3 ways that an Encoder or +// ReEncoder can behave when encountering invalid UTF-8 in a string +// value: +// +// - Replace the byte with the Unicode replacement character U+FFFD. +// +// - Allow the byte through to the string-encoder, with an +// escape-mode of BackslashEscapeRawByte. +// +// - Emit a syntax error. +type InvalidUTF8Mode = jsonstring.InvalidUTF8Mode + +const ( + InvalidUTF8Replace = jsonstring.InvalidUTF8Replace + InvalidUTF8Preserve = jsonstring.InvalidUTF8Preserve + InvalidUTF8Error = jsonstring.InvalidUTF8Error +) + +// BackslashEscapeMode identifies one of the four ways that a // character may be represented in a JSON string: // // - literally (no backslash escaping) @@ -20,12 +37,18 @@ import ( // single-character) // // - as a long Unicode `\uXXXX` backslash sequence +// +// - as a raw byte; this allows you to emit invalid JSON; JSON must +// be valid UTF-8, but this allows you to emit arbitrary binary +// data. If the character does not satisfy `utf8.RuneSelf <= char +// <= 0xFF`, then the encoder will panic. type BackslashEscapeMode = jsonstring.BackslashEscapeMode const ( BackslashEscapeNone = jsonstring.BackslashEscapeNone BackslashEscapeShort = jsonstring.BackslashEscapeShort BackslashEscapeUnicode = jsonstring.BackslashEscapeUnicode + BackslashEscapeRawByte = jsonstring.BackslashEscapeRawByte ) func hexToInt(c byte) rune { @@ -96,14 +119,13 @@ func EscapeHTMLSafe(c rune, wasEscaped BackslashEscapeMode) BackslashEscapeMode // behavior of encoding/json. 
// // It is like EscapeHTMLSafe, but also uses long Unicode `\uXXXX` -// sequences for `\b`, `\f`, and the `\uFFFD` Unicode replacement -// character. +// sequences for `\b` and `\f`. // // A ReEncoder uses EscapeDefault if a BackslashEscaper is not // specified. func EscapeDefault(c rune, wasEscaped BackslashEscapeMode) BackslashEscapeMode { switch c { - case '\b', '\f', utf8.RuneError: + case '\b', '\f': return BackslashEscapeUnicode default: return EscapeHTMLSafe(c, wasEscaped) @@ -115,11 +137,10 @@ func EscapeDefault(c rune, wasEscaped BackslashEscapeMode) BackslashEscapeMode { // SetEscapeHTML(false) called on it. // // It is like EscapeJSSafe, but also uses long Unicode `\uXXXX` -// sequences for `\b`, `\f`, and the `\uFFFD` Unicode replacement -// character. +// sequences for `\b` and `\f`. func EscapeDefaultNonHTMLSafe(c rune, wasEscaped BackslashEscapeMode) BackslashEscapeMode { switch c { - case '\b', '\f', utf8.RuneError: + case '\b', '\f': return BackslashEscapeUnicode default: return EscapeJSSafe(c, wasEscaped) diff --git a/errors.go b/errors.go index 516018c..da9de4d 100644 --- a/errors.go +++ b/errors.go @@ -142,7 +142,8 @@ func (e *EncodeWriteError) Unwrap() error { return e.Err } type EncodeTypeError = json.UnsupportedTypeError // An EncodeValueError is returned by Encode when attempting to encode -// an unsupported value (such as a datastructure with a cycle). +// an unsupported value (such as a datastructure with a cycle, or (if +// InvalidUTF8=InvalidUTF8Error) a string with invalid UTF-8). 
// // type UnsupportedValueError struct { // Value reflect.Value diff --git a/internal/jsonstring/encode_string.go b/internal/jsonstring/encode_string.go index fec2cc0..76bbb38 100644 --- a/internal/jsonstring/encode_string.go +++ b/internal/jsonstring/encode_string.go @@ -5,14 +5,25 @@ package jsonstring import ( + "encoding/json" "fmt" "io" + "reflect" "unicode/utf8" "git.lukeshu.com/go/lowmemjson/internal/fastio" "git.lukeshu.com/go/lowmemjson/internal/fastio/noescape" ) +// InvalidUTF8Mode is described in the main lowmemjson package docs. +type InvalidUTF8Mode uint8 + +const ( + InvalidUTF8Replace InvalidUTF8Mode = iota + InvalidUTF8Preserve + InvalidUTF8Error +) + // BackslashEscapeMode is describe in the main lowmemjson package // docs. type BackslashEscapeMode uint8 @@ -21,6 +32,7 @@ const ( BackslashEscapeNone BackslashEscapeMode = iota BackslashEscapeShort BackslashEscapeUnicode + BackslashEscapeRawByte ) // BackslashEscaper is describe in the main lowmemjson package docs. @@ -96,19 +108,45 @@ func WriteStringChar(w fastio.AllWriter, c rune, escape BackslashEscapeMode) err default: // obey return writeStringUnicodeEscape(w, c) } + case BackslashEscapeRawByte: + switch { + case c < utf8.RuneSelf: + panic(fmt.Errorf("escaper returned BackslashEscapeRawByte for a character=%q < utf8.RuneSelf", c)) + case c > 0xFF: + panic(fmt.Errorf("escaper returned BackslashEscapeRawByte for a character=%q > 0xFF", c)) + default: + return w.WriteByte(byte(c)) + } default: - panic("escaper returned an invalid escape mode") + panic(fmt.Errorf("escaper returned an invalid escape mode=%d", escape)) } } -func EncodeStringFromString(w fastio.AllWriter, escaper BackslashEscaper, str string) error { +func EncodeStringFromString(w fastio.AllWriter, escaper BackslashEscaper, utf InvalidUTF8Mode, val reflect.Value, str string) error { if err := w.WriteByte('"'); err != nil { return err } - for _, c := range str { - if err := WriteStringChar(w, c, escaper(c, BackslashEscapeNone)); err 
!= nil { + for i := 0; i < len(str); { + escaped := BackslashEscapeNone + c, size := utf8.DecodeRuneInString(str[i:]) + if c == utf8.RuneError && size == 1 { + switch utf { + case InvalidUTF8Replace: + escaped = BackslashEscapeUnicode + case InvalidUTF8Preserve: + escaped = BackslashEscapeRawByte + c = rune(str[i]) + case InvalidUTF8Error: + return &json.UnsupportedValueError{ + Value: val, + Str: fmt.Sprintf("invalid UTF-8 at byte offset %d: %#02x", i, str[i]), + } + } + } + if err := WriteStringChar(w, c, escaper(c, escaped)); err != nil { return err } + i += size } if err := w.WriteByte('"'); err != nil { return err @@ -116,13 +154,28 @@ func EncodeStringFromString(w fastio.AllWriter, escaper BackslashEscaper, str st return nil } -func EncodeStringFromBytes(w fastio.AllWriter, escaper BackslashEscaper, str []byte) error { +func EncodeStringFromBytes(w fastio.AllWriter, escaper BackslashEscaper, utf InvalidUTF8Mode, val reflect.Value, str []byte) error { if err := w.WriteByte('"'); err != nil { return err } for i := 0; i < len(str); { + escaped := BackslashEscapeNone c, size := utf8.DecodeRune(str[i:]) - if err := WriteStringChar(w, c, escaper(c, BackslashEscapeNone)); err != nil { + if c == utf8.RuneError && size == 1 { + switch utf { + case InvalidUTF8Replace: + escaped = BackslashEscapeUnicode + case InvalidUTF8Preserve: + escaped = BackslashEscapeRawByte + c = rune(str[i]) + case InvalidUTF8Error: + return &json.UnsupportedValueError{ + Value: val, + Str: fmt.Sprintf("invalid UTF-8 at byte offset %d: %#02x", i, str[i]), + } + } + } + if err := WriteStringChar(w, c, escaper(c, escaped)); err != nil { return err } i += size diff --git a/reencode.go b/reencode.go index fd848f8..1a9999b 100644 --- a/reencode.go +++ b/reencode.go @@ -54,6 +54,13 @@ type ReEncoderConfig struct { // this is different than the usual behavior. 
ForceTrailingNewlines bool + // A JSON document is specified to be a sequence of Unicode + // codepoints; InvalidUTF8 controls how the *ReEncoder behaves + // when it encounters invalid UTF-8 bytes in a JSON string + // (i.e. the string is not representable as a sequence of + // Unicode codepoints, and thus the document is invalid JSON). + InvalidUTF8 InvalidUTF8Mode + // Returns whether a given character in a string should be // backslash-escaped. The bool argument is whether it was // \u-escaped in the input. This does not affect characters @@ -119,6 +126,7 @@ func NewReEncoder(out io.Writer, cfg ReEncoderConfig) *ReEncoder { return &ReEncoder{ out: module, esc: escaper, + utf: cfg.InvalidUTF8, allowMultipleValues: cfg.AllowMultipleValues, } } @@ -134,6 +142,7 @@ func NewReEncoder(out io.Writer, cfg ReEncoderConfig) *ReEncoder { type ReEncoder struct { out reEncoderModule esc BackslashEscaper + utf InvalidUTF8Mode allowMultipleValues bool // state: .Write's/.WriteString's/.WriteRune's utf8-decoding buffer @@ -169,43 +178,54 @@ var ( _ io.Closer = (*ReEncoder)(nil) ) -func (enc *ReEncoder) getRuneFromBytes(str []byte, pos int) (c rune, size int, full bool) { +func (enc *ReEncoder) getRuneFromBytes(str []byte, pos int) (c rune, size int, full, isRune bool) { + var tmp []byte if pos < enc.bufLen { - var tmp [utf8.UTFMax]byte - n := copy(tmp[:], enc.buf[pos:enc.bufLen]) - n += copy(tmp[n:], str) - c, size := utf8.DecodeRune(tmp[:n]) - if c == utf8.RuneError && size <= 1 { - return c, size, utf8.FullRune(tmp[:n]) - } - return c, size, true + var buf [utf8.UTFMax]byte + n := copy(buf[:], enc.buf[pos:enc.bufLen]) + n += copy(buf[n:], str) + tmp = buf[:n] } else { - tmp := str[pos-enc.bufLen:] - c, size := utf8.DecodeRune(tmp) - if c == utf8.RuneError && size <= 1 { - return c, size, utf8.FullRune(tmp) - } - return c, size, true + tmp = str[pos-enc.bufLen:] + } + c, size = utf8.DecodeRune(tmp) + switch { + case c == utf8.RuneError && size <= 1 && !utf8.FullRune(tmp): + 
return c, size, false, true + case c == utf8.RuneError && size == 1 && enc.utf != InvalidUTF8Replace: + return rune(tmp[0]), 1, true, false + default: + return c, size, true, true } } -func (enc *ReEncoder) getRuneFromString(str string, pos int) (c rune, size int, full bool) { +func (enc *ReEncoder) getRuneFromString(str string, pos int) (c rune, size int, full, isRune bool) { if pos < enc.bufLen { - var tmp [utf8.UTFMax]byte - n := copy(tmp[:], enc.buf[pos:enc.bufLen]) - n += copy(tmp[n:], str) - c, size := utf8.DecodeRune(tmp[:n]) - if c == utf8.RuneError && size <= 1 { - return c, size, utf8.FullRune(tmp[:n]) + var buf [utf8.UTFMax]byte + var tmp []byte + n := copy(buf[:], enc.buf[pos:enc.bufLen]) + n += copy(buf[n:], str) + tmp = buf[:n] + c, size = utf8.DecodeRune(tmp) + switch { + case c == utf8.RuneError && size <= 1 && !utf8.FullRune(tmp): + return c, size, false, true + case c == utf8.RuneError && size == 1 && enc.utf != InvalidUTF8Replace: + return rune(tmp[0]), 1, true, false + default: + return c, size, true, true } - return c, size, true } else { tmp := str[pos-enc.bufLen:] c, size := utf8.DecodeRuneInString(tmp) - if c == utf8.RuneError && size <= 1 { - return c, size, utf8.FullRuneInString(tmp) + switch { + case c == utf8.RuneError && size <= 1 && !utf8.FullRuneInString(tmp): + return c, size, false, true + case c == utf8.RuneError && size == 1 && enc.utf != InvalidUTF8Replace: + return rune(tmp[0]), 1, true, false + default: + return c, size, true, true } - return c, size, true } } @@ -223,7 +243,7 @@ func (enc *ReEncoder) Write(str []byte) (int, error) { } var n int for { - c, size, full := enc.getRuneFromBytes(str, n) + c, size, full, isRune := enc.getRuneFromBytes(str, n) if !full { if n < enc.bufLen { l := copy(enc.buf[:], enc.buf[n:enc.bufLen]) @@ -234,7 +254,13 @@ func (enc *ReEncoder) Write(str []byte) (int, error) { } return len(str), nil } - enc.handleRune(c, size) + if enc.utf == InvalidUTF8Error && !isRune { + return n, 
&ReEncodeSyntaxError{ + Offset: enc.inputPos, + Err: fmt.Errorf("invalid UTF-8: %#02x", c), + } + } + enc.handleRune(c, size, isRune) if enc.err != nil { return n, enc.err } @@ -250,7 +276,7 @@ func (enc *ReEncoder) WriteString(str string) (int, error) { } var n int for { - c, size, full := enc.getRuneFromString(str, n) + c, size, full, isRune := enc.getRuneFromString(str, n) if !full { if n < enc.bufLen { l := copy(enc.buf[:], enc.buf[n:enc.bufLen]) @@ -261,7 +287,13 @@ func (enc *ReEncoder) WriteString(str string) (int, error) { } return len(str), nil } - enc.handleRune(c, size) + if enc.utf == InvalidUTF8Error && !isRune { + return n, &ReEncodeSyntaxError{ + Offset: enc.inputPos, + Err: fmt.Errorf("invalid UTF-8: %#02x", c), + } + } + enc.handleRune(c, size, isRune) if enc.err != nil { return n, enc.err } @@ -298,7 +330,7 @@ func (enc *ReEncoder) Close() error { return enc.err } if len(enc.barriers) == 0 { - if err := enc.handleRuneType(0, jsonparse.RuneTypeEOF, enc.stackSize()); err != nil { + if err := enc.handleRuneType(0, jsonparse.RuneTypeEOF, enc.stackSize(), true); err != nil { enc.err = &ReEncodeWriteError{ Err: err, Offset: enc.inputPos, @@ -312,7 +344,8 @@ func (enc *ReEncoder) Close() error { return nil } -func (enc *ReEncoder) handleRune(c rune, size int) { +// isRune=false indicates that 'c' is a raw byte from invalid UTF-8. 
+func (enc *ReEncoder) handleRune(c rune, size int, isRune bool) { t, err := enc.par.HandleRune(c) if err != nil { enc.err = &ReEncodeSyntaxError{ @@ -321,7 +354,7 @@ func (enc *ReEncoder) handleRune(c rune, size int) { } return } - if err := enc.handleRuneType(c, t, enc.stackSize()); err != nil { + if err := enc.handleRuneType(c, t, enc.stackSize(), isRune); err != nil { enc.err = &ReEncodeWriteError{ Err: err, Offset: enc.inputPos, @@ -370,7 +403,7 @@ func (enc *ReEncoder) stackSize() int { return sz } -func (enc *ReEncoder) handleRuneType(c rune, t jsonparse.RuneType, stackSize int) error { +func (enc *ReEncoder) handleRuneType(c rune, t jsonparse.RuneType, stackSize int, isRune bool) error { switch t { case jsonparse.RuneTypeStringEsc, jsonparse.RuneTypeStringEscU: return nil @@ -410,6 +443,10 @@ func (enc *ReEncoder) handleRuneType(c rune, t jsonparse.RuneType, stackSize int if t > jsonparse.RuneTypeEOF { panic(fmt.Errorf("should not happen: handleRune called with %#v", t)) } - return enc.out.HandleRune(c, t, BackslashEscapeNone, stackSize) + esc := BackslashEscapeNone + if !isRune { + esc = BackslashEscapeRawByte + } + return enc.out.HandleRune(c, t, esc, stackSize) } } -- cgit v1.2.3