encode, reencode: Fix handling of invalid UTF-8

author: Luke Shumaker <lukeshu@lukeshu.com> 2023-02-14 22:36:25 -0700
committer: Luke Shumaker <lukeshu@lukeshu.com> 2023-02-18 22:45:54 -0700
commit: dfc67cecbd95344d296c31b537fa3ae8aec8c292 (patch)
tree: 1e2e820cbd288d1ebef7b0e9dea14a07e2f33fc5 /encode_escape.go
parent: 38989a9c4f69abfe04c3eb4ec3382be88802141c (diff)
1 files changed, 29 insertions, 8 deletions
diff --git a/encode_escape.go b/encode_escape.go
index 97da6e9..c9e2bc9 100644
--- a/encode_escape.go
+++ b/encode_escape.go
@@ -6,12 +6,29 @@ package lowmemjson
 
 import (
 	"fmt"
-	"unicode/utf8"
 
 	"git.lukeshu.com/go/lowmemjson/internal/jsonstring"
 )
 
-// BackslashEscapeMode identifies one of the three ways that a
+// InvalidUTF8Mode identifies one of the three ways that an Encoder or
+// ReEncoder can behave when encountering invalid UTF-8 in a string
+// value:
+//
+//   - Replace the byte with the Unicode replacement character U+FFFD.
+//
+//   - Allow the byte through to the string-encoder, with an
+//     escape-mode of BackslashEscapeRawByte.
+//
+//   - Emit a syntax error.
+type InvalidUTF8Mode = jsonstring.InvalidUTF8Mode
+
+const (
+	InvalidUTF8Replace  = jsonstring.InvalidUTF8Replace
+	InvalidUTF8Preserve = jsonstring.InvalidUTF8Preserve
+	InvalidUTF8Error    = jsonstring.InvalidUTF8Error
+)
+
+// BackslashEscapeMode identifies one of the four ways that a
 // character may be represented in a JSON string:
 //
 //   - literally (no backslash escaping)
@@ -20,12 +37,18 @@ import (
 //     single-character)
 //
 //   - as a long Unicode `\uXXXX` backslash sequence
+//
+//   - as a raw byte; JSON must be valid UTF-8, so this mode lets you
+//     emit invalid JSON containing arbitrary binary data.  If the
+//     character does not satisfy `utf8.RuneSelf <= char <= 0xFF`,
+//     then the encoder will panic.
 type BackslashEscapeMode = jsonstring.BackslashEscapeMode
 
 const (
 	BackslashEscapeNone    = jsonstring.BackslashEscapeNone
 	BackslashEscapeShort   = jsonstring.BackslashEscapeShort
 	BackslashEscapeUnicode = jsonstring.BackslashEscapeUnicode
+	BackslashEscapeRawByte = jsonstring.BackslashEscapeRawByte
 )
 
 func hexToInt(c byte) rune {
@@ -96,14 +119,13 @@ func EscapeHTMLSafe(c rune, wasEscaped BackslashEscapeMode) BackslashEscapeMode
 // behavior of encoding/json.
 //
 // It is like EscapeHTMLSafe, but also uses long Unicode `\uXXXX`
-// sequences for `\b`, `\f`, and the `\uFFFD` Unicode replacement
-// character.
+// sequences for `\b` and `\f`.
 //
 // A ReEncoder uses EscapeDefault if a BackslashEscaper is not
 // specified.
 func EscapeDefault(c rune, wasEscaped BackslashEscapeMode) BackslashEscapeMode {
 	switch c {
-	case '\b', '\f', utf8.RuneError:
+	case '\b', '\f':
 		return BackslashEscapeUnicode
 	default:
 		return EscapeHTMLSafe(c, wasEscaped)
@@ -115,11 +137,10 @@ func EscapeDefault(c rune, wasEscaped BackslashEscapeMode) BackslashEscapeMode {
 // SetEscapeHTML(false) called on it.
 //
 // It is like EscapeJSSafe, but also uses long Unicode `\uXXXX`
-// sequences for `\b`, `\f`, and the `\uFFFD` Unicode replacement
-// character.
+// sequences for `\b` and `\f`.
 func EscapeDefaultNonHTMLSafe(c rune, wasEscaped BackslashEscapeMode) BackslashEscapeMode {
 	switch c {
-	case '\b', '\f', utf8.RuneError:
+	case '\b', '\f':
 		return BackslashEscapeUnicode
 	default:
 		return EscapeJSSafe(c, wasEscaped)
author	Luke Shumaker <lukeshu@lukeshu.com>	2023-02-14 22:36:25 -0700
committer	Luke Shumaker <lukeshu@lukeshu.com>	2023-02-18 22:45:54 -0700
commit	dfc67cecbd95344d296c31b537fa3ae8aec8c292 (patch)
tree	1e2e820cbd288d1ebef7b0e9dea14a07e2f33fc5 /encode_escape.go
parent	38989a9c4f69abfe04c3eb4ec3382be88802141c (diff)