// Copyright (C) 2022-2023 Luke Shumaker // // SPDX-License-Identifier: GPL-2.0-or-later package lowmemjson import ( "fmt" "git.lukeshu.com/go/lowmemjson/internal/jsonstring" ) // InvalidUTF8Mode identifies one of the 3 ways that an Encoder or // ReEncoder can behave when encountering invalid UTF-8 in a string // value: // // - Replace the byte with the Unicode replacement character U+FFFD. // // - Allow the byte through to the string-encoder, with an // escape-mode of BackslashEscapeRawByte. // // - Emit a syntax error. type InvalidUTF8Mode = jsonstring.InvalidUTF8Mode const ( InvalidUTF8Replace = jsonstring.InvalidUTF8Replace InvalidUTF8Preserve = jsonstring.InvalidUTF8Preserve InvalidUTF8Error = jsonstring.InvalidUTF8Error ) // BackslashEscapeMode identifies one of the four ways that a // character may be represented in a JSON string: // // - literally (no backslash escaping) // // - as a short "well-known" `\X` backslash sequence (where `X` is a // single-character) // // - as a long Unicode `\uXXXX` backslash sequence (with 16 // permutations of capitalization) // // - as a raw byte; this allows you to emit invalid JSON; JSON must // be valid UTF-8, but this allows you to emit arbitrary binary // data. If the character does not satisfy `utf8.RuneSelf <= char // <= 0xFF`, then the encoder will panic. type BackslashEscapeMode = jsonstring.BackslashEscapeMode const ( BackslashEscapeNone = jsonstring.BackslashEscapeNone BackslashEscapeShort = jsonstring.BackslashEscapeShort BackslashEscapeRawByte = jsonstring.BackslashEscapeRawByte BackslashEscapeUnicodeXXXX = jsonstring.BackslashEscapeUnicodeXXXX BackslashEscapeUnicodeXXXx = jsonstring.BackslashEscapeUnicodeXXXx BackslashEscapeUnicodeXXxX = jsonstring.BackslashEscapeUnicodeXXxX BackslashEscapeUnicodeXXxx = jsonstring.BackslashEscapeUnicodeXXxx BackslashEscapeUnicodeXxXX = jsonstring.BackslashEscapeUnicodeXxXX BackslashEscapeUnicodeXxXx = jsonstring.BackslashEscapeUnicodeXxXx BackslashEscapeUnicodeXxxX = jsonstring.BackslashEscapeUnicodeXxxX BackslashEscapeUnicodeXxxx = jsonstring.BackslashEscapeUnicodeXxxx BackslashEscapeUnicodexXXX = jsonstring.BackslashEscapeUnicodexXXX BackslashEscapeUnicodexXXx = jsonstring.BackslashEscapeUnicodexXXx BackslashEscapeUnicodexXxX = jsonstring.BackslashEscapeUnicodexXxX BackslashEscapeUnicodexXxx = jsonstring.BackslashEscapeUnicodexXxx BackslashEscapeUnicodexxXX = jsonstring.BackslashEscapeUnicodexxXX BackslashEscapeUnicodexxXx = jsonstring.BackslashEscapeUnicodexxXx BackslashEscapeUnicodexxxX = jsonstring.BackslashEscapeUnicodexxxX BackslashEscapeUnicodexxxx = jsonstring.BackslashEscapeUnicodexxxx BackslashEscapeUnicodeMin = jsonstring.BackslashEscapeUnicodeMin BackslashEscapeUnicodeMax = jsonstring.BackslashEscapeUnicodeMax BackslashEscapeUnicode = jsonstring.BackslashEscapeUnicode // back-compat ) func hexToInt(c byte) rune { switch { case '0' <= c && c <= '9': return rune(c) - '0' case 'a' <= c && c <= 'f': return rune(c) - 'a' + 10 case 'A' <= c && c <= 'F': return rune(c) - 'A' + 10 default: panic(fmt.Errorf("should not happen: invalid hex char: %q", c)) } } func hexToRune(a, b, c, d byte) rune { return 0 | hexToInt(a)<<12 | hexToInt(b)<<8 | hexToInt(c)<<4 | hexToInt(d)<<0 } func hexToMode(a, b, c, d byte) BackslashEscapeMode { // The 0b0010_0000 bit is the ASCII "lowercase bit". return BackslashEscapeUnicodeMin + BackslashEscapeMode(0| ((a&0b0010_0000)>>2)| ((b&0b0010_0000)>>3)| ((c&0b0010_0000)>>4)| ((d&0b0010_0000)>>5)) } // A BackslashEscaper controls how a ReEncoder emits a character in a // JSON string. The `rune` argument is the character being // considered, and the `BackslashEscapeMode` argument is how it was // originally encoded in the input. // // The ReEncoder will panic if a BackslashEscaper returns an unknown // BackslashEscapeMode. However, a BackslashEscaper should be // permissive of BackslashEscapeModes it doesn't recognize; it is safe // to just return them unmodified. type BackslashEscaper = func(rune, BackslashEscapeMode) BackslashEscapeMode // EscapePreserve is a BackslashEscaper that preserves the original // input escaping. func EscapePreserve(_ rune, wasEscaped BackslashEscapeMode) BackslashEscapeMode { return wasEscaped } // EscapeJSSafe is a BackslashEscaper that escapes strings such that // the JSON safe to embed in JS; it otherwise preserves the original // input escaping. // // JSON is notionally a JS subset, but that's not actually true; so // more conservative backslash-escaping is necessary to safely embed // it in JS. http://timelessrepo.com/json-isnt-a-javascript-subset func EscapeJSSafe(c rune, wasEscaped BackslashEscapeMode) BackslashEscapeMode { switch c { case '\u2028', '\u2029': return BackslashEscapeUnicode default: return wasEscaped } } // EscapeHTMLSafe is a BackslashEscaper that escapes strings such that // the JSON is safe to embed in HTML; it otherwise preserves the // original input escaping. func EscapeHTMLSafe(c rune, wasEscaped BackslashEscapeMode) BackslashEscapeMode { switch c { case '&', '<', '>': return BackslashEscapeUnicode default: return EscapeJSSafe(c, wasEscaped) } } // EscapeDefault is a BackslashEscaper that mimics the default // behavior of encoding/json. // // It is like EscapeHTMLSafe, but also uses long Unicode `\uXXXX` // sequences for `\b` and `\f` // // A ReEncoder uses EscapeDefault if a BackslashEscaper is not // specified. func EscapeDefault(c rune, wasEscaped BackslashEscapeMode) BackslashEscapeMode { switch c { case '\b', '\f': return BackslashEscapeUnicode default: return EscapeHTMLSafe(c, wasEscaped) } } // EscapeDefaultNonHTMLSafe is a BackslashEscaper that mimics the // default behavior of an encoding/json.Encoder that has had // SetEscapeHTML(false) called on it. // // It is like EscapeJSSafe, but also uses long Unicode `\uXXXX` // sequences for `\b` and `\f`. func EscapeDefaultNonHTMLSafe(c rune, wasEscaped BackslashEscapeMode) BackslashEscapeMode { switch c { case '\b', '\f': return BackslashEscapeUnicode default: return EscapeJSSafe(c, wasEscaped) } }