From b7093b386f8009a1c4f35f08185826fa2545fdb4 Mon Sep 17 00:00:00 2001 From: Luke Shumaker Date: Sun, 14 Aug 2022 12:04:10 -0600 Subject: cp reencode.go parse.go # [ci-skip] --- parse.go | 598 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 598 insertions(+) create mode 100644 parse.go (limited to 'parse.go') diff --git a/parse.go b/parse.go new file mode 100644 index 0000000..50c8ba3 --- /dev/null +++ b/parse.go @@ -0,0 +1,598 @@ +// Copyright (C) 2022 Luke Shumaker +// +// SPDX-License-Identifier: GPL-2.0-or-later + +package lowmemjson + +import ( + "errors" + "fmt" + "io" + "unicode/utf8" +) + +type reencodeState func(rune) error + +type ReEncoder struct { + Out io.Writer + + // Whether to minify the JSON. + Compact bool + // String to use to indent; ignored if Compact is true. + Indent string + // String to put before indents, for testing-compat with + // encoding/json only. + prefix string + // Returns whether a given character in a string should be + // "\uXXXX" escaped. The bool argument is whether it was + // \u-escaped in the input. This does not affect characters + // that must or must-not be \u-escaped to be valid JSON. + // + // If not set, then EscapeUnicodeDefault is used. + UnicodeEscape func(rune, bool) bool + + bailAfterCurrent bool + + // state: .Write's utf8-decoding buffer + buf [utf8.UTFMax]byte + bufLen int + + // state: .WriteRune + err error + inputPos int64 + written int + stack []reencodeState + stack0IsNumber bool + curIndent int + + // state: reencodeState-specific + stateBuf []byte +} + +// public API ////////////////////////////////////////////////////////////////// + +func (enc *ReEncoder) Write(p []byte) (int, error) { + if len(p) == 0 { + return 0, nil + } + var n int + if enc.bufLen > 0 { + copy(enc.buf[enc.bufLen:], p) + c, size := utf8.DecodeRune(enc.buf[:]) + n += size - enc.bufLen + enc.bufLen = 0 + if _, err := enc.WriteRune(c); err != nil { + return 0, err + } + } + for utf8.FullRune(p[n:]) { + c, size := utf8.DecodeRune(p[n:]) + if _, err := enc.WriteRune(c); err != nil { + return n, err + } + n += size + } + enc.bufLen = copy(enc.buf[:], p[n:]) + return len(p), nil +} + +func (enc *ReEncoder) Flush() error { + if enc.bufLen > 0 { + return &SyntaxError{fmt.Sprintf("EOF: unflushed unicode garbage: %q", enc.buf[:enc.bufLen]), enc.inputPos} + } + switch len(enc.stack) { + case 0: + return nil + case 1: + if enc.stack0IsNumber { + enc.Compact = true + return enc.state('\n') + } + fallthrough + default: + return &SyntaxError{fmt.Sprintf("EOF: in the middle of a value"), enc.inputPos} + } +} + +func (enc *ReEncoder) WriteRune(c rune) (n int, err error) { + if enc.err != nil { + return 0, enc.err + } + if enc.bufLen != 0 { + enc.err = errors.New("lowmemjson.ReEncoder: cannot .WriteRune() when there is a partial rune that has been .Write()n") + return 0, enc.err + } + enc.written = 0 + enc.err = enc.state(c) + enc.inputPos += int64(utf8.RuneLen(c)) + return enc.written, enc.err +} + +// io helpers ////////////////////////////////////////////////////////////////// + +func (enc *ReEncoder) emitByte(c byte) error { + err := writeByte(enc.Out, c) + if err == nil { + enc.written++ + } + return err +} + +func (enc *ReEncoder) emit(n int, err error) error { + enc.written += n + return err +} + +func (enc *ReEncoder) nlIndent() error { + if enc.Compact || enc.Indent == "" { + return nil + } + if err := enc.emitByte('\n'); err != nil { + return err + } + if enc.prefix != "" { + if err := enc.emit(io.WriteString(enc.Out, enc.prefix)); err != nil { + return err + } + } + for i := 0; i < enc.curIndent; i++ { + if err := enc.emit(io.WriteString(enc.Out, enc.Indent)); err != nil { + return err + } + } + return nil +} + +// state helpers /////////////////////////////////////////////////////////////// + +func (enc *ReEncoder) pushState(state reencodeState, isNumber bool) { + if len(enc.stack) == 0 { + enc.stack0IsNumber = isNumber + } + enc.stack = append(enc.stack, state) +} +func (enc *ReEncoder) replaceState(state reencodeState, isNumber bool) { + if len(enc.stack) == 1 { + enc.stack0IsNumber = isNumber + } + enc.stack[len(enc.stack)-1] = state +} +func (enc *ReEncoder) popState() { + if len(enc.stack) == 1 { + enc.stack0IsNumber = false + } + enc.stack = enc.stack[:len(enc.stack)-1] +} + +var errBailedAfterCurrent = errors.New("bailed after current") + +func (enc *ReEncoder) state(c rune) error { + if len(enc.stack) == 0 { + if enc.bailAfterCurrent { + return errBailedAfterCurrent + } + enc.pushState(enc.stateAny, false) + } + return enc.stack[len(enc.stack)-1](c) +} + +// any ///////////////////////////////////////////////////////////////////////////////////////////// + +func (enc *ReEncoder) stateAny(c rune) error { + switch c { + case 0x0020, 0x000A, 0x000D, 0x0009: + if enc.Compact || enc.Indent != "" { + return nil + } + case '{': + enc.replaceState(enc.stateInEmptyObject, false) + enc.curIndent++ + case '[': + enc.replaceState(enc.stateInEmptyArray, false) + enc.curIndent++ + case '"': + enc.replaceState(enc.stateInString, false) + case '-', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': + enc.replaceState(enc.stateNumberA, true) + return enc.state(c) + case 't': + enc.replaceState(enc.stateInTrue, false) + enc.stateBuf = append(enc.stateBuf[:0], 't') + case 'f': + enc.replaceState(enc.stateInFalse, false) + enc.stateBuf = append(enc.stateBuf[:0], 'f') + case 'n': + enc.replaceState(enc.stateInNull, false) + enc.stateBuf = append(enc.stateBuf[:0], 'n') + default: + return &SyntaxError{fmt.Sprintf("any: unexpected character: %c", c), enc.inputPos} + } + return enc.emitByte(byte(c)) +} + +// object ////////////////////////////////////////////////////////////////////////////////////////// + +func (enc *ReEncoder) stateInEmptyObject(c rune) error { return enc._stateInObject(c, false) } +func (enc *ReEncoder) stateInNonEmptyObject(c rune) error { return enc._stateInObject(c, true) } +func (enc *ReEncoder) _stateInObject(c rune, nonempty bool) error { + switch c { + case 0x0020, 0x000A, 0x000D, 0x0009: + if enc.Compact || enc.Indent != "" { + return nil + } + case '"': + if err := enc.nlIndent(); err != nil { + return err + } + enc.replaceState(enc.stateInKV, false) + enc.pushState(enc.stateInString, false) + case '}': + enc.popState() + enc.curIndent-- + if nonempty { + if err := enc.nlIndent(); err != nil { + return err + } + } + default: + return &SyntaxError{fmt.Sprintf("object: unexpected character: %c", c), enc.inputPos} + } + return enc.emitByte(byte(c)) +} +func (enc *ReEncoder) stateInKV(c rune) error { + switch c { + case 0x0020, 0x000A, 0x000D, 0x0009: + if enc.Compact || enc.Indent != "" { + return nil + } + return enc.emitByte(byte(c)) + case ':': + enc.replaceState(enc.stateAfterV, false) + enc.pushState(enc.stateAny, false) + if err := enc.emitByte(byte(c)); err != nil { + return err + } + if !enc.Compact && enc.Indent != "" { + return enc.emitByte(' ') + } + return nil + default: + return &SyntaxError{fmt.Sprintf("object member: unexpected character: %c", c), enc.inputPos} + } +} +func (enc *ReEncoder) stateAfterV(c rune) error { + switch c { + case 0x0020, 0x000A, 0x000D, 0x0009: + if enc.Compact || enc.Indent != "" { + return nil + } + case ',': + enc.replaceState(enc.stateInNonEmptyObject, false) + case '}': + enc.popState() + enc.curIndent-- + if err := enc.nlIndent(); err != nil { + return err + } + default: + return &SyntaxError{fmt.Sprintf("object member: unexpected character: %c", c), enc.inputPos} + } + return enc.emitByte(byte(c)) +} + +// array /////////////////////////////////////////////////////////////////////////////////////////// + +func (enc *ReEncoder) stateInEmptyArray(c rune) error { return enc._stateInArray(c, false) } +func (enc *ReEncoder) stateInNonEmptyArray(c rune) error { return enc._stateInArray(c, true) } +func (enc *ReEncoder) _stateInArray(c rune, nonempty bool) error { + switch c { + case 0x0020, 0x000A, 0x000D, 0x0009: + if enc.Compact || enc.Indent != "" { + return nil + } + case ']': + enc.popState() + enc.curIndent-- + if nonempty { + if err := enc.nlIndent(); err != nil { + return err + } + } + default: + if err := enc.nlIndent(); err != nil { + return err + } + enc.replaceState(enc.stateAfterItem, false) + enc.pushState(enc.stateAny, false) + return enc.state(c) + } + return enc.emitByte(byte(c)) +} +func (enc *ReEncoder) stateAfterItem(c rune) error { + switch c { + case 0x0020, 0x000A, 0x000D, 0x0009: + if enc.Compact || enc.Indent != "" { + return nil + } + case ',': + enc.replaceState(enc.stateInNonEmptyArray, false) + case ']': + enc.popState() + enc.curIndent-- + if err := enc.nlIndent(); err != nil { + return err + } + default: + return &SyntaxError{fmt.Sprintf("array: unexpected character: %c", c), enc.inputPos} + } + return enc.emitByte(byte(c)) +} + +// string ////////////////////////////////////////////////////////////////////////////////////////// + +func (enc *ReEncoder) stateInString(c rune) error { + switch { + case c == '\\': + enc.replaceState(enc.stateInBackslash, false) + return nil + case c == '"': + enc.popState() + return enc.emitByte(byte(c)) + case 0x0020 <= c && c <= 0x10FFFF: + return enc.emit(writeStringChar(enc.Out, c, false, enc.UnicodeEscape)) + default: + return &SyntaxError{fmt.Sprintf("string: unexpected character: %c", c), enc.inputPos} + } +} +func (enc *ReEncoder) stateInBackslash(c rune) error { + switch c { + case '"': + enc.replaceState(enc.stateInString, false) + return enc.emit(writeStringChar(enc.Out, '"', false, enc.UnicodeEscape)) + case '\\': + enc.replaceState(enc.stateInString, false) + return enc.emit(writeStringChar(enc.Out, '\\', false, enc.UnicodeEscape)) + case '/': + enc.replaceState(enc.stateInString, false) + return enc.emit(writeStringChar(enc.Out, '/', false, enc.UnicodeEscape)) + case 'b': + enc.replaceState(enc.stateInString, false) + return enc.emit(writeStringChar(enc.Out, '\b', false, enc.UnicodeEscape)) + case 'f': + enc.replaceState(enc.stateInString, false) + return enc.emit(writeStringChar(enc.Out, '\f', false, enc.UnicodeEscape)) + case 'n': + enc.replaceState(enc.stateInString, false) + return enc.emit(writeStringChar(enc.Out, '\n', false, enc.UnicodeEscape)) + case 'r': + enc.replaceState(enc.stateInString, false) + return enc.emit(writeStringChar(enc.Out, '\r', false, enc.UnicodeEscape)) + case 't': + enc.replaceState(enc.stateInString, false) + return enc.emit(writeStringChar(enc.Out, '\t', false, enc.UnicodeEscape)) + case 'u': + enc.replaceState(enc.stateInUnicode, false) + return nil + default: + return &SyntaxError{fmt.Sprintf("string backslash sequence: unexpected character: %c", c), enc.inputPos} + } +} +func (enc *ReEncoder) stateInUnicode(c rune) error { + switch { + case '0' <= c && c <= '9': + enc.stateBuf = append(enc.stateBuf, byte(c)-'0') + case 'a' <= c && c <= 'f': + enc.stateBuf = append(enc.stateBuf, byte(c)-'a'+10) + case 'A' <= c && c <= 'F': + enc.stateBuf = append(enc.stateBuf, byte(c)-'A'+10) + default: + return &SyntaxError{fmt.Sprintf("string unicode sequence: unexpected character: %c", c), enc.inputPos} + } + if len(enc.stateBuf) == 4 { + enc.replaceState(enc.stateInString, false) + c := 0 | + rune(enc.stateBuf[0])<<12 | + rune(enc.stateBuf[1])<<8 | + rune(enc.stateBuf[2])<<4 | + rune(enc.stateBuf[3])<<0 + enc.stateBuf = enc.stateBuf[:0] + return enc.emit(writeStringChar(enc.Out, c, true, enc.UnicodeEscape)) + } + return nil +} + +// number ////////////////////////////////////////////////////////////////////////////////////////// + +// Here's a flattened drawing of the syntax diagram from www.json.org : +// +// [------------ integer ----------][-- fraction ---][-------- exponent -------] +// >─╮─────╭─╮─"0"───────╭─────────╭──╮─────────────╭──╮───────────────────────╭─> +// │ │ │ │ │ │ │ │ │ +// ╰─"-"─╯ ╰─digit 1-9─╯─╭digit╮─╯ ╰─"."─╭digit╮─╯ ╰─"e"─╭─╮─────╭─╭digit╮─╯ +// ╰──<──╯ ╰──<──╯ │ │ │ │ ╰──<──╯ +// ╰─"E"─╯ ╰─"-"─╯ +// │ │ +// ╰─"+"─╯ +// +// Now here it is slightly redrawn, and with each distinct state our +// decoder can be in marked with a single-capital-letter: +// +// [-------------- integer ------------][--------- fraction --------][--------- exponent ---------] +// >─A─╮───────╭──╮─"0"─────────C─╭─────────╮──────────────────╭─────────╮──────────────────────────╭─> +// │ │ │ │ │ │ │ │ +// ╰─"-"─B─╯ ╰─digit 1-9─╭─D─╯─digit╮ ╰─"."─E─digit──╭─F─╯─digit╮ ╰─"e"─╭─G─╮─────╭─╭digit─H─╯ +// ╰────<─────╯ ╰────<─────╯ │ │ │ │ ╰────<───╯ +// ╰─"E"─╯ ╰─"-"─╯ +// │ │ +// ╰─"+"─╯ +// +// Which state we're at is the 'X' in 'stateNumberX'. +// +// Besides just traversing that, there are a few compressions we want to make: +// +// - trim trailing 0s from fraction the (but don't remove the +// fraction if it's all 0s); do this by making the F state a little +// special. This requires a little more state, because when we +// encounter the 0 we don't yet know if it's trailing. So, store +// the number of maybe-trailing zeros in enc.stateBuf[0]; if that +// reaches 255, then bleed over to enc.stateBuf[1] and so on. +// +// - trim leading 0s from the exponent (but don't remove the exponent +// if it's all 0s); do this by making the H state a little special. +// Record whether we've seen a non-zero digit in enc.stateBuf[0] +// (0=false, 1=true). + +// integer-part //////////////////////////////////////////////////////////////// +func (enc *ReEncoder) stateNumberA(c rune) error { // start + switch c { + case '-': + enc.replaceState(enc.stateNumberB, true) + case '0': + enc.replaceState(enc.stateNumberC, true) + case '1', '2', '3', '4', '5', '6', '7', '8', '9': + enc.replaceState(enc.stateNumberD, true) + default: + return &SyntaxError{fmt.Sprintf("number: unexpected character: %c", c), enc.inputPos} + } + return enc.emitByte(byte(c)) +} +func (enc *ReEncoder) stateNumberB(c rune) error { // got a leading "-" + switch c { + case '0': + enc.replaceState(enc.stateNumberC, true) + case '1', '2', '3', '4', '5', '6', '7', '8', '9': + enc.replaceState(enc.stateNumberD, true) + default: + return &SyntaxError{fmt.Sprintf("number: unexpected character: %c", c), enc.inputPos} + } + return enc.emitByte(byte(c)) +} +func (enc *ReEncoder) stateNumberC(c rune) error { // ready for the fraction or exponent part to start + switch c { + case '.': + enc.replaceState(enc.stateNumberE, true) + return enc.emitByte('.') + case 'e', 'E': + enc.replaceState(enc.stateNumberG, true) + enc.stateBuf = append(enc.stateBuf[:0], 0) + return enc.emitByte('e') + default: + enc.popState() + return enc.state(c) + } +} +func (enc *ReEncoder) stateNumberD(c rune) error { // in the integer part + switch c { + case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': + return enc.emitByte(byte(c)) + case '.': + enc.replaceState(enc.stateNumberE, true) + return enc.emitByte('.') + case 'e', 'E': + enc.replaceState(enc.stateNumberG, true) + enc.stateBuf = append(enc.stateBuf[:0], 0) + return enc.emitByte('e') + default: + enc.popState() + return enc.state(c) + } +} + +// fraction-part /////////////////////////////////////////////////////////////// +func (enc *ReEncoder) stateNumberE(c rune) error { // got a ".", ready to read a number for the fraction part + switch c { + case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': + enc.replaceState(enc.stateNumberF, true) + return enc.emitByte(byte(c)) + default: + return &SyntaxError{fmt.Sprintf("number: unexpected character: %c", c), enc.inputPos} + } +} +func (enc *ReEncoder) stateNumberF(c rune) error { // in the fraction part + switch c { + case '0': + if len(enc.stateBuf) > 0 && enc.stateBuf[len(enc.stateBuf)-1] < 255 { + enc.stateBuf[len(enc.stateBuf)-1]++ + } else { + enc.stateBuf = append(enc.stateBuf, 1) + } + return nil + case '1', '2', '3', '4', '5', '6', '7', '8', '9': + for len(enc.stateBuf) > 0 { + if err := enc.emitByte('0'); err != nil { + return err + } + if enc.stateBuf[len(enc.stateBuf)-1] == 1 { + enc.stateBuf = enc.stateBuf[:len(enc.stateBuf)-1] + } else { + enc.stateBuf[len(enc.stateBuf)-1]-- + } + } + return enc.emitByte(byte(c)) + case 'e', 'E': + enc.replaceState(enc.stateNumberG, true) + enc.stateBuf = append(enc.stateBuf[:0], 0) + return enc.emitByte('e') + default: + enc.stateBuf = enc.stateBuf[:0] + enc.popState() + return enc.state(c) + } +} + +// exponent-part /////////////////////////////////////////////////////////////// +func (enc *ReEncoder) stateNumberG(c rune) error { // got a leading "e" + switch c { + case '-', '+': + enc.replaceState(enc.stateNumberH, true) + return enc.emitByte(byte(c)) + case '0': + enc.replaceState(enc.stateNumberH, true) + return nil + case '1', '2', '3', '4', '5', '6', '7', '8', '9': + enc.replaceState(enc.stateNumberH, true) + enc.stateBuf[0] = 1 + return enc.emitByte(byte(c)) + default: + enc.stateBuf = enc.stateBuf[:0] + return &SyntaxError{fmt.Sprintf("number: unexpected character: %c", c), enc.inputPos} + } +} +func (enc *ReEncoder) stateNumberH(c rune) error { // in the exponent's number part + switch c { + case '0': + if enc.stateBuf[0] == 0 { + return nil + } + return enc.emitByte('0') + case '1', '2', '3', '4', '5', '6', '7', '8', '9': + enc.stateBuf[0] = 1 + return enc.emitByte(byte(c)) + default: + if enc.stateBuf[0] == 0 { + if err := enc.emitByte('0'); err != nil { + return err + } + } + enc.stateBuf = enc.stateBuf[:0] + enc.popState() + return enc.state(c) + } +} + +// literals //////////////////////////////////////////////////////////////////////////////////////// + +func (enc *ReEncoder) stateInTrue(c rune) error { return enc._stateInLiteral(c, "true") } +func (enc *ReEncoder) stateInFalse(c rune) error { return enc._stateInLiteral(c, "false") } +func (enc *ReEncoder) stateInNull(c rune) error { return enc._stateInLiteral(c, "null") } +func (enc *ReEncoder) _stateInLiteral(c rune, full string) error { + if c != rune(full[len(enc.stateBuf)]) { + return &SyntaxError{fmt.Sprintf("%s: unexpected character: %c", full, c), enc.inputPos} + } + enc.stateBuf = append(enc.stateBuf, byte(c)) + if len(enc.stateBuf) == len(full) { + enc.stateBuf = enc.stateBuf[:0] + enc.popState() + } + return enc.emitByte(byte(c)) +} -- cgit v1.2.3-54-g00ecf