diff options
Diffstat (limited to 'src/libsystemd-terminal')
-rw-r--r-- | src/libsystemd-terminal/term-charset.c | 491 | ||||
-rw-r--r-- | src/libsystemd-terminal/term-internal.h | 349 | ||||
-rw-r--r-- | src/libsystemd-terminal/term-parser.c | 1626 | ||||
-rw-r--r-- | src/libsystemd-terminal/test-term-parser.c | 143 |
4 files changed, 2609 insertions, 0 deletions
diff --git a/src/libsystemd-terminal/term-charset.c b/src/libsystemd-terminal/term-charset.c new file mode 100644 index 0000000000..a00a1912da --- /dev/null +++ b/src/libsystemd-terminal/term-charset.c @@ -0,0 +1,491 @@ +/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/ + +/*** + This file is part of systemd. + + Copyright (C) 2014 David Herrmann <dh.herrmann@gmail.com> + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see <http://www.gnu.org/licenses/>. +***/ + +/* + * VTE Character Sets + * These are predefined charactersets that can be loaded into GL and GR. By + * default we use unicode_lower and unicode_upper, that is, both sets have the + * exact unicode mapping. unicode_lower is effectively ASCII and unicode_upper + * as defined by the unicode standard (I guess, ISO 8859-1). + * Several other character sets are defined here. However, all of them are + * limited to the 96 character space of GL or GR. Everything beyond GR (which + * was not supported by the classic VTs by DEC but is available in VT emulators + * that support unicode/UTF8) is always mapped to unicode and cannot be changed + * by these character sets. Even mapping GL and GR is only available for + * backwards compatibility as new applications can use the Unicode functionality + * of the VTE. + * + * Moreover, mapping GR is almost unnecessary to support. 
In fact, Unicode UTF-8 + * support in VTE works by reading every incoming data as UTF-8 stream. This + * maps GL/ASCII to ASCII, as UTF-8 is backwards compatible to ASCII, however, + * everything that has the 8th bit set is a >=2-byte character in UTF-8. That is, + * this is in no way backwards compatible to >=VT220 8bit support. Therefore, if + * someone maps a character set into GR and wants to use them with this VTE, + * then they must already send UTF-8 characters to use GR (all GR characters are + * 8-bits). Hence, they can easily also send the correct UTF-8 character for the + * unicode mapping. + * The only advantage is that most characters in many sets are 3-byte UTF-8 + * characters and by mapping the set into GR/GL you can use 2 or 1 byte UTF-8 + * characters which saves bandwidth. + * Another reason is, if you have older applications that use the VT220 8-bit + * support and you put an ASCII/8bit-extension to UTF-8 converter in between, you + * need these mappings to have the application behave correctly if it uses GL/GR + * mappings extensively. + * + * Anyway, we support GL/GR mappings so here are the most commonly used maps as + * defined by Unicode-standard, DEC-private maps and other famous charmaps. + * + * Characters 1-32 are always the control characters (part of CL) and cannot be + * mapped. Characters 34-127 (94 characters) are part of GL and can be mapped. + * Characters 33 and 128 are not part of GL and always mapped by the VTE. + * However, for GR they can be mapped differently (96 chars) so we have to + * include them. The mapper has to take care not to use them in GL. + */ + +#include <errno.h> +#include <stdlib.h> +#include <string.h> +#include "term-internal.h" + +/* + * Lower Unicode character set. This maps the characters to the basic ASCII + * characters 33-126. These are all graphics characters defined in ASCII. 
+ */ +term_charset term_unicode_lower = { + [0] = 32, + [1] = 33, + [2] = 34, + [3] = 35, + [4] = 36, + [5] = 37, + [6] = 38, + [7] = 39, + [8] = 40, + [9] = 41, + [10] = 42, + [11] = 43, + [12] = 44, + [13] = 45, + [14] = 46, + [15] = 47, + [16] = 48, + [17] = 49, + [18] = 50, + [19] = 51, + [20] = 52, + [21] = 53, + [22] = 54, + [23] = 55, + [24] = 56, + [25] = 57, + [26] = 58, + [27] = 59, + [28] = 60, + [29] = 61, + [30] = 62, + [31] = 63, + [32] = 64, + [33] = 65, + [34] = 66, + [35] = 67, + [36] = 68, + [37] = 69, + [38] = 70, + [39] = 71, + [40] = 72, + [41] = 73, + [42] = 74, + [43] = 75, + [44] = 76, + [45] = 77, + [46] = 78, + [47] = 79, + [48] = 80, + [49] = 81, + [50] = 82, + [51] = 83, + [52] = 84, + [53] = 85, + [54] = 86, + [55] = 87, + [56] = 88, + [57] = 89, + [58] = 90, + [59] = 91, + [60] = 92, + [61] = 93, + [62] = 94, + [63] = 95, + [64] = 96, + [65] = 97, + [66] = 98, + [67] = 99, + [68] = 100, + [69] = 101, + [70] = 102, + [71] = 103, + [72] = 104, + [73] = 105, + [74] = 106, + [75] = 107, + [76] = 108, + [77] = 109, + [78] = 110, + [79] = 111, + [80] = 112, + [81] = 113, + [82] = 114, + [83] = 115, + [84] = 116, + [85] = 117, + [86] = 118, + [87] = 119, + [88] = 120, + [89] = 121, + [90] = 122, + [91] = 123, + [92] = 124, + [93] = 125, + [94] = 126, + [95] = 127, +}; + +/* + * Upper Unicode Table + * This maps all characters to the upper unicode characters 161-254. These are + * not compatible to any older 8 bit character sets. See the Unicode standard + * for the definitions of each symbol. 
+ */ +term_charset term_unicode_upper = { + [0] = 160, + [1] = 161, + [2] = 162, + [3] = 163, + [4] = 164, + [5] = 165, + [6] = 166, + [7] = 167, + [8] = 168, + [9] = 169, + [10] = 170, + [11] = 171, + [12] = 172, + [13] = 173, + [14] = 174, + [15] = 175, + [16] = 176, + [17] = 177, + [18] = 178, + [19] = 179, + [20] = 180, + [21] = 181, + [22] = 182, + [23] = 183, + [24] = 184, + [25] = 185, + [26] = 186, + [27] = 187, + [28] = 188, + [29] = 189, + [30] = 190, + [31] = 191, + [32] = 192, + [33] = 193, + [34] = 194, + [35] = 195, + [36] = 196, + [37] = 197, + [38] = 198, + [39] = 199, + [40] = 200, + [41] = 201, + [42] = 202, + [43] = 203, + [44] = 204, + [45] = 205, + [46] = 206, + [47] = 207, + [48] = 208, + [49] = 209, + [50] = 210, + [51] = 211, + [52] = 212, + [53] = 213, + [54] = 214, + [55] = 215, + [56] = 216, + [57] = 217, + [58] = 218, + [59] = 219, + [60] = 220, + [61] = 221, + [62] = 222, + [63] = 223, + [64] = 224, + [65] = 225, + [66] = 226, + [67] = 227, + [68] = 228, + [69] = 229, + [70] = 230, + [71] = 231, + [72] = 232, + [73] = 233, + [74] = 234, + [75] = 235, + [76] = 236, + [77] = 237, + [78] = 238, + [79] = 239, + [80] = 240, + [81] = 241, + [82] = 242, + [83] = 243, + [84] = 244, + [85] = 245, + [86] = 246, + [87] = 247, + [88] = 248, + [89] = 249, + [90] = 250, + [91] = 251, + [92] = 252, + [93] = 253, + [94] = 254, + [95] = 255, +}; + +/* + * The DEC supplemental graphics set. For its definition see here: + * http://vt100.net/docs/vt220-rm/table2-3b.html + * It's basically a mixture of common European symbols that are not part of + * ASCII. Most often, this is mapped into GR to extend the basic ASCII part. + * + * This is very similar to unicode_upper, however, few symbols differ so do not + * mix them up! 
+ */ +term_charset term_dec_supplemental_graphics = { + [0] = -1, /* undefined */ + [1] = 161, + [2] = 162, + [3] = 163, + [4] = 0, + [5] = 165, + [6] = 0, + [7] = 167, + [8] = 164, + [9] = 169, + [10] = 170, + [11] = 171, + [12] = 0, + [13] = 0, + [14] = 0, + [15] = 0, + [16] = 176, + [17] = 177, + [18] = 178, + [19] = 179, + [20] = 0, + [21] = 181, + [22] = 182, + [23] = 183, + [24] = 0, + [25] = 185, + [26] = 186, + [27] = 187, + [28] = 188, + [29] = 189, + [30] = 0, + [31] = 191, + [32] = 192, + [33] = 193, + [34] = 194, + [35] = 195, + [36] = 196, + [37] = 197, + [38] = 198, + [39] = 199, + [40] = 200, + [41] = 201, + [42] = 202, + [43] = 203, + [44] = 204, + [45] = 205, + [46] = 206, + [47] = 207, + [48] = 0, + [49] = 209, + [50] = 210, + [51] = 211, + [52] = 212, + [53] = 213, + [54] = 214, + [55] = 338, + [56] = 216, + [57] = 217, + [58] = 218, + [59] = 219, + [60] = 220, + [61] = 376, + [62] = 0, + [63] = 223, + [64] = 224, + [65] = 225, + [66] = 226, + [67] = 227, + [68] = 228, + [69] = 229, + [70] = 230, + [71] = 231, + [72] = 232, + [73] = 233, + [74] = 234, + [75] = 235, + [76] = 236, + [77] = 237, + [78] = 238, + [79] = 239, + [80] = 0, + [81] = 241, + [82] = 242, + [83] = 243, + [84] = 244, + [85] = 245, + [86] = 246, + [87] = 339, + [88] = 248, + [89] = 249, + [90] = 250, + [91] = 251, + [92] = 252, + [93] = 255, + [94] = 0, + [95] = -1, /* undefined */ +}; + +/* + * DEC special graphics character set. See here for its definition: + * http://vt100.net/docs/vt220-rm/table2-4.html + * This contains several characters to create ASCII drawings and similar. It's + * commonly mapped into GR to extend the basic ASCII characters. + * + * Lower 62 characters map to ASCII 33-94, everything beyond is special and + * commonly used for ASCII drawings. It depends on the Unicode Standard 3.2 for + * the extended horizontal scan-line characters 3, 5, 7, and 9. 
+ */ +term_charset term_dec_special_graphics = { + [0] = -1, /* undefined */ + [1] = 33, + [2] = 34, + [3] = 35, + [4] = 36, + [5] = 37, + [6] = 38, + [7] = 39, + [8] = 40, + [9] = 41, + [10] = 42, + [11] = 43, + [12] = 44, + [13] = 45, + [14] = 46, + [15] = 47, + [16] = 48, + [17] = 49, + [18] = 50, + [19] = 51, + [20] = 52, + [21] = 53, + [22] = 54, + [23] = 55, + [24] = 56, + [25] = 57, + [26] = 58, + [27] = 59, + [28] = 60, + [29] = 61, + [30] = 62, + [31] = 63, + [32] = 64, + [33] = 65, + [34] = 66, + [35] = 67, + [36] = 68, + [37] = 69, + [38] = 70, + [39] = 71, + [40] = 72, + [41] = 73, + [42] = 74, + [43] = 75, + [44] = 76, + [45] = 77, + [46] = 78, + [47] = 79, + [48] = 80, + [49] = 81, + [50] = 82, + [51] = 83, + [52] = 84, + [53] = 85, + [54] = 86, + [55] = 87, + [56] = 88, + [57] = 89, + [58] = 90, + [59] = 91, + [60] = 92, + [61] = 93, + [62] = 94, + [63] = 0, + [64] = 9830, + [65] = 9618, + [66] = 9225, + [67] = 9228, + [68] = 9229, + [69] = 9226, + [70] = 176, + [71] = 177, + [72] = 9252, + [73] = 9227, + [74] = 9496, + [75] = 9488, + [76] = 9484, + [77] = 9492, + [78] = 9532, + [79] = 9146, + [80] = 9147, + [81] = 9472, + [82] = 9148, + [83] = 9149, + [84] = 9500, + [85] = 9508, + [86] = 9524, + [87] = 9516, + [88] = 9474, + [89] = 8804, + [90] = 8805, + [91] = 960, + [92] = 8800, + [93] = 163, + [94] = 8901, + [95] = -1, /* undefined */ +}; diff --git a/src/libsystemd-terminal/term-internal.h b/src/libsystemd-terminal/term-internal.h index d7d2f98b35..a3d1f5458b 100644 --- a/src/libsystemd-terminal/term-internal.h +++ b/src/libsystemd-terminal/term-internal.h @@ -37,6 +37,11 @@ typedef struct term_line term_line; typedef struct term_page term_page; typedef struct term_history term_history; +typedef struct term_utf8 term_utf8; +typedef struct term_seq term_seq; +typedef struct term_parser term_parser; +typedef uint32_t term_charset[96]; + /* * Miscellaneous * Sundry things and external helpers. 
@@ -335,3 +340,347 @@ void term_history_trim(term_history *history, unsigned int max); void term_history_push(term_history *history, term_line *line); term_line *term_history_pop(term_history *history, unsigned int reserve_width, const term_attr *attr, term_age_t age); unsigned int term_history_peek(term_history *history, unsigned int max, unsigned int reserve_width, const term_attr *attr, term_age_t age); + +/* + * UTF-8 + * The UTF-decoder and encoder are adjusted for terminals and provide proper + * fallbacks for invalid UTF-8. In terminals it's quite usual to use fallbacks + * instead of rejecting invalid input. This way, old legacy applications still + * work (this is especially important for 7bit/ASCII DEC modes). + */ + +struct term_utf8 { + uint32_t chars[5]; + uint32_t ucs4; + + unsigned int i_bytes : 3; + unsigned int n_bytes : 3; + unsigned int valid : 1; +}; + +size_t term_utf8_encode(char *out_utf8, uint32_t g); +const uint32_t *term_utf8_decode(term_utf8 *p, size_t *out_len, char c); + +/* + * Parsers + * The term_parser object parses control-sequences for both host and terminal + * side. Based on this parser, there is a set of command-parsers that take a + * term_seq sequence and returns the command it represents. This is different + * for host and terminal side so a different set of parsers is provided. + */ + +enum { + TERM_SEQ_NONE, /* placeholder, no sequence parsed */ + + TERM_SEQ_IGNORE, /* no-op character */ + TERM_SEQ_GRAPHIC, /* graphic character */ + TERM_SEQ_CONTROL, /* control character */ + TERM_SEQ_ESCAPE, /* escape sequence */ + TERM_SEQ_CSI, /* control sequence function */ + TERM_SEQ_DCS, /* device control string */ + TERM_SEQ_OSC, /* operating system control */ + + TERM_SEQ_CNT +}; + +enum { + /* these must be kept compatible to (1U << (ch - 0x20)) */ + + TERM_SEQ_FLAG_SPACE = (1U << 0), /* char: */ + TERM_SEQ_FLAG_BANG = (1U << 1), /* char: ! 
*/ + TERM_SEQ_FLAG_DQUOTE = (1U << 2), /* char: " */ + TERM_SEQ_FLAG_HASH = (1U << 3), /* char: # */ + TERM_SEQ_FLAG_CASH = (1U << 4), /* char: $ */ + TERM_SEQ_FLAG_PERCENT = (1U << 5), /* char: % */ + TERM_SEQ_FLAG_AND = (1U << 6), /* char: & */ + TERM_SEQ_FLAG_SQUOTE = (1U << 7), /* char: ' */ + TERM_SEQ_FLAG_POPEN = (1U << 8), /* char: ( */ + TERM_SEQ_FLAG_PCLOSE = (1U << 9), /* char: ) */ + TERM_SEQ_FLAG_MULT = (1U << 10), /* char: * */ + TERM_SEQ_FLAG_PLUS = (1U << 11), /* char: + */ + TERM_SEQ_FLAG_COMMA = (1U << 12), /* char: , */ + TERM_SEQ_FLAG_MINUS = (1U << 13), /* char: - */ + TERM_SEQ_FLAG_DOT = (1U << 14), /* char: . */ + TERM_SEQ_FLAG_SLASH = (1U << 15), /* char: / */ + + /* 16-25 is reserved for numbers ('0' - 0x20 == 16 up to '9' - 0x20 == 25); unused */ + + /* COLON is reserved = (1U << 26), char: : */ + /* SEMICOLON is reserved = (1U << 27), char: ; */ + TERM_SEQ_FLAG_LT = (1U << 28), /* char: < */ + TERM_SEQ_FLAG_EQUAL = (1U << 29), /* char: = */ + TERM_SEQ_FLAG_GT = (1U << 30), /* char: > */ + TERM_SEQ_FLAG_WHAT = (1U << 31), /* char: ? 
*/ +}; + +enum { + TERM_CMD_NONE, /* placeholder */ + TERM_CMD_GRAPHIC, /* graphics character */ + + TERM_CMD_BEL, /* bell */ + TERM_CMD_BS, /* backspace */ + TERM_CMD_CBT, /* cursor-backward-tabulation */ + TERM_CMD_CHA, /* cursor-horizontal-absolute */ + TERM_CMD_CHT, /* cursor-horizontal-forward-tabulation */ + TERM_CMD_CNL, /* cursor-next-line */ + TERM_CMD_CPL, /* cursor-previous-line */ + TERM_CMD_CR, /* carriage-return */ + TERM_CMD_CUB, /* cursor-backward */ + TERM_CMD_CUD, /* cursor-down */ + TERM_CMD_CUF, /* cursor-forward */ + TERM_CMD_CUP, /* cursor-position */ + TERM_CMD_CUU, /* cursor-up */ + TERM_CMD_DA1, /* primary-device-attributes */ + TERM_CMD_DA2, /* secondary-device-attributes */ + TERM_CMD_DA3, /* tertiary-device-attributes */ + TERM_CMD_DC1, /* device-control-1 */ + TERM_CMD_DC3, /* device-control-3 */ + TERM_CMD_DCH, /* delete-character */ + TERM_CMD_DECALN, /* screen-alignment-pattern */ + TERM_CMD_DECANM, /* ansi-mode */ + TERM_CMD_DECBI, /* back-index */ + TERM_CMD_DECCARA, /* change-attributes-in-rectangular-area */ + TERM_CMD_DECCRA, /* copy-rectangular-area */ + TERM_CMD_DECDC, /* delete-column */ + TERM_CMD_DECDHL_BH, /* double-width-double-height-line: bottom half */ + TERM_CMD_DECDHL_TH, /* double-width-double-height-line: top half */ + TERM_CMD_DECDWL, /* double-width-single-height-line */ + TERM_CMD_DECEFR, + TERM_CMD_DECELF, + TERM_CMD_DECELR, + TERM_CMD_DECERA, + TERM_CMD_DECFI, + TERM_CMD_DECFRA, + TERM_CMD_DECIC, + TERM_CMD_DECID, + TERM_CMD_DECINVM, + TERM_CMD_DECKBD, + TERM_CMD_DECKPAM, + TERM_CMD_DECKPNM, + TERM_CMD_DECLFKC, + TERM_CMD_DECLL, + TERM_CMD_DECLTOD, + TERM_CMD_DECPCTERM, + TERM_CMD_DECPKA, + TERM_CMD_DECPKFMR, + TERM_CMD_DECRARA, + TERM_CMD_DECRC, + TERM_CMD_DECREQTPARM, + TERM_CMD_DECRPKT, + TERM_CMD_DECRQCRA, + TERM_CMD_DECRQDE, + TERM_CMD_DECRQKT, + TERM_CMD_DECRQLP, + TERM_CMD_DECRQM_ANSI, + TERM_CMD_DECRQM_DEC, + TERM_CMD_DECRQPKFM, + TERM_CMD_DECRQPSR, + TERM_CMD_DECRQTSR, + TERM_CMD_DECRQUPSS, + 
TERM_CMD_DECSACE, + TERM_CMD_DECSASD, + TERM_CMD_DECSC, + TERM_CMD_DECSCA, + TERM_CMD_DECSCL, + TERM_CMD_DECSCP, + TERM_CMD_DECSCPP, + TERM_CMD_DECSCS, + TERM_CMD_DECSCUSR, + TERM_CMD_DECSDDT, + TERM_CMD_DECSDPT, + TERM_CMD_DECSED, + TERM_CMD_DECSEL, + TERM_CMD_DECSERA, + TERM_CMD_DECSFC, + TERM_CMD_DECSKCV, + TERM_CMD_DECSLCK, + TERM_CMD_DECSLE, + TERM_CMD_DECSLPP, + TERM_CMD_DECSLRM_OR_SC, + TERM_CMD_DECSMBV, + TERM_CMD_DECSMKR, + TERM_CMD_DECSNLS, + TERM_CMD_DECSPP, + TERM_CMD_DECSPPCS, + TERM_CMD_DECSPRTT, + TERM_CMD_DECSR, + TERM_CMD_DECSRFR, + TERM_CMD_DECSSCLS, + TERM_CMD_DECSSDT, + TERM_CMD_DECSSL, + TERM_CMD_DECST8C, + TERM_CMD_DECSTBM, + TERM_CMD_DECSTR, + TERM_CMD_DECSTRL, + TERM_CMD_DECSWBV, + TERM_CMD_DECSWL, + TERM_CMD_DECTID, + TERM_CMD_DECTME, + TERM_CMD_DECTST, + TERM_CMD_DL, + TERM_CMD_DSR_ANSI, + TERM_CMD_DSR_DEC, + TERM_CMD_ECH, + TERM_CMD_ED, + TERM_CMD_EL, + TERM_CMD_ENQ, + TERM_CMD_EPA, + TERM_CMD_FF, + TERM_CMD_HPA, + TERM_CMD_HPR, + TERM_CMD_HT, + TERM_CMD_HTS, + TERM_CMD_HVP, + TERM_CMD_ICH, + TERM_CMD_IL, + TERM_CMD_IND, + TERM_CMD_LF, + TERM_CMD_LS1R, + TERM_CMD_LS2, + TERM_CMD_LS2R, + TERM_CMD_LS3, + TERM_CMD_LS3R, + TERM_CMD_MC_ANSI, + TERM_CMD_MC_DEC, + TERM_CMD_NEL, + TERM_CMD_NP, + TERM_CMD_NULL, + TERM_CMD_PP, + TERM_CMD_PPA, + TERM_CMD_PPB, + TERM_CMD_PPR, + TERM_CMD_RC, + TERM_CMD_REP, + TERM_CMD_RI, + TERM_CMD_RIS, + TERM_CMD_RM_ANSI, + TERM_CMD_RM_DEC, + TERM_CMD_S7C1T, + TERM_CMD_S8C1T, + TERM_CMD_SCS, + TERM_CMD_SD, + TERM_CMD_SGR, + TERM_CMD_SI, + TERM_CMD_SM_ANSI, + TERM_CMD_SM_DEC, + TERM_CMD_SO, + TERM_CMD_SPA, + TERM_CMD_SS2, + TERM_CMD_SS3, + TERM_CMD_ST, + TERM_CMD_SU, + TERM_CMD_SUB, + TERM_CMD_TBC, + TERM_CMD_VPA, + TERM_CMD_VPR, + TERM_CMD_VT, + TERM_CMD_XTERM_CLLHP, /* xterm-cursor-lower-left-hp-bugfix */ + TERM_CMD_XTERM_IHMT, /* xterm-initiate-highlight-mouse-tracking*/ + TERM_CMD_XTERM_MLHP, /* xterm-memory-lock-hp-bugfix */ + TERM_CMD_XTERM_MUHP, /* xterm-memory-unlock-hp-bugfix */ + TERM_CMD_XTERM_RPM, /* 
xterm-restore-private-mode */ + TERM_CMD_XTERM_RRV, /* xterm-reset-resource-value */ + TERM_CMD_XTERM_RTM, /* xterm-reset-title-mode */ + TERM_CMD_XTERM_SACL1, /* xterm-set-ansi-conformance-level-1 */ + TERM_CMD_XTERM_SACL2, /* xterm-set-ansi-conformance-level-2 */ + TERM_CMD_XTERM_SACL3, /* xterm-set-ansi-conformance-level-3 */ + TERM_CMD_XTERM_SDCS, /* xterm-set-default-character-set */ + TERM_CMD_XTERM_SGFX, /* xterm-sixel-graphics */ + TERM_CMD_XTERM_SPM, /* xterm-set-private-mode */ + TERM_CMD_XTERM_SRV, /* xterm-set-resource-value */ + TERM_CMD_XTERM_STM, /* xterm-set-title-mode */ + TERM_CMD_XTERM_SUCS, /* xterm-set-utf8-character-set */ + TERM_CMD_XTERM_WM, /* xterm-window-management */ + + TERM_CMD_CNT +}; + +enum { + /* + * Charsets: DEC marks charsets according to "Digital Equ. Corp.". + * NRCS marks charsets according to the "National Replacement + * Character Sets". ISO marks charsets according to ISO-8859. + * The USERDEF charset is special and can be modified by the host. 
+ */ + + TERM_CHARSET_NONE, + + /* 96-compat charsets */ + TERM_CHARSET_ISO_LATIN1_SUPPLEMENTAL, + TERM_CHARSET_BRITISH_NRCS = TERM_CHARSET_ISO_LATIN1_SUPPLEMENTAL, + TERM_CHARSET_ISO_LATIN2_SUPPLEMENTAL, + TERM_CHARSET_AMERICAN_NRCS = TERM_CHARSET_ISO_LATIN2_SUPPLEMENTAL, + TERM_CHARSET_ISO_LATIN5_SUPPLEMENTAL, + TERM_CHARSET_ISO_GREEK_SUPPLEMENTAL, + TERM_CHARSET_ISO_HEBREW_SUPPLEMENTAL, + TERM_CHARSET_ISO_LATIN_CYRILLIC, + + TERM_CHARSET_96_CNT, + + /* 94-compat charsets */ + TERM_CHARSET_DEC_SPECIAL_GRAPHIC = TERM_CHARSET_96_CNT, + TERM_CHARSET_DEC_SUPPLEMENTAL, + TERM_CHARSET_DEC_TECHNICAL, + TERM_CHARSET_CYRILLIC_DEC, + TERM_CHARSET_DUTCH_NRCS, + TERM_CHARSET_FINNISH_NRCS, + TERM_CHARSET_FRENCH_NRCS, + TERM_CHARSET_FRENCH_CANADIAN_NRCS, + TERM_CHARSET_GERMAN_NRCS, + TERM_CHARSET_GREEK_DEC, + TERM_CHARSET_GREEK_NRCS, + TERM_CHARSET_HEBREW_DEC, + TERM_CHARSET_HEBREW_NRCS, + TERM_CHARSET_ITALIAN_NRCS, + TERM_CHARSET_NORWEGIAN_DANISH_NRCS, + TERM_CHARSET_PORTUGUESE_NRCS, + TERM_CHARSET_RUSSIAN_NRCS, + TERM_CHARSET_SCS_NRCS, + TERM_CHARSET_SPANISH_NRCS, + TERM_CHARSET_SWEDISH_NRCS, + TERM_CHARSET_SWISS_NRCS, + TERM_CHARSET_TURKISH_DEC, + TERM_CHARSET_TURKISH_NRCS, + + TERM_CHARSET_94_CNT, + + /* special charsets */ + TERM_CHARSET_USERPREF_SUPPLEMENTAL = TERM_CHARSET_94_CNT, + + TERM_CHARSET_CNT, +}; + +extern term_charset term_unicode_lower; +extern term_charset term_unicode_upper; +extern term_charset term_dec_supplemental_graphics; +extern term_charset term_dec_special_graphics; + +#define TERM_PARSER_ARG_MAX (16) +#define TERM_PARSER_ST_MAX (4096) + +struct term_seq { + unsigned int type; + unsigned int command; + uint32_t terminator; + unsigned int intermediates; + unsigned int charset; + unsigned int n_args; + int args[TERM_PARSER_ARG_MAX]; + unsigned int n_st; + char *st; +}; + +struct term_parser { + term_seq seq; + size_t st_alloc; + unsigned int state; + + bool is_host : 1; +}; + +int term_parser_new(term_parser **out, bool host); +term_parser 
*term_parser_free(term_parser *parser); +int term_parser_feed(term_parser *parser, const term_seq **seq_out, uint32_t raw); + +#define _term_parser_free_ _cleanup_(term_parser_freep) +DEFINE_TRIVIAL_CLEANUP_FUNC(term_parser*, term_parser_free); diff --git a/src/libsystemd-terminal/term-parser.c b/src/libsystemd-terminal/term-parser.c new file mode 100644 index 0000000000..1c968520bd --- /dev/null +++ b/src/libsystemd-terminal/term-parser.c @@ -0,0 +1,1626 @@ +/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/ + +/*** + This file is part of systemd. + + Copyright (C) 2014 David Herrmann <dh.herrmann@gmail.com> + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see <http://www.gnu.org/licenses/>. +***/ + +/* + * Terminal Parser + * This file contains a bunch of UTF-8 helpers and the main ctlseq-parser. The + * parser is a simple state-machine that correctly parses all CSI, DCS, OSC, ST + * control sequences and generic escape sequences. + * The parser itself does not perform any actions but lets the caller react to + * detected sequences. 
+ */ + +#include <stdbool.h> +#include <stdint.h> +#include <stdlib.h> +#include "macro.h" +#include "term-internal.h" +#include "util.h" + +/** + * term_utf8_encode() - Encode single UCS-4 character as UTF-8 + * @out_utf8: output buffer of at least 4 bytes or NULL + * @g: UCS-4 character to encode + * + * This encodes a single UCS-4 character as UTF-8 and writes it into @out_utf8. + * The length of the character is returned. It is not zero-terminated! If the + * output buffer is NULL, only the length is returned. + * + * Only the magnitude of @g is inspected: surrogate code points and values + * above U+10FFFF that still fit into 21 bits are encoded like any other value. + * + * Returns: The length in bytes (1-4) that the UTF-8 representation does or would + * occupy, or 0 if @g does not fit into 21 bits (nothing is written then). + */ +size_t term_utf8_encode(char *out_utf8, uint32_t g) { + if (g < (1 << 7)) { /* up to 7 bits: 0xxxxxxx */ + if (out_utf8) + out_utf8[0] = g & 0x7f; + return 1; + } else if (g < (1 << 11)) { /* up to 11 bits: 110xxxxx 10xxxxxx */ + if (out_utf8) { + out_utf8[0] = 0xc0 | ((g >> 6) & 0x1f); + out_utf8[1] = 0x80 | (g & 0x3f); + } + return 2; + } else if (g < (1 << 16)) { /* up to 16 bits: 1110xxxx 10xxxxxx 10xxxxxx */ + if (out_utf8) { + out_utf8[0] = 0xe0 | ((g >> 12) & 0x0f); + out_utf8[1] = 0x80 | ((g >> 6) & 0x3f); + out_utf8[2] = 0x80 | (g & 0x3f); + } + return 3; + } else if (g < (1 << 21)) { /* up to 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ + if (out_utf8) { + out_utf8[0] = 0xf0 | ((g >> 18) & 0x07); + out_utf8[1] = 0x80 | ((g >> 12) & 0x3f); + out_utf8[2] = 0x80 | ((g >> 6) & 0x3f); + out_utf8[3] = 0x80 | (g & 0x3f); + } + return 4; + } else { /* more than 21 bits: not representable in 4 bytes, report length 0 */ + return 0; + } +} + +/** + * term_utf8_decode() - Try decoding the next UCS-4 character + * @p: decoder object to operate on or NULL + * @out_len: output buffer for length of decoded UCS-4 string or NULL + * @c: next char to push into decoder + * + * This decodes a UTF-8 stream. It must be called for each input-byte of the + * UTF-8 stream and returns a UCS-4 stream. The length of the returned UCS-4 + * string (number of parsed characters) is stored in @out_len if non-NULL. A + * pointer to the string is returned (or NULL if none was parsed). The string + * is not zero-terminated! Furthermore, the string is only valid until the next + * invocation of this function. 
It is also bound to the parser-state @p. + * + * This function is highly optimized to work with terminal-emulators. Instead + * of being strict about UTF-8 validity, this tries to perform a fallback to + * ISO-8859-1 in case a wrong series was detected. Therefore, this function + * might return multiple UCS-4 characters by parsing just a single UTF-8 byte. + * + * The parser state @p should be allocated and managed by the caller. There're + * no helpers to do that for you. To initialize it, simply reset it to all + * zero. You can reset or free the object at any point in time. + * + * Returns: Pointer to the UCS-4 string or NULL. + */ +const uint32_t *term_utf8_decode(term_utf8 *p, size_t *out_len, char c) { + uint32_t t, *res = NULL; + uint8_t byte; + size_t len = 0; + + if (!p) + goto out; + + byte = c; /* take the unsigned value so the mask tests below do not depend on the signedness of char */ + + if (!p->valid || p->i_bytes >= p->n_bytes) { + /* + * If the previous sequence was invalid or fully parsed, start + * parsing a fresh new sequence. + */ + + if ((byte & 0xE0) == 0xC0) { + /* start of two byte sequence */ + t = byte & 0x1F; + p->n_bytes = 2; + p->i_bytes = 1; + p->valid = 1; + } else if ((byte & 0xF0) == 0xE0) { + /* start of three byte sequence */ + t = byte & 0x0F; + p->n_bytes = 3; + p->i_bytes = 1; + p->valid = 1; + } else if ((byte & 0xF8) == 0xF0) { + /* start of four byte sequence */ + t = byte & 0x07; + p->n_bytes = 4; + p->i_bytes = 1; + p->valid = 1; + } else { + /* Either of: + * - single ASCII 7-bit char + * - out-of-sync continuation byte + * - overlong encoding + * i.e. every byte not matched by one of the three lead-byte + * masks above (0x00-0xBF and 0xF8-0xFF). + * All of them are treated as single byte ISO-8859-1 */ + t = byte; + p->n_bytes = 1; + p->i_bytes = 1; + p->valid = 0; + } + + p->chars[0] = byte; + p->ucs4 = t << (6 * (p->n_bytes - p->i_bytes)); /* every outstanding continuation byte adds 6 bits below the lead-byte payload; the shift is 0 in the single-byte case */ + } else { + /* + * ..otherwise, try to continue the previous sequence.. + */ + + if ((byte & 0xC0) == 0x80) { + /* + * Valid continuation byte. Append to sequence and + * update the ucs4 cache accordingly. 
+ */ + + t = byte & 0x3F; + p->chars[p->i_bytes++] = byte; + p->ucs4 |= t << (6 * (p->n_bytes - p->i_bytes)); /* i_bytes was advanced on the previous line, so the final continuation byte lands at shift 0 */ + } else { + /* + * Invalid continuation? Treat cached sequence as + * ISO-8859-1, but parse the new char as valid new + * starting character. If it's a new single-byte UTF-8 + * sequence, we immediately return it in the same run, + * otherwise, we might suffer from starvation. + */ + + if ((byte & 0xE0) == 0xC0 || + (byte & 0xF0) == 0xE0 || + (byte & 0xF8) == 0xF0) { + /* + * New multi-byte sequence. Move to-be-returned + * data at the end and start new sequence. Only + * return the old sequence. + * + * The cached bytes are shifted up by one slot so that + * chars[0] is free for the new lead byte. With n_bytes <= 4 + * as assigned in this function, at most 3 bytes are cached + * on this path, so the 5-element chars[] array suffices. + * + * NOTE(review): memmove() is declared in <string.h>, which is + * not among this file's direct includes -- presumably it + * arrives via util.h; confirm. + */ + + memmove(p->chars + 1, + p->chars, + sizeof(*p->chars) * p->i_bytes); + res = p->chars + 1; + len = p->i_bytes; + + if ((byte & 0xE0) == 0xC0) { + /* start of two byte sequence */ + t = byte & 0x1F; + p->n_bytes = 2; + p->i_bytes = 1; + p->valid = 1; + } else if ((byte & 0xF0) == 0xE0) { + /* start of three byte sequence */ + t = byte & 0x0F; + p->n_bytes = 3; + p->i_bytes = 1; + p->valid = 1; + } else if ((byte & 0xF8) == 0xF0) { + /* start of four byte sequence */ + t = byte & 0x07; + p->n_bytes = 4; + p->i_bytes = 1; + p->valid = 1; + } + + p->chars[0] = byte; + p->ucs4 = t << (6 * (p->n_bytes - p->i_bytes)); + + goto out; /* skip the state reset below: the new lead byte stays cached for the next call */ + } else { + /* + * New single byte sequence, append to output + * and return combined sequence. + */ + + p->chars[p->i_bytes++] = byte; + p->valid = 0; + } + } + } + + /* + * Check whether a full sequence (valid or invalid) has been parsed and + * then return it. Otherwise, return nothing. + */ + if (p->valid) { + /* still parsing? then bail out */ + if (p->i_bytes < p->n_bytes) + goto out; + + res = &p->ucs4; + len = 1; + } else { + res = p->chars; /* invalid input: hand back the raw bytes, one UCS-4 value per byte */ + len = p->i_bytes; + } + + p->valid = 0; + p->i_bytes = 0; + p->n_bytes = 0; /* sequence handed out: the next byte starts from scratch */ + +out: + if (out_len) + *out_len = len; + return len > 0 ? res : NULL; /* NULL whenever nothing is ready yet (also when p is NULL) */ +} + +/* + * Command Parser + * The ctl-seq parser "term_parser" only detects whole sequences, it does not + * detect the specific command. 
Once a sequence is parsed, the command-parsers + * are used to figure out their meaning. Note that this depends on whether we + * run on the host or terminal side. + */ + +/* Translate the control code stored in seq->terminator into its TERM_CMD_* id. Codes whose case only breaks (commented as handled by the state-machine) and codes without a case label end up as TERM_CMD_NONE. */ static unsigned int term_parse_host_control(const term_seq *seq) { + assert_return(seq, TERM_CMD_NONE); + + switch (seq->terminator) { + case 0x00: /* NUL */ + return TERM_CMD_NULL; + case 0x05: /* ENQ */ + return TERM_CMD_ENQ; + case 0x07: /* BEL */ + return TERM_CMD_BEL; + case 0x08: /* BS */ + return TERM_CMD_BS; + case 0x09: /* HT */ + return TERM_CMD_HT; + case 0x0a: /* LF */ + return TERM_CMD_LF; + case 0x0b: /* VT */ + return TERM_CMD_VT; + case 0x0c: /* FF */ + return TERM_CMD_FF; + case 0x0d: /* CR */ + return TERM_CMD_CR; + case 0x0e: /* SO */ + return TERM_CMD_SO; + case 0x0f: /* SI */ + return TERM_CMD_SI; + case 0x11: /* DC1 */ + return TERM_CMD_DC1; + case 0x13: /* DC3 */ + return TERM_CMD_DC3; + case 0x18: /* CAN */ + /* this is already handled by the state-machine */ + break; + case 0x1a: /* SUB */ + return TERM_CMD_SUB; + case 0x1b: /* ESC */ + /* this is already handled by the state-machine */ + break; + case 0x1f: /* DEL -- NOTE(review): ASCII DEL is 0x7f, 0x1f is US; confirm the intended value. Both values currently end up as TERM_CMD_NONE, so behaviour does not depend on it. */ + /* this is already handled by the state-machine */ + break; + case 0x84: /* IND */ + return TERM_CMD_IND; + case 0x85: /* NEL */ + return TERM_CMD_NEL; + case 0x88: /* HTS */ + return TERM_CMD_HTS; + case 0x8d: /* RI */ + return TERM_CMD_RI; + case 0x8e: /* SS2 */ + return TERM_CMD_SS2; + case 0x8f: /* SS3 */ + return TERM_CMD_SS3; + case 0x90: /* DCS */ + /* this is already handled by the state-machine */ + break; + case 0x96: /* SPA */ + return TERM_CMD_SPA; + case 0x97: /* EPA */ + return TERM_CMD_EPA; + case 0x98: /* SOS */ + /* this is already handled by the state-machine */ + break; + case 0x9a: /* DECID */ + return TERM_CMD_DECID; + case 0x9b: /* CSI */ + /* this is already handled by the state-machine */ + break; + case 0x9c: /* ST */ + return TERM_CMD_ST; + case 0x9d: /* OSC */ + /* this is already handled by the state-machine */ + break; + case 
0x9e: /* PM */ + /* this is already handled by the state-machine */ + break; + case 0x9f: /* APC */ + /* this is already handled by the state-machine */ + break; + } + + return TERM_CMD_NONE; +} + +static inline int charset_from_cmd(uint32_t raw, unsigned int flags, bool require_96) { + static const struct { + uint32_t raw; + unsigned int flags; + } charset_cmds[] = { + /* 96-compat charsets */ + [TERM_CHARSET_ISO_LATIN1_SUPPLEMENTAL] = { .raw = 'A', .flags = 0 }, + [TERM_CHARSET_ISO_LATIN2_SUPPLEMENTAL] = { .raw = 'B', .flags = 0 }, + [TERM_CHARSET_ISO_LATIN5_SUPPLEMENTAL] = { .raw = 'M', .flags = 0 }, + [TERM_CHARSET_ISO_GREEK_SUPPLEMENTAL] = { .raw = 'F', .flags = 0 }, + [TERM_CHARSET_ISO_HEBREW_SUPPLEMENTAL] = { .raw = 'H', .flags = 0 }, + [TERM_CHARSET_ISO_LATIN_CYRILLIC] = { .raw = 'L', .flags = 0 }, + + /* 94-compat charsets */ + [TERM_CHARSET_DEC_SPECIAL_GRAPHIC] = { .raw = '0', .flags = 0 }, + [TERM_CHARSET_DEC_SUPPLEMENTAL] = { .raw = '5', .flags = TERM_SEQ_FLAG_PERCENT }, + [TERM_CHARSET_DEC_TECHNICAL] = { .raw = '>', .flags = 0 }, + [TERM_CHARSET_CYRILLIC_DEC] = { .raw = '4', .flags = TERM_SEQ_FLAG_AND }, + [TERM_CHARSET_DUTCH_NRCS] = { .raw = '4', .flags = 0 }, + [TERM_CHARSET_FINNISH_NRCS] = { .raw = '5', .flags = 0 }, + [TERM_CHARSET_FRENCH_NRCS] = { .raw = 'R', .flags = 0 }, + [TERM_CHARSET_FRENCH_CANADIAN_NRCS] = { .raw = '9', .flags = 0 }, + [TERM_CHARSET_GERMAN_NRCS] = { .raw = 'K', .flags = 0 }, + [TERM_CHARSET_GREEK_DEC] = { .raw = '?', .flags = TERM_SEQ_FLAG_DQUOTE }, + [TERM_CHARSET_GREEK_NRCS] = { .raw = '>', .flags = TERM_SEQ_FLAG_DQUOTE }, + [TERM_CHARSET_HEBREW_DEC] = { .raw = '4', .flags = TERM_SEQ_FLAG_DQUOTE }, + [TERM_CHARSET_HEBREW_NRCS] = { .raw = '=', .flags = TERM_SEQ_FLAG_PERCENT }, + [TERM_CHARSET_ITALIAN_NRCS] = { .raw = 'Y', .flags = 0 }, + [TERM_CHARSET_NORWEGIAN_DANISH_NRCS] = { .raw = '`', .flags = 0 }, + [TERM_CHARSET_PORTUGUESE_NRCS] = { .raw = '6', .flags = TERM_SEQ_FLAG_PERCENT }, + [TERM_CHARSET_RUSSIAN_NRCS] = { .raw 
= '5', .flags = TERM_SEQ_FLAG_AND }, + [TERM_CHARSET_SCS_NRCS] = { .raw = '3', .flags = TERM_SEQ_FLAG_PERCENT }, + [TERM_CHARSET_SPANISH_NRCS] = { .raw = 'Z', .flags = 0 }, + [TERM_CHARSET_SWEDISH_NRCS] = { .raw = '7', .flags = 0 }, + [TERM_CHARSET_SWISS_NRCS] = { .raw = '=', .flags = 0 }, + [TERM_CHARSET_TURKISH_DEC] = { .raw = '0', .flags = TERM_SEQ_FLAG_PERCENT }, + [TERM_CHARSET_TURKISH_NRCS] = { .raw = '2', .flags = TERM_SEQ_FLAG_PERCENT }, + + /* special charsets */ + [TERM_CHARSET_USERPREF_SUPPLEMENTAL] = { .raw = '<', .flags = 0 }, + + /* secondary choices */ + [TERM_CHARSET_CNT + TERM_CHARSET_FINNISH_NRCS] = { .raw = 'C', .flags = 0 }, + [TERM_CHARSET_CNT + TERM_CHARSET_FRENCH_NRCS] = { .raw = 'f', .flags = 0 }, + [TERM_CHARSET_CNT + TERM_CHARSET_FRENCH_CANADIAN_NRCS] = { .raw = 'Q', .flags = 0 }, + [TERM_CHARSET_CNT + TERM_CHARSET_NORWEGIAN_DANISH_NRCS] = { .raw = 'E', .flags = 0 }, + [TERM_CHARSET_CNT + TERM_CHARSET_SWEDISH_NRCS] = { .raw = 'H', .flags = 0 }, /* unused; conflicts with ISO_HEBREW */ + + /* tertiary choices */ + [TERM_CHARSET_CNT + TERM_CHARSET_CNT + TERM_CHARSET_NORWEGIAN_DANISH_NRCS] = { .raw = '6', .flags = 0 }, + }; + size_t i, cs; + + /* + * Secondary choice on SWEDISH_NRCS and primary choice on + * ISO_HEBREW_SUPPLEMENTAL have a conflict: raw=="H", flags==0. + * We always choose the ISO 96-compat set, which is what VT510 does. 
+ */ + + for (i = 0; i < ELEMENTSOF(charset_cmds); ++i) { + if (charset_cmds[i].raw == raw && charset_cmds[i].flags == flags) { + cs = i; + while (cs >= TERM_CHARSET_CNT) + cs -= TERM_CHARSET_CNT; + + if (!require_96 || cs < TERM_CHARSET_96_CNT || cs >= TERM_CHARSET_94_CNT) + return cs; + } + } + + return -ENOENT; +} + +/* true if exactly one bit in @value is set */ +static inline bool exactly_one_bit_set(unsigned int value) { + return __builtin_popcount(value) == 1; +} + +static unsigned int term_parse_host_escape(const term_seq *seq, unsigned int *cs_out) { + unsigned int t, flags; + int cs; + + assert_return(seq, TERM_CMD_NONE); + + flags = seq->intermediates; + t = TERM_SEQ_FLAG_POPEN | TERM_SEQ_FLAG_PCLOSE | TERM_SEQ_FLAG_MULT | + TERM_SEQ_FLAG_PLUS | TERM_SEQ_FLAG_MINUS | TERM_SEQ_FLAG_DOT | + TERM_SEQ_FLAG_SLASH; + + if (exactly_one_bit_set(flags & t)) { + switch (flags & t) { + case TERM_SEQ_FLAG_POPEN: + case TERM_SEQ_FLAG_PCLOSE: + case TERM_SEQ_FLAG_MULT: + case TERM_SEQ_FLAG_PLUS: + cs = charset_from_cmd(seq->terminator, flags & ~t, false); + break; + case TERM_SEQ_FLAG_MINUS: + case TERM_SEQ_FLAG_DOT: + case TERM_SEQ_FLAG_SLASH: + cs = charset_from_cmd(seq->terminator, flags & ~t, true); + break; + default: + cs = -ENOENT; + break; + } + + if (cs >= 0) { + if (cs_out) + *cs_out = cs; + return TERM_CMD_SCS; + } + + /* looked like a charset-cmd but wasn't; continue */ + } + + switch (seq->terminator) { + case '3': + if (flags == TERM_SEQ_FLAG_HASH) /* DECDHL top-half */ + return TERM_CMD_DECDHL_TH; + break; + case '4': + if (flags == TERM_SEQ_FLAG_HASH) /* DECDHL bottom-half */ + return TERM_CMD_DECDHL_BH; + break; + case '5': + if (flags == TERM_SEQ_FLAG_HASH) /* DECSWL */ + return TERM_CMD_DECSWL; + break; + case '6': + if (flags == 0) /* DECBI */ + return TERM_CMD_DECBI; + else if (flags == TERM_SEQ_FLAG_HASH) /* DECDWL */ + return TERM_CMD_DECDWL; + break; + case '7': + if (flags == 0) /* DECSC */ + return TERM_CMD_DECSC; + break; + case '8': + if 
(flags == 0) /* DECRC */ + return TERM_CMD_DECRC; + else if (flags == TERM_SEQ_FLAG_HASH) /* DECALN */ + return TERM_CMD_DECALN; + break; + case '9': + if (flags == 0) /* DECFI */ + return TERM_CMD_DECFI; + break; + case '<': + if (flags == 0) /* DECANM */ + return TERM_CMD_DECANM; + break; + case '=': + if (flags == 0) /* DECKPAM */ + return TERM_CMD_DECKPAM; + break; + case '>': + if (flags == 0) /* DECKPNM */ + return TERM_CMD_DECKPNM; + break; + case '@': + if (flags == TERM_SEQ_FLAG_PERCENT) { + /* Select default character set */ + return TERM_CMD_XTERM_SDCS; + } + break; + case 'D': + if (flags == 0) /* IND */ + return TERM_CMD_IND; + break; + case 'E': + if (flags == 0) /* NEL */ + return TERM_CMD_NEL; + break; + case 'F': + if (flags == 0) /* Cursor to lower-left corner of screen */ + return TERM_CMD_XTERM_CLLHP; + else if (flags == TERM_SEQ_FLAG_SPACE) /* S7C1T */ + return TERM_CMD_S7C1T; + break; + case 'G': + if (flags == TERM_SEQ_FLAG_SPACE) { /* S8C1T */ + return TERM_CMD_S8C1T; + } else if (flags == TERM_SEQ_FLAG_PERCENT) { + /* Select UTF-8 character set */ + return TERM_CMD_XTERM_SUCS; + } + break; + case 'H': + if (flags == 0) /* HTS */ + return TERM_CMD_HTS; + break; + case 'L': + if (flags == TERM_SEQ_FLAG_SPACE) { + /* Set ANSI conformance level 1 */ + return TERM_CMD_XTERM_SACL1; + } + break; + case 'M': + if (flags == 0) { /* RI */ + return TERM_CMD_RI; + } else if (flags == TERM_SEQ_FLAG_SPACE) { + /* Set ANSI conformance level 2 */ + return TERM_CMD_XTERM_SACL2; + } + break; + case 'N': + if (flags == 0) { /* SS2 */ + return TERM_CMD_SS2; + } else if (flags == TERM_SEQ_FLAG_SPACE) { + /* Set ANSI conformance level 3 */ + return TERM_CMD_XTERM_SACL3; + } + break; + case 'O': + if (flags == 0) /* SS3 */ + return TERM_CMD_SS3; + break; + case 'P': + if (flags == 0) /* DCS: this is already handled by the state-machine */ + return 0; + break; + case 'V': + if (flags == 0) /* SPA */ + return TERM_CMD_SPA; + break; + case 'W': + if (flags == 0) /* 
EPA */ + return TERM_CMD_EPA; + break; + case 'X': + if (flags == 0) { /* SOS */ + /* this is already handled by the state-machine */ + break; + } + break; + case 'Z': + if (flags == 0) /* DECID */ + return TERM_CMD_DECID; + break; + case '[': + if (flags == 0) { /* CSI */ + /* this is already handled by the state-machine */ + break; + } + break; + case '\\': + if (flags == 0) /* ST */ + return TERM_CMD_ST; + break; + case ']': + if (flags == 0) { /* OSC */ + /* this is already handled by the state-machine */ + break; + } + break; + case '^': + if (flags == 0) { /* PM */ + /* this is already handled by the state-machine */ + break; + } + break; + case '_': + if (flags == 0) { /* APC */ + /* this is already handled by the state-machine */ + break; + } + break; + case 'c': + if (flags == 0) /* RIS */ + return TERM_CMD_RIS; + break; + case 'l': + if (flags == 0) /* Memory lock */ + return TERM_CMD_XTERM_MLHP; + break; + case 'm': + if (flags == 0) /* Memory unlock */ + return TERM_CMD_XTERM_MUHP; + break; + case 'n': + if (flags == 0) /* LS2 */ + return TERM_CMD_LS2; + break; + case 'o': + if (flags == 0) /* LS3 */ + return TERM_CMD_LS3; + break; + case '|': + if (flags == 0) /* LS3R */ + return TERM_CMD_LS3R; + break; + case '}': + if (flags == 0) /* LS2R */ + return TERM_CMD_LS2R; + break; + case '~': + if (flags == 0) /* LS1R */ + return TERM_CMD_LS1R; + break; + } + + return TERM_CMD_NONE; +} + +static unsigned int term_parse_host_csi(const term_seq *seq) { + unsigned int flags; + + assert_return(seq, TERM_CMD_NONE); + + flags = seq->intermediates; + + switch (seq->terminator) { + case 'A': + if (flags == 0) /* CUU */ + return TERM_CMD_CUU; + break; + case 'a': + if (flags == 0) /* HPR */ + return TERM_CMD_HPR; + break; + case 'B': + if (flags == 0) /* CUD */ + return TERM_CMD_CUD; + break; + case 'b': + if (flags == 0) /* REP */ + return TERM_CMD_REP; + break; + case 'C': + if (flags == 0) /* CUF */ + return TERM_CMD_CUF; + break; + case 'c': + if (flags == 0) /* 
DA1 */ + return TERM_CMD_DA1; + else if (flags == TERM_SEQ_FLAG_GT) /* DA2 */ + return TERM_CMD_DA2; + else if (flags == TERM_SEQ_FLAG_EQUAL) /* DA3 */ + return TERM_CMD_DA3; + break; + case 'D': + if (flags == 0) /* CUB */ + return TERM_CMD_CUB; + break; + case 'd': + if (flags == 0) /* VPA */ + return TERM_CMD_VPA; + break; + case 'E': + if (flags == 0) /* CNL */ + return TERM_CMD_CNL; + break; + case 'e': + if (flags == 0) /* VPR */ + return TERM_CMD_VPR; + break; + case 'F': + if (flags == 0) /* CPL */ + return TERM_CMD_CPL; + break; + case 'f': + if (flags == 0) /* HVP */ + return TERM_CMD_HVP; + break; + case 'G': + if (flags == 0) /* CHA */ + return TERM_CMD_CHA; + break; + case 'g': + if (flags == 0) /* TBC */ + return TERM_CMD_TBC; + else if (flags == TERM_SEQ_FLAG_MULT) /* DECLFKC */ + return TERM_CMD_DECLFKC; + break; + case 'H': + if (flags == 0) /* CUP */ + return TERM_CMD_CUP; + break; + case 'h': + if (flags == 0) /* SM ANSI */ + return TERM_CMD_SM_ANSI; + else if (flags == TERM_SEQ_FLAG_WHAT) /* SM DEC */ + return TERM_CMD_SM_DEC; + break; + case 'I': + if (flags == 0) /* CHT */ + return TERM_CMD_CHT; + break; + case 'i': + if (flags == 0) /* MC ANSI */ + return TERM_CMD_MC_ANSI; + else if (flags == TERM_SEQ_FLAG_WHAT) /* MC DEC */ + return TERM_CMD_MC_DEC; + break; + case 'J': + if (flags == 0) /* ED */ + return TERM_CMD_ED; + else if (flags == TERM_SEQ_FLAG_WHAT) /* DECSED */ + return TERM_CMD_DECSED; + break; + case 'K': + if (flags == 0) /* EL */ + return TERM_CMD_EL; + else if (flags == TERM_SEQ_FLAG_WHAT) /* DECSEL */ + return TERM_CMD_DECSEL; + break; + case 'L': + if (flags == 0) /* IL */ + return TERM_CMD_IL; + break; + case 'l': + if (flags == 0) /* RM ANSI */ + return TERM_CMD_RM_ANSI; + else if (flags == TERM_SEQ_FLAG_WHAT) /* RM DEC */ + return TERM_CMD_RM_DEC; + break; + case 'M': + if (flags == 0) /* DL */ + return TERM_CMD_DL; + break; + case 'm': + if (flags == 0) /* SGR */ + return TERM_CMD_SGR; + else if (flags == 
TERM_SEQ_FLAG_GT) /* XTERM SMR */ + return TERM_CMD_XTERM_SRV; + break; + case 'n': + if (flags == 0) /* DSR ANSI */ + return TERM_CMD_DSR_ANSI; + else if (flags == TERM_SEQ_FLAG_GT) /* XTERM RMR */ + return TERM_CMD_XTERM_RRV; + else if (flags == TERM_SEQ_FLAG_WHAT) /* DSR DEC */ + return TERM_CMD_DSR_DEC; + break; + case 'P': + if (flags == 0) /* DCH */ + return TERM_CMD_DCH; + else if (flags == TERM_SEQ_FLAG_SPACE) /* PPA */ + return TERM_CMD_PPA; + break; + case 'p': + if (flags == 0) /* DECSSL */ + return TERM_CMD_DECSSL; + else if (flags == TERM_SEQ_FLAG_SPACE) /* DECSSCLS */ + return TERM_CMD_DECSSCLS; + else if (flags == TERM_SEQ_FLAG_BANG) /* DECSTR */ + return TERM_CMD_DECSTR; + else if (flags == TERM_SEQ_FLAG_DQUOTE) /* DECSCL */ + return TERM_CMD_DECSCL; + else if (flags == TERM_SEQ_FLAG_CASH) /* DECRQM-ANSI */ + return TERM_CMD_DECRQM_ANSI; + else if (flags == (TERM_SEQ_FLAG_CASH | TERM_SEQ_FLAG_WHAT)) /* DECRQM-DEC */ + return TERM_CMD_DECRQM_DEC; + else if (flags == TERM_SEQ_FLAG_PCLOSE) /* DECSDPT */ + return TERM_CMD_DECSDPT; + else if (flags == TERM_SEQ_FLAG_MULT) /* DECSPPCS */ + return TERM_CMD_DECSPPCS; + else if (flags == TERM_SEQ_FLAG_PLUS) /* DECSR */ + return TERM_CMD_DECSR; + else if (flags == TERM_SEQ_FLAG_COMMA) /* DECLTOD */ + return TERM_CMD_DECLTOD; + else if (flags == TERM_SEQ_FLAG_GT) /* XTERM SPM */ + return TERM_CMD_XTERM_SPM; + break; + case 'Q': + if (flags == TERM_SEQ_FLAG_SPACE) /* PPR */ + return TERM_CMD_PPR; + break; + case 'q': + if (flags == 0) /* DECLL */ + return TERM_CMD_DECLL; + else if (flags == TERM_SEQ_FLAG_SPACE) /* DECSCUSR */ + return TERM_CMD_DECSCUSR; + else if (flags == TERM_SEQ_FLAG_DQUOTE) /* DECSCA */ + return TERM_CMD_DECSCA; + else if (flags == TERM_SEQ_FLAG_CASH) /* DECSDDT */ + return TERM_CMD_DECSDDT; + else if (flags == TERM_SEQ_FLAG_MULT) /* DECSRC */ + return TERM_CMD_DECSR; + else if (flags == TERM_SEQ_FLAG_PLUS) /* DECELF */ + return TERM_CMD_DECELF; + else if (flags == TERM_SEQ_FLAG_COMMA) /* 
DECTID */ + return TERM_CMD_DECTID; + break; + case 'R': + if (flags == TERM_SEQ_FLAG_SPACE) /* PPB */ + return TERM_CMD_PPB; + break; + case 'r': + if (flags == 0) { + /* DECSTBM */ + return TERM_CMD_DECSTBM; + } else if (flags == TERM_SEQ_FLAG_SPACE) { + /* DECSKCV */ + return TERM_CMD_DECSKCV; + } else if (flags == TERM_SEQ_FLAG_CASH) { + /* DECCARA */ + return TERM_CMD_DECCARA; + } else if (flags == TERM_SEQ_FLAG_MULT) { + /* DECSCS */ + return TERM_CMD_DECSCS; + } else if (flags == TERM_SEQ_FLAG_PLUS) { + /* DECSMKR */ + return TERM_CMD_DECSMKR; + } else if (flags == TERM_SEQ_FLAG_WHAT) { + /* + * There's a conflict between DECPCTERM and XTERM-RPM. + * XTERM-RPM takes a single argument, DECPCTERM takes 2. + * Split both up and forward the call to the closer + * match. + */ + if (seq->n_args <= 1) /* XTERM RPM */ + return TERM_CMD_XTERM_RPM; + else if (seq->n_args >= 2) /* DECPCTERM */ + return TERM_CMD_DECPCTERM; + } + break; + case 'S': + if (flags == 0) /* SU */ + return TERM_CMD_SU; + else if (flags == TERM_SEQ_FLAG_WHAT) /* XTERM SGFX */ + return TERM_CMD_XTERM_SGFX; + break; + case 's': + if (flags == 0) { + /* + * There's a conflict between DECSLRM and SC-ANSI which + * cannot be resolved without knowing the state of + * DECLRMM. We leave that decision up to the caller. + */ + return TERM_CMD_DECSLRM_OR_SC; + } else if (flags == TERM_SEQ_FLAG_CASH) { + /* DECSPRTT */ + return TERM_CMD_DECSPRTT; + } else if (flags == TERM_SEQ_FLAG_MULT) { + /* DECSFC */ + return TERM_CMD_DECSFC; + } else if (flags == TERM_SEQ_FLAG_WHAT) { + /* XTERM SPM */ + return TERM_CMD_XTERM_SPM; + } + break; + case 'T': + if (flags == 0) { + /* + * Awesome: There's a conflict between SD and XTERM IHMT + * that we have to resolve by checking the parameter + * count.. XTERM_IHMT needs exactly 5 arguments, SD + * takes 0 or 1. We're conservative here and give both + * a wider range to allow unused arguments (compat...). 
+ */ + if (seq->n_args >= 5) { + /* XTERM IHMT */ + return TERM_CMD_XTERM_IHMT; + } else if (seq->n_args < 5) { + /* SD */ + return TERM_CMD_SD; + } + } else if (flags == TERM_SEQ_FLAG_GT) { + /* XTERM RTM */ + return TERM_CMD_XTERM_RTM; + } + break; + case 't': + if (flags == 0) { + if (seq->n_args > 0 && seq->args[0] < 24) { + /* XTERM WM */ + return TERM_CMD_XTERM_WM; + } else { + /* DECSLPP */ + return TERM_CMD_DECSLPP; + } + } else if (flags == TERM_SEQ_FLAG_SPACE) { + /* DECSWBV */ + return TERM_CMD_DECSWBV; + } else if (flags == TERM_SEQ_FLAG_DQUOTE) { + /* DECSRFR */ + return TERM_CMD_DECSRFR; + } else if (flags == TERM_SEQ_FLAG_CASH) { + /* DECRARA */ + return TERM_CMD_DECRARA; + } else if (flags == TERM_SEQ_FLAG_GT) { + /* XTERM STM */ + return TERM_CMD_XTERM_STM; + } + break; + case 'U': + if (flags == 0) /* NP */ + return TERM_CMD_NP; + break; + case 'u': + if (flags == 0) { + /* RC */ + return TERM_CMD_RC; + } else if (flags == TERM_SEQ_FLAG_SPACE) { + /* DECSMBV */ + return TERM_CMD_DECSMBV; + } else if (flags == TERM_SEQ_FLAG_DQUOTE) { + /* DECSTRL */ + return TERM_CMD_DECSTRL; + } else if (flags == TERM_SEQ_FLAG_WHAT) { + /* DECRQUPSS */ + return TERM_CMD_DECRQUPSS; + } else if (seq->args[0] == 1 && flags == TERM_SEQ_FLAG_CASH) { + /* DECRQTSR */ + return TERM_CMD_DECRQTSR; + } else if (flags == TERM_SEQ_FLAG_MULT) { + /* DECSCP */ + return TERM_CMD_DECSCP; + } else if (flags == TERM_SEQ_FLAG_COMMA) { + /* DECRQKT */ + return TERM_CMD_DECRQKT; + } + break; + case 'V': + if (flags == 0) /* PP */ + return TERM_CMD_PP; + break; + case 'v': + if (flags == TERM_SEQ_FLAG_SPACE) /* DECSLCK */ + return TERM_CMD_DECSLCK; + else if (flags == TERM_SEQ_FLAG_DQUOTE) /* DECRQDE */ + return TERM_CMD_DECRQDE; + else if (flags == TERM_SEQ_FLAG_CASH) /* DECCRA */ + return TERM_CMD_DECCRA; + else if (flags == TERM_SEQ_FLAG_COMMA) /* DECRPKT */ + return TERM_CMD_DECRPKT; + break; + case 'W': + if (seq->args[0] == 5 && flags == TERM_SEQ_FLAG_WHAT) { + /* DECST8C */ + 
return TERM_CMD_DECST8C; + } + break; + case 'w': + if (flags == TERM_SEQ_FLAG_CASH) /* DECRQPSR */ + return TERM_CMD_DECRQPSR; + else if (flags == TERM_SEQ_FLAG_SQUOTE) /* DECEFR */ + return TERM_CMD_DECEFR; + else if (flags == TERM_SEQ_FLAG_PLUS) /* DECSPP */ + return TERM_CMD_DECSPP; + break; + case 'X': + if (flags == 0) /* ECH */ + return TERM_CMD_ECH; + break; + case 'x': + if (flags == 0) /* DECREQTPARM */ + return TERM_CMD_DECREQTPARM; + else if (flags == TERM_SEQ_FLAG_CASH) /* DECFRA */ + return TERM_CMD_DECFRA; + else if (flags == TERM_SEQ_FLAG_MULT) /* DECSACE */ + return TERM_CMD_DECSACE; + else if (flags == TERM_SEQ_FLAG_PLUS) /* DECRQPKFM */ + return TERM_CMD_DECRQPKFM; + break; + case 'y': + if (flags == 0) /* DECTST */ + return TERM_CMD_DECTST; + else if (flags == TERM_SEQ_FLAG_MULT) /* DECRQCRA */ + return TERM_CMD_DECRQCRA; + else if (flags == TERM_SEQ_FLAG_PLUS) /* DECPKFMR */ + return TERM_CMD_DECPKFMR; + break; + case 'Z': + if (flags == 0) /* CBT */ + return TERM_CMD_CBT; + break; + case 'z': + if (flags == TERM_SEQ_FLAG_CASH) /* DECERA */ + return TERM_CMD_DECERA; + else if (flags == TERM_SEQ_FLAG_SQUOTE) /* DECELR */ + return TERM_CMD_DECELR; + else if (flags == TERM_SEQ_FLAG_MULT) /* DECINVM */ + return TERM_CMD_DECINVM; + else if (flags == TERM_SEQ_FLAG_PLUS) /* DECPKA */ + return TERM_CMD_DECPKA; + break; + case '@': + if (flags == 0) /* ICH */ + return TERM_CMD_ICH; + break; + case '`': + if (flags == 0) /* HPA */ + return TERM_CMD_HPA; + break; + case '{': + if (flags == TERM_SEQ_FLAG_CASH) /* DECSERA */ + return TERM_CMD_DECSERA; + else if (flags == TERM_SEQ_FLAG_SQUOTE) /* DECSLE */ + return TERM_CMD_DECSLE; + break; + case '|': + if (flags == TERM_SEQ_FLAG_CASH) /* DECSCPP */ + return TERM_CMD_DECSCPP; + else if (flags == TERM_SEQ_FLAG_SQUOTE) /* DECRQLP */ + return TERM_CMD_DECRQLP; + else if (flags == TERM_SEQ_FLAG_MULT) /* DECSNLS */ + return TERM_CMD_DECSNLS; + break; + case '}': + if (flags == TERM_SEQ_FLAG_SPACE) /* DECKBD */ + 
return TERM_CMD_DECKBD; + else if (flags == TERM_SEQ_FLAG_CASH) /* DECSASD */ + return TERM_CMD_DECSASD; + else if (flags == TERM_SEQ_FLAG_SQUOTE) /* DECIC */ + return TERM_CMD_DECIC; + break; + case '~': + if (flags == TERM_SEQ_FLAG_SPACE) /* DECTME */ + return TERM_CMD_DECTME; + else if (flags == TERM_SEQ_FLAG_CASH) /* DECSSDT */ + return TERM_CMD_DECSSDT; + else if (flags == TERM_SEQ_FLAG_SQUOTE) /* DECDC */ + return TERM_CMD_DECDC; + break; + } + + return TERM_CMD_NONE; +} + +/* + * State Machine + * This parser controls the parser-state and returns any detected sequence to + * the caller. The parser is based on this state-diagram from Paul Williams: + * http://vt100.net/emu/ + * It was written from scratch and extended where needed. + * This parser is fully compatible up to the vt500 series. We expect UCS-4 as + * input. It's the callers responsibility to do any UTF-8 parsing. + */ + +enum parser_state { + STATE_NONE, /* placeholder */ + STATE_GROUND, /* initial state and ground */ + STATE_ESC, /* ESC sequence was started */ + STATE_ESC_INT, /* intermediate escape characters */ + STATE_CSI_ENTRY, /* starting CSI sequence */ + STATE_CSI_PARAM, /* CSI parameters */ + STATE_CSI_INT, /* intermediate CSI characters */ + STATE_CSI_IGNORE, /* CSI error; ignore this CSI sequence */ + STATE_DCS_ENTRY, /* starting DCS sequence */ + STATE_DCS_PARAM, /* DCS parameters */ + STATE_DCS_INT, /* intermediate DCS characters */ + STATE_DCS_PASS, /* DCS data passthrough */ + STATE_DCS_IGNORE, /* DCS error; ignore this DCS sequence */ + STATE_OSC_STRING, /* parsing OSC sequence */ + STATE_ST_IGNORE, /* unimplemented seq; ignore until ST */ + STATE_NUM +}; + +enum parser_action { + ACTION_NONE, /* placeholder */ + ACTION_CLEAR, /* clear parameters */ + ACTION_IGNORE, /* ignore the character entirely */ + ACTION_PRINT, /* print the character on the console */ + ACTION_EXECUTE, /* execute single control character (C0/C1) */ + ACTION_COLLECT, /* collect intermediate character */ + 
ACTION_PARAM, /* collect parameter character */ + ACTION_ESC_DISPATCH, /* dispatch escape sequence */ + ACTION_CSI_DISPATCH, /* dispatch csi sequence */ + ACTION_DCS_START, /* start of DCS data */ + ACTION_DCS_COLLECT, /* collect DCS data */ + ACTION_DCS_CONSUME, /* consume DCS terminator */ + ACTION_DCS_DISPATCH, /* dispatch dcs sequence */ + ACTION_OSC_START, /* start of OSC data */ + ACTION_OSC_COLLECT, /* collect OSC data */ + ACTION_OSC_CONSUME, /* consume OSC terminator */ + ACTION_OSC_DISPATCH, /* dispatch osc sequence */ + ACTION_NUM +}; + +int term_parser_new(term_parser **out, bool host) { + _term_parser_free_ term_parser *parser = NULL; + + assert_return(out, -EINVAL); + + parser = new0(term_parser, 1); + if (!parser) + return -ENOMEM; + + parser->is_host = host; + parser->st_alloc = 64; + parser->seq.st = new0(char, parser->st_alloc + 1); + if (!parser->seq.st) + return -ENOMEM; + + *out = parser; + parser = NULL; + return 0; +} + +term_parser *term_parser_free(term_parser *parser) { + if (!parser) + return NULL; + + free(parser->seq.st); + free(parser); + return NULL; +} + +static inline void parser_clear(term_parser *parser) { + unsigned int i; + + parser->seq.command = TERM_CMD_NONE; + parser->seq.terminator = 0; + parser->seq.intermediates = 0; + parser->seq.charset = TERM_CHARSET_NONE; + parser->seq.n_args = 0; + for (i = 0; i < TERM_PARSER_ARG_MAX; ++i) + parser->seq.args[i] = -1; + + parser->seq.n_st = 0; + parser->seq.st[0] = 0; +} + +static int parser_ignore(term_parser *parser, uint32_t raw) { + parser_clear(parser); + parser->seq.type = TERM_SEQ_IGNORE; + parser->seq.command = TERM_CMD_NONE; + parser->seq.terminator = raw; + parser->seq.charset = TERM_CHARSET_NONE; + + return parser->seq.type; +} + +static int parser_print(term_parser *parser, uint32_t raw) { + parser_clear(parser); + parser->seq.type = TERM_SEQ_GRAPHIC; + parser->seq.command = TERM_CMD_GRAPHIC; + parser->seq.terminator = raw; + parser->seq.charset = TERM_CHARSET_NONE; + + 
return parser->seq.type; +} + +static int parser_execute(term_parser *parser, uint32_t raw) { + parser_clear(parser); + parser->seq.type = TERM_SEQ_CONTROL; + parser->seq.command = TERM_CMD_GRAPHIC; + parser->seq.terminator = raw; + parser->seq.charset = TERM_CHARSET_NONE; + if (!parser->is_host) + parser->seq.command = term_parse_host_control(&parser->seq); + + return parser->seq.type; +} + +static void parser_collect(term_parser *parser, uint32_t raw) { + /* + * Usually, characters from 0x30 to 0x3f are only allowed as leading + * markers (or as part of the parameters), characters from 0x20 to 0x2f + * are only allowed as trailing markers. However, our state-machine + * already verifies those restrictions so we can handle them the same + * way here. Note that we safely allow markers to be specified multiple + * times. + */ + + if (raw >= 0x20 && raw <= 0x3f) + parser->seq.intermediates |= 1 << (raw - 0x20); +} + +static void parser_param(term_parser *parser, uint32_t raw) { + int new; + + if (raw == ';') { + if (parser->seq.n_args < TERM_PARSER_ARG_MAX) + ++parser->seq.n_args; + + return; + } + + if (parser->seq.n_args >= TERM_PARSER_ARG_MAX) + return; + + if (raw >= '0' && raw <= '9') { + new = parser->seq.args[parser->seq.n_args]; + if (new < 0) + new = 0; + new = new * 10 + raw - '0'; + + /* VT510 tells us to clamp all values to [0, 9999], however, it + * also allows commands with values up to 2^15-1. We simply use + * 2^16 as maximum here to be compatible to all commands, but + * avoid overflows in any calculations. 
*/ + if (new > 0xffff) + new = 0xffff; + + parser->seq.args[parser->seq.n_args] = new; + } +} + +static int parser_esc(term_parser *parser, uint32_t raw) { + parser->seq.type = TERM_SEQ_ESCAPE; + parser->seq.command = TERM_CMD_NONE; + parser->seq.terminator = raw; + parser->seq.charset = TERM_CHARSET_NONE; + if (!parser->is_host) + parser->seq.command = term_parse_host_escape(&parser->seq, &parser->seq.charset); + + return parser->seq.type; +} + +static int parser_csi(term_parser *parser, uint32_t raw) { + /* parser->seq is cleared during CSI-ENTER state, thus there's no need + * to clear invalid fields here. */ + + if (parser->seq.n_args < TERM_PARSER_ARG_MAX) { + if (parser->seq.n_args > 0 || + parser->seq.args[parser->seq.n_args] >= 0) + ++parser->seq.n_args; + } + + parser->seq.type = TERM_SEQ_CSI; + parser->seq.command = TERM_CMD_NONE; + parser->seq.terminator = raw; + parser->seq.charset = TERM_CHARSET_NONE; + if (!parser->is_host) + parser->seq.command = term_parse_host_csi(&parser->seq); + + return parser->seq.type; +} + +/* perform state transition and dispatch related actions */ +static int parser_transition(term_parser *parser, uint32_t raw, unsigned int state, unsigned int action) { + if (state != STATE_NONE) + parser->state = state; + + switch (action) { + case ACTION_NONE: + return TERM_SEQ_NONE; + case ACTION_CLEAR: + parser_clear(parser); + return TERM_SEQ_NONE; + case ACTION_IGNORE: + return parser_ignore(parser, raw); + case ACTION_PRINT: + return parser_print(parser, raw); + case ACTION_EXECUTE: + return parser_execute(parser, raw); + case ACTION_COLLECT: + parser_collect(parser, raw); + return TERM_SEQ_NONE; + case ACTION_PARAM: + parser_param(parser, raw); + return TERM_SEQ_NONE; + case ACTION_ESC_DISPATCH: + return parser_esc(parser, raw); + case ACTION_CSI_DISPATCH: + return parser_csi(parser, raw); + case ACTION_DCS_START: + /* not implemented */ + return TERM_SEQ_NONE; + case ACTION_DCS_COLLECT: + /* not implemented */ + return 
TERM_SEQ_NONE; + case ACTION_DCS_CONSUME: + /* not implemented */ + return TERM_SEQ_NONE; + case ACTION_DCS_DISPATCH: + /* not implemented */ + return TERM_SEQ_NONE; + case ACTION_OSC_START: + /* not implemented */ + return TERM_SEQ_NONE; + case ACTION_OSC_COLLECT: + /* not implemented */ + return TERM_SEQ_NONE; + case ACTION_OSC_CONSUME: + /* not implemented */ + return TERM_SEQ_NONE; + case ACTION_OSC_DISPATCH: + /* not implemented */ + return TERM_SEQ_NONE; + default: + assert_not_reached("invalid vte-parser action"); + return TERM_SEQ_NONE; + } +} + +static int parser_feed_to_state(term_parser *parser, uint32_t raw) { + switch (parser->state) { + case STATE_NONE: + /* + * During initialization, parser->state is cleared. Treat this + * as STATE_GROUND. We will then never get to STATE_NONE again. + */ + case STATE_GROUND: + switch (raw) { + case 0x00 ... 0x1f: /* C0 */ + case 0x80 ... 0x9b: /* C1 \ { ST } */ + case 0x9d ... 0x9f: + return parser_transition(parser, raw, STATE_NONE, ACTION_EXECUTE); + case 0x9c: /* ST */ + return parser_transition(parser, raw, STATE_NONE, ACTION_IGNORE); + } + + return parser_transition(parser, raw, STATE_NONE, ACTION_PRINT); + case STATE_ESC: + switch (raw) { + case 0x00 ... 0x1f: /* C0 */ + return parser_transition(parser, raw, STATE_NONE, ACTION_EXECUTE); + case 0x20 ... 0x2f: /* [' ' - '\'] */ + return parser_transition(parser, raw, STATE_ESC_INT, ACTION_COLLECT); + case 0x30 ... 0x4f: /* ['0' - '~'] \ { 'P', 'X', '[', ']', '^', '_' } */ + case 0x51 ... 0x57: + case 0x59 ... 0x5a: + case 0x5c: + case 0x60 ... 
0x7e: + return parser_transition(parser, raw, STATE_GROUND, ACTION_ESC_DISPATCH); + case 0x50: /* 'P' */ + return parser_transition(parser, raw, STATE_DCS_ENTRY, ACTION_CLEAR); + case 0x5b: /* '[' */ + return parser_transition(parser, raw, STATE_CSI_ENTRY, ACTION_CLEAR); + case 0x5d: /* ']' */ + return parser_transition(parser, raw, STATE_OSC_STRING, ACTION_CLEAR); + case 0x58: /* 'X' */ + case 0x5e: /* '^' */ + case 0x5f: /* '_' */ + return parser_transition(parser, raw, STATE_ST_IGNORE, ACTION_NONE); + case 0x7f: /* DEL */ + return parser_transition(parser, raw, STATE_NONE, ACTION_IGNORE); + case 0x9c: /* ST */ + return parser_transition(parser, raw, STATE_GROUND, ACTION_IGNORE); + } + + return parser_transition(parser, raw, STATE_ESC_INT, ACTION_COLLECT); + case STATE_ESC_INT: + switch (raw) { + case 0x00 ... 0x1f: /* C0 */ + return parser_transition(parser, raw, STATE_NONE, ACTION_EXECUTE); + case 0x20 ... 0x2f: /* [' ' - '\'] */ + return parser_transition(parser, raw, STATE_NONE, ACTION_COLLECT); + case 0x30 ... 0x7e: /* ['0' - '~'] */ + return parser_transition(parser, raw, STATE_GROUND, ACTION_ESC_DISPATCH); + case 0x7f: /* DEL */ + return parser_transition(parser, raw, STATE_NONE, ACTION_IGNORE); + case 0x9c: /* ST */ + return parser_transition(parser, raw, STATE_GROUND, ACTION_IGNORE); + } + + return parser_transition(parser, raw, STATE_NONE, ACTION_COLLECT); + case STATE_CSI_ENTRY: + switch (raw) { + case 0x00 ... 0x1f: /* C0 */ + return parser_transition(parser, raw, STATE_NONE, ACTION_EXECUTE); + case 0x20 ... 0x2f: /* [' ' - '\'] */ + return parser_transition(parser, raw, STATE_CSI_INT, ACTION_COLLECT); + case 0x3a: /* ':' */ + return parser_transition(parser, raw, STATE_CSI_IGNORE, ACTION_NONE); + case 0x30 ... 0x39: /* ['0' - '9'] */ + case 0x3b: /* ';' */ + return parser_transition(parser, raw, STATE_CSI_PARAM, ACTION_PARAM); + case 0x3c ... 
0x3f: /* ['<' - '?'] */ + return parser_transition(parser, raw, STATE_CSI_PARAM, ACTION_COLLECT); + case 0x40 ... 0x7e: /* ['@' - '~'] */ + return parser_transition(parser, raw, STATE_GROUND, ACTION_CSI_DISPATCH); + case 0x7f: /* DEL */ + return parser_transition(parser, raw, STATE_NONE, ACTION_IGNORE); + case 0x9c: /* ST */ + return parser_transition(parser, raw, STATE_GROUND, ACTION_IGNORE); + } + + return parser_transition(parser, raw, STATE_CSI_IGNORE, ACTION_NONE); + case STATE_CSI_PARAM: + switch (raw) { + case 0x00 ... 0x1f: /* C0 */ + return parser_transition(parser, raw, STATE_NONE, ACTION_EXECUTE); + case 0x20 ... 0x2f: /* [' ' - '\'] */ + return parser_transition(parser, raw, STATE_CSI_INT, ACTION_COLLECT); + case 0x30 ... 0x39: /* ['0' - '9'] */ + case 0x3b: /* ';' */ + return parser_transition(parser, raw, STATE_NONE, ACTION_PARAM); + case 0x3a: /* ':' */ + case 0x3c ... 0x3f: /* ['<' - '?'] */ + return parser_transition(parser, raw, STATE_CSI_IGNORE, ACTION_NONE); + case 0x40 ... 0x7e: /* ['@' - '~'] */ + return parser_transition(parser, raw, STATE_GROUND, ACTION_CSI_DISPATCH); + case 0x7f: /* DEL */ + return parser_transition(parser, raw, STATE_NONE, ACTION_IGNORE); + case 0x9c: /* ST */ + return parser_transition(parser, raw, STATE_GROUND, ACTION_IGNORE); + } + + return parser_transition(parser, raw, STATE_CSI_IGNORE, ACTION_NONE); + case STATE_CSI_INT: + switch (raw) { + case 0x00 ... 0x1f: /* C0 */ + return parser_transition(parser, raw, STATE_NONE, ACTION_EXECUTE); + case 0x20 ... 0x2f: /* [' ' - '\'] */ + return parser_transition(parser, raw, STATE_NONE, ACTION_COLLECT); + case 0x30 ... 0x3f: /* ['0' - '?'] */ + return parser_transition(parser, raw, STATE_CSI_IGNORE, ACTION_NONE); + case 0x40 ... 
0x7e: /* ['@' - '~'] */ + return parser_transition(parser, raw, STATE_GROUND, ACTION_CSI_DISPATCH); + case 0x7f: /* DEL */ + return parser_transition(parser, raw, STATE_NONE, ACTION_IGNORE); + case 0x9c: /* ST */ + return parser_transition(parser, raw, STATE_GROUND, ACTION_IGNORE); + } + + return parser_transition(parser, raw, STATE_CSI_IGNORE, ACTION_NONE); + case STATE_CSI_IGNORE: + switch (raw) { + case 0x00 ... 0x1f: /* C0 */ + return parser_transition(parser, raw, STATE_NONE, ACTION_EXECUTE); + case 0x20 ... 0x3f: /* [' ' - '?'] */ + return parser_transition(parser, raw, STATE_NONE, ACTION_NONE); + case 0x40 ... 0x7e: /* ['@' - '~'] */ + return parser_transition(parser, raw, STATE_GROUND, ACTION_NONE); + case 0x7f: /* DEL */ + return parser_transition(parser, raw, STATE_NONE, ACTION_IGNORE); + case 0x9c: /* ST */ + return parser_transition(parser, raw, STATE_GROUND, ACTION_IGNORE); + } + + return parser_transition(parser, raw, STATE_NONE, ACTION_NONE); + case STATE_DCS_ENTRY: + switch (raw) { + case 0x00 ... 0x1f: /* C0 */ + return parser_transition(parser, raw, STATE_NONE, ACTION_IGNORE); + case 0x20 ... 0x2f: /* [' ' - '\'] */ + return parser_transition(parser, raw, STATE_DCS_INT, ACTION_COLLECT); + case 0x3a: /* ':' */ + return parser_transition(parser, raw, STATE_DCS_IGNORE, ACTION_NONE); + case 0x30 ... 0x39: /* ['0' - '9'] */ + case 0x3b: /* ';' */ + return parser_transition(parser, raw, STATE_DCS_PARAM, ACTION_PARAM); + case 0x3c ... 0x3f: /* ['<' - '?'] */ + return parser_transition(parser, raw, STATE_DCS_PARAM, ACTION_COLLECT); + case 0x40 ... 
0x7e: /* ['@' - '~'] */ + return parser_transition(parser, raw, STATE_DCS_PASS, ACTION_DCS_CONSUME); + case 0x7f: /* DEL */ + return parser_transition(parser, raw, STATE_NONE, ACTION_IGNORE); + case 0x9c: /* ST */ + return parser_transition(parser, raw, STATE_GROUND, ACTION_IGNORE); + } + + return parser_transition(parser, raw, STATE_DCS_PASS, ACTION_DCS_CONSUME); + case STATE_DCS_PARAM: + switch (raw) { + case 0x00 ... 0x1f: /* C0 */ + return parser_transition(parser, raw, STATE_NONE, ACTION_IGNORE); + case 0x20 ... 0x2f: /* [' ' - '\'] */ + return parser_transition(parser, raw, STATE_DCS_INT, ACTION_COLLECT); + case 0x30 ... 0x39: /* ['0' - '9'] */ + case 0x3b: /* ';' */ + return parser_transition(parser, raw, STATE_NONE, ACTION_PARAM); + case 0x3a: /* ':' */ + case 0x3c ... 0x3f: /* ['<' - '?'] */ + return parser_transition(parser, raw, STATE_DCS_IGNORE, ACTION_NONE); + case 0x40 ... 0x7e: /* ['@' - '~'] */ + return parser_transition(parser, raw, STATE_DCS_PASS, ACTION_DCS_CONSUME); + case 0x7f: /* DEL */ + return parser_transition(parser, raw, STATE_NONE, ACTION_IGNORE); + case 0x9c: /* ST */ + return parser_transition(parser, raw, STATE_GROUND, ACTION_IGNORE); + } + + return parser_transition(parser, raw, STATE_DCS_PASS, ACTION_DCS_CONSUME); + case STATE_DCS_INT: + switch (raw) { + case 0x00 ... 0x1f: /* C0 */ + return parser_transition(parser, raw, STATE_NONE, ACTION_IGNORE); + case 0x20 ... 0x2f: /* [' ' - '\'] */ + return parser_transition(parser, raw, STATE_NONE, ACTION_COLLECT); + case 0x30 ... 0x3f: /* ['0' - '?'] */ + return parser_transition(parser, raw, STATE_DCS_IGNORE, ACTION_NONE); + case 0x40 ... 
0x7e: /* ['@' - '~'] */ + return parser_transition(parser, raw, STATE_DCS_PASS, ACTION_DCS_CONSUME); + case 0x7f: /* DEL */ + return parser_transition(parser, raw, STATE_NONE, ACTION_IGNORE); + case 0x9c: /* ST */ + return parser_transition(parser, raw, STATE_GROUND, ACTION_IGNORE); + } + + return parser_transition(parser, raw, STATE_DCS_PASS, ACTION_DCS_CONSUME); + case STATE_DCS_PASS: + switch (raw) { + case 0x00 ... 0x7e: /* ASCII \ { DEL } */ + return parser_transition(parser, raw, STATE_NONE, ACTION_DCS_COLLECT); + case 0x7f: /* DEL */ + return parser_transition(parser, raw, STATE_NONE, ACTION_IGNORE); + case 0x9c: /* ST */ + return parser_transition(parser, raw, STATE_GROUND, ACTION_DCS_DISPATCH); + } + + return parser_transition(parser, raw, STATE_NONE, ACTION_DCS_COLLECT); + case STATE_DCS_IGNORE: + switch (raw) { + case 0x00 ... 0x7f: /* ASCII */ + return parser_transition(parser, raw, STATE_NONE, ACTION_IGNORE); + case 0x9c: /* ST */ + return parser_transition(parser, raw, STATE_GROUND, ACTION_NONE); + } + + return parser_transition(parser, raw, STATE_NONE, ACTION_NONE); + case STATE_OSC_STRING: + switch (raw) { + case 0x00 ... 0x06: /* C0 \ { BEL } */ + case 0x08 ... 0x1f: + return parser_transition(parser, raw, STATE_NONE, ACTION_IGNORE); + case 0x20 ... 0x7f: /* [' ' - DEL] */ + return parser_transition(parser, raw, STATE_NONE, ACTION_OSC_COLLECT); + case 0x07: /* BEL */ + case 0x9c: /* ST */ + return parser_transition(parser, raw, STATE_GROUND, ACTION_OSC_DISPATCH); + } + + return parser_transition(parser, raw, STATE_NONE, ACTION_OSC_COLLECT); + case STATE_ST_IGNORE: + switch (raw) { + case 0x00 ... 
0x7f: /* ASCII */ + return parser_transition(parser, raw, STATE_NONE, ACTION_IGNORE); + case 0x9c: /* ST */ + return parser_transition(parser, raw, STATE_GROUND, ACTION_IGNORE); + } + + return parser_transition(parser, raw, STATE_NONE, ACTION_NONE); + } + + assert_not_reached("bad vte-parser state"); + return -EINVAL; +} + /* term_parser_feed(): feed one input value @raw into @parser. Both pointer arguments are checked with assert_return(..., -EINVAL). CAN, SUB, ESC and the C1 values listed in the switch below are dispatched right here with a fixed target state and action, without consulting the per-state tables; every other value is forwarded to parser_feed_to_state(). The result r is returned unchanged; *seq_out is set to &parser->seq when r > 0 and to NULL otherwise. */ +int term_parser_feed(term_parser *parser, const term_seq **seq_out, uint32_t raw) { + int r; + + assert_return(parser, -EINVAL); + assert_return(seq_out, -EINVAL); + + /* + * Notes: + * * DEC treats GR codes as GL. We don't do that as we require UTF-8 + * as charset and, thus, it doesn't make sense to treat GR special. + * * During control sequences, unexpected C1 codes cancel the sequence + * and immediately start a new one. C0 codes, however, may or may not + * be ignored/executed depending on the sequence. + */ + + switch (raw) { + case 0x18: /* CAN */ + r = parser_transition(parser, raw, STATE_GROUND, ACTION_IGNORE); + break; + case 0x1a: /* SUB */ + r = parser_transition(parser, raw, STATE_GROUND, ACTION_EXECUTE); + break; + case 0x80 ... 0x8f: /* C1 \ {DCS, SOS, CSI, ST, OSC, PM, APC} */ + case 0x91 ... 0x97: + case 0x99 ... 
0x9a: + r = parser_transition(parser, raw, STATE_GROUND, ACTION_EXECUTE); + break; + case 0x1b: /* ESC */ + r = parser_transition(parser, raw, STATE_ESC, ACTION_CLEAR); + break; + case 0x98: /* SOS */ + case 0x9e: /* PM */ + case 0x9f: /* APC */ + r = parser_transition(parser, raw, STATE_ST_IGNORE, ACTION_NONE); + break; + case 0x90: /* DCS */ + r = parser_transition(parser, raw, STATE_DCS_ENTRY, ACTION_CLEAR); + break; + case 0x9d: /* OSC */ + r = parser_transition(parser, raw, STATE_OSC_STRING, ACTION_CLEAR); + break; + case 0x9b: /* CSI */ + r = parser_transition(parser, raw, STATE_CSI_ENTRY, ACTION_CLEAR); + break; + default: /* not handled above (this includes ST, 0x9c): defer to the per-state switch */ + r = parser_feed_to_state(parser, raw); + break; + } + /* only a positive r hands parser->seq to the caller; zero and negative values clear *seq_out */ + if (r <= 0) + *seq_out = NULL; + else + *seq_out = &parser->seq; + + return r; +} diff --git a/src/libsystemd-terminal/test-term-parser.c b/src/libsystemd-terminal/test-term-parser.c new file mode 100644 index 0000000000..ed16f5f276 --- /dev/null +++ b/src/libsystemd-terminal/test-term-parser.c @@ -0,0 +1,143 @@ +/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/ +/*** + This file is part of systemd. + + Copyright (C) 2014 David Herrmann <dh.herrmann@gmail.com> + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see <http://www.gnu.org/licenses/>. 
+***/ + +/* + * Terminal Parser Tests + */ + +#include <assert.h> +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include "macro.h" +#include "term-internal.h" +#include "util.h" + /* test_term_utf8_invalid(): calls term_utf8_decode() with NULL and degenerate arguments. Expectations pinned below: a NULL decoder yields NULL (and a length of 0 when a length pointer is passed); a NULL length pointer is accepted; a 0 byte yields one value; a lone 0xCf byte yields nothing, and the 0 byte fed right after it yields a result of length 2. */ +static void test_term_utf8_invalid(void) { + term_utf8 p = { }; + const uint32_t *res; + size_t len; + + res = term_utf8_decode(NULL, NULL, 0); + assert_se(res == NULL); + + res = term_utf8_decode(&p, NULL, 0); + assert_se(res != NULL); + + len = 5; + res = term_utf8_decode(NULL, &len, 0); + assert_se(res == NULL); + assert_se(len == 0); + + len = 5; + res = term_utf8_decode(&p, &len, 0); + assert_se(res != NULL); + assert_se(len == 1); + + len = 5; + res = term_utf8_decode(&p, &len, 0xCf); + assert_se(res == NULL); + assert_se(len == 0); + + len = 5; + res = term_utf8_decode(&p, &len, 0x0); + assert_se(res != NULL); + assert_se(len == 2); +} + /* test_term_utf8_range(): round-trip check. Each value i is encoded with term_utf8_encode(); values for which the encoder returns 0 are skipped. The encoded bytes are fed one at a time to term_utf8_decode(), which must stay silent until the last byte and then return exactly one value equal to i. NOTE(review): the bound `i < 0x10FFFF` leaves 0x10FFFF itself untested although the comment below says "all" - confirm whether that is intended. */ +static void test_term_utf8_range(void) { + term_utf8 p = { }; + const uint32_t *res; + char u8[4]; + uint32_t i, j; + size_t ulen, len; + + /* Convert all ucs-4 chars to utf-8 and back */ + + for (i = 0; i < 0x10FFFF; ++i) { + ulen = term_utf8_encode(u8, i); + if (!ulen) + continue; + + for (j = 0; j < ulen; ++j) { + res = term_utf8_decode(&p, &len, u8[j]); + if (!res) { + assert_se(j + 1 != ulen); + continue; + } + + assert_se(j + 1 == ulen); + assert_se(len == 1 && *res == i); + assert_se(i <= 127 || ulen >= 2); + } + } +} + /* test_term_utf8_mix(): feeds the bytes of source[] one at a time through a single decoder and checks that the concatenated output equals result[]. Each row of result[] lists the expected output for the source[] row at the same position. */ +static void test_term_utf8_mix(void) { + static const char source[] = { + 0x00, /* normal 0 */ + 0xC0, 0x80, /* overlong 0 */ + 0xC0, 0x81, /* overlong 1 */ + 0xE0, 0x80, 0x81, /* overlong 1 */ + 0xF0, 0x80, 0x80, 0x81, /* overlong 1 */ + 0xC0, 0x00, /* invalid continuation */ + 0xC0, 0xC0, 0x81, /* invalid continuation with a following overlong 1 */ + 0xF8, 0x80, 0x80, 0x80, 0x81, /* overlong 1 with 5 bytes */ + 0xE0, 0x80, 0xC0, 0x81, /* invalid 3-byte followed by a 2-byte overlong 1 */ + 0xF0, 0x80, 0x80, 0xC0, 0x81, /* invalid 4-byte followed by a 2-byte overlong 1 */ + }; + static const uint32_t 
result[] = { + 0x0000, + 0x0000, + 0x0001, + 0x0001, + 0x0001, + 0x00C0, 0x0000, + 0x00C0, 0x0001, + 0x00F8, 0x0080, 0x0080, 0x0080, 0x0081, + 0x00E0, 0x0080, 0x0001, + 0x00F0, 0x0080, 0x0080, 0x0001, + }; + term_utf8 p = { }; + const uint32_t *res; + unsigned int i, j; + size_t len; + /* j counts the result[] entries matched so far; a NULL return means the decoder produced no output for this byte */ + for (i = 0, j = 0; i < sizeof(source); ++i) { + res = term_utf8_decode(&p, &len, source[i]); + if (!res) + continue; + + assert_se(j + len <= ELEMENTSOF(result)); + assert_se(!memcmp(res, &result[j], sizeof(uint32_t) * len)); + j += len; + } + /* every expected value must have been produced, no more and no fewer */ + assert_se(j == ELEMENTSOF(result)); +} + /* main(): runs the three UTF-8 tests above in order; argc and argv are unused. Nothing in this file calls term_parser_feed(). */ +int main(int argc, char *argv[]) { + test_term_utf8_invalid(); + test_term_utf8_range(); + test_term_utf8_mix(); + + return 0; +} |