diff options
author | Lennart Poettering <lennart@poettering.net> | 2015-04-10 11:27:47 +0200 |
---|---|---|
committer | Lennart Poettering <lennart@poettering.net> | 2015-04-10 11:28:34 +0200 |
commit | f3ee629711783333005c41e21d66841268b80f70 (patch) | |
tree | ba8b7e109c433d85d4b3220be86309afbaa1caf6 | |
parent | 96406c1a278bfd1c1b69a248cc6c223755077f02 (diff) |
util: when unescaping C escape sequences, support C++11 \u and \U Unicode literals
We simply re-encode them as UTF-8.
-rw-r--r-- | src/shared/utf8.c | 13 | ||||
-rw-r--r-- | src/shared/utf8.h | 2 | ||||
-rw-r--r-- | src/shared/util.c | 146 | ||||
-rw-r--r-- | src/test/test-util.c | 9 |
4 files changed, 144 insertions, 26 deletions
diff --git a/src/shared/utf8.c b/src/shared/utf8.c index 013c110f07..800884ffee 100644 --- a/src/shared/utf8.c +++ b/src/shared/utf8.c @@ -52,7 +52,7 @@ #include "utf8.h" #include "util.h" -static inline bool is_unicode_valid(uint32_t ch) { +bool unichar_is_valid(uint32_t ch) { if (ch >= 0x110000) /* End of unicode space */ return false; @@ -66,7 +66,7 @@ static inline bool is_unicode_valid(uint32_t ch) { return true; } -static bool is_unicode_control(uint32_t ch) { +static bool unichar_is_control(uint32_t ch) { /* 0 to ' '-1 is the C0 range. @@ -156,7 +156,7 @@ bool utf8_is_printable_newline(const char* str, size_t length, bool newline) { val = utf8_encoded_to_unichar(p); if (val < 0 || - is_unicode_control(val) || + unichar_is_control(val) || (!newline && val == '\n')) return false; @@ -276,6 +276,7 @@ char *ascii_is_valid(const char *str) { * occupy. */ size_t utf8_encode_unichar(char *out_utf8, uint32_t g) { + if (g < (1 << 7)) { if (out_utf8) out_utf8[0] = g & 0x7f; @@ -301,9 +302,9 @@ size_t utf8_encode_unichar(char *out_utf8, uint32_t g) { out_utf8[3] = 0x80 | (g & 0x3f); } return 4; - } else { - return 0; } + + return 0; } char *utf16_to_utf8(const void *s, size_t length) { @@ -394,7 +395,7 @@ int utf8_encoded_valid_unichar(const char *str) { return -EINVAL; /* check if value has valid range */ - if (!is_unicode_valid(unichar)) + if (!unichar_is_valid(unichar)) return -EINVAL; return len; diff --git a/src/shared/utf8.h b/src/shared/utf8.h index 77f663438e..e745649f06 100644 --- a/src/shared/utf8.h +++ b/src/shared/utf8.h @@ -27,6 +27,8 @@ #define UTF8_REPLACEMENT_CHARACTER "\xef\xbf\xbd" +bool unichar_is_valid(uint32_t c); + const char *utf8_is_valid(const char *s) _pure_; char *ascii_is_valid(const char *s) _pure_; diff --git a/src/shared/util.c b/src/shared/util.c index 7175e808b3..2385224fbb 100644 --- a/src/shared/util.c +++ b/src/shared/util.c @@ -1347,13 +1347,17 @@ char *cescape(const char *s) { return r; } -static int cunescape_one(const char *p, 
size_t length, char *ret) { +static int cunescape_one(const char *p, size_t length, char *ret, uint32_t *ret_unicode) { int r = 1; assert(p); assert(*p); assert(ret); + /* Unescapes C style. Returns the unescaped character in ret, + * unless we encountered a \u sequence in which case the full + * unicode character is returned in ret_unicode, instead. */ + if (length != (size_t) -1 && length < 1) return -EINVAL; @@ -1410,15 +1414,92 @@ static int cunescape_one(const char *p, size_t length, char *ret) { if (b < 0) return -EINVAL; - /* don't allow NUL bytes */ + /* Don't allow NUL bytes */ if (a == 0 && b == 0) return -EINVAL; - *ret = (char) ((a << 4) | b); + *ret = (char) ((a << 4U) | b); r = 3; break; } + case 'u': { + /* C++11 style 16bit unicode */ + + int a[4]; + unsigned i; + uint32_t c; + + if (length != (size_t) -1 && length < 5) + return -EINVAL; + + for (i = 0; i < 4; i++) { + a[i] = unhexchar(p[1 + i]); + if (a[i] < 0) + return a[i]; + } + + c = ((uint32_t) a[0] << 12U) | ((uint32_t) a[1] << 8U) | ((uint32_t) a[2] << 4U) | (uint32_t) a[3]; + + /* Don't allow 0 chars */ + if (c == 0) + return -EINVAL; + + if (c < 128) + *ret = c; + else { + if (!ret_unicode) + return -EINVAL; + + *ret = 0; + *ret_unicode = c; + } + + r = 5; + break; + } + + case 'U': { + /* C++11 style 32bit unicode */ + + int a[8]; + unsigned i; + uint32_t c; + + if (length != (size_t) -1 && length < 9) + return -EINVAL; + + for (i = 0; i < 8; i++) { + a[i] = unhexchar(p[1 + i]); + if (a[i] < 0) + return a[i]; + } + + c = ((uint32_t) a[0] << 28U) | ((uint32_t) a[1] << 24U) | ((uint32_t) a[2] << 20U) | ((uint32_t) a[3] << 16U) | + ((uint32_t) a[4] << 12U) | ((uint32_t) a[5] << 8U) | ((uint32_t) a[6] << 4U) | (uint32_t) a[7]; + + /* Don't allow 0 chars */ + if (c == 0) + return -EINVAL; + + /* Don't allow invalid code points */ + if (!unichar_is_valid(c)) + return -EINVAL; + + if (c < 128) + *ret = c; + else { + if (!ret_unicode) + return -EINVAL; + + *ret = 0; + *ret_unicode = c; + } + + r 
= 9; + break; + } + case '0': case '1': case '2': @@ -1428,7 +1509,8 @@ static int cunescape_one(const char *p, size_t length, char *ret) { case '6': case '7': { /* octal encoding */ - int a, b, c, m; + int a, b, c; + uint32_t m; if (length != (size_t) -1 && length < 4) return -EINVAL; @@ -1450,11 +1532,11 @@ static int cunescape_one(const char *p, size_t length, char *ret) { return -EINVAL; /* Don't allow bytes above 255 */ - m = (a << 6) | (b << 3) | c; + m = ((uint32_t) a << 6U) | ((uint32_t) b << 3U) | (uint32_t) c; if (m > 255) return -EINVAL; - *ret = (char) m; + *ret = m; r = 3; break; } @@ -1487,6 +1569,8 @@ int cunescape_length_with_prefix(const char *s, size_t length, const char *prefi for (f = s, t = r + pl; f < s + length; f++) { size_t remaining; + uint32_t u; + char c; int k; remaining = s + length - f; @@ -1509,7 +1593,7 @@ int cunescape_length_with_prefix(const char *s, size_t length, const char *prefi return -EINVAL; } - k = cunescape_one(f + 1, remaining - 1, t); + k = cunescape_one(f + 1, remaining - 1, &c, &u); if (k < 0) { if (flags & UNESCAPE_RELAX) { /* Invalid escape code, let's take it literal then */ @@ -1521,8 +1605,14 @@ int cunescape_length_with_prefix(const char *s, size_t length, const char *prefi return k; } + if (c != 0) + /* Non-Unicode? Let's encode this directly */ + *(t++) = c; + else + /* Unicode? 
Then let's encode this in UTF-8 */ + t += utf8_encode_unichar(t, u); + f += k; - t++; } *t = 0; @@ -7198,16 +7288,22 @@ int unquote_first_word(const char **p, char **ret, UnquoteFlags flags) { return -ENOMEM; if (flags & UNQUOTE_CUNESCAPE) { - r = cunescape_one(*p, (size_t) -1, &c); + uint32_t u; + + r = cunescape_one(*p, (size_t) -1, &c, &u); if (r < 0) return -EINVAL; (*p) += r - 1; - } - s[sz++] = c; - state = VALUE; + if (c != 0) + s[sz++] = c; /* normal explicit char */ + else + sz += utf8_encode_unichar(s, u); /* unicode chars we'll encode as utf8 */ + } else + s[sz++] = c; + state = VALUE; break; case SINGLE_QUOTE: @@ -7239,14 +7335,21 @@ int unquote_first_word(const char **p, char **ret, UnquoteFlags flags) { return -ENOMEM; if (flags & UNQUOTE_CUNESCAPE) { - r = cunescape_one(*p, (size_t) -1, &c); + uint32_t u; + + r = cunescape_one(*p, (size_t) -1, &c, &u); if (r < 0) return -EINVAL; (*p) += r - 1; - } - s[sz++] = c; + if (c != 0) + s[sz++] = c; + else + sz += utf8_encode_unichar(s, u); + } else + s[sz++] = c; + state = SINGLE_QUOTE; break; @@ -7277,14 +7380,21 @@ int unquote_first_word(const char **p, char **ret, UnquoteFlags flags) { return -ENOMEM; if (flags & UNQUOTE_CUNESCAPE) { - r = cunescape_one(*p, (size_t) -1, &c); + uint32_t u; + + r = cunescape_one(*p, (size_t) -1, &c, &u); if (r < 0) return -EINVAL; (*p) += r - 1; - } - s[sz++] = c; + if (c != 0) + s[sz++] = c; + else + sz += utf8_encode_unichar(s, u); + } else + s[sz++] = c; + state = DOUBLE_QUOTE; break; diff --git a/src/test/test-util.c b/src/test/test-util.c index f1403fa393..7af41bcb18 100644 --- a/src/test/test-util.c +++ b/src/test/test-util.c @@ -416,11 +416,10 @@ static void test_cescape(void) { static void test_cunescape(void) { _cleanup_free_ char *unescaped; - const char *x = "abc\\\"\b\f\a\n\r\t\v\003\177\234\313\\000\\x00"; assert_se(cunescape("abc\\\\\\\"\\b\\f\\a\\n\\r\\t\\v\\003\\177\\234\\313\\000\\x00", 0, &unescaped) < 0); 
assert_se(cunescape("abc\\\\\\\"\\b\\f\\a\\n\\r\\t\\v\\003\\177\\234\\313\\000\\x00", UNESCAPE_RELAX, &unescaped) >= 0); - assert_se(streq_ptr(unescaped, x)); + assert_se(streq_ptr(unescaped, "abc\\\"\b\f\a\n\r\t\v\003\177\234\313\\000\\x00")); free(unescaped); unescaped = NULL; @@ -452,6 +451,12 @@ static void test_cunescape(void) { assert_se(cunescape("\\1", 0, &unescaped) < 0); assert_se(cunescape("\\1", UNESCAPE_RELAX, &unescaped) >= 0); assert_se(streq_ptr(unescaped, "\\1")); + free(unescaped); + unescaped = NULL; + + assert_se(cunescape("\\u0000", 0, &unescaped) < 0); + assert_se(cunescape("\\u00DF\\U000000df\\u03a0\\U00000041", UNESCAPE_RELAX, &unescaped) >= 0); + assert_se(streq_ptr(unescaped, "ßßΠA")); } static void test_foreach_word(void) { |