diff options
author | Dave Reisner <dreisner@archlinux.org> | 2013-09-17 15:39:09 -0400 |
---|---|---|
committer | Dave Reisner <dreisner@archlinux.org> | 2013-09-17 16:31:32 -0400 |
commit | 02a36bc9a1769c744f141f127898dbe076dbfd96 (patch) | |
tree | d6ef940458ee7a79e8a03ddba80735fd76512a1d /src | |
parent | bf24e638afb32ebc49b063c007888410fb9d19bd (diff) |
move utf8 functions from libudev-private.h to utf8.h
There's now some more obvious overlap amongst the two utf8 validation
functions, but no more than there already was previously.
This also adds some menial tests for anyone who wants to do more
merging of these two in the future.
Diffstat (limited to 'src')
-rw-r--r-- | src/libudev/libudev-util.c | 171 | ||||
-rw-r--r-- | src/shared/utf8.c | 151 | ||||
-rw-r--r-- | src/shared/utf8.h | 4 | ||||
-rw-r--r-- | src/test/test-utf8.c | 59 |
4 files changed, 217 insertions, 168 deletions
diff --git a/src/libudev/libudev-util.c b/src/libudev/libudev-util.c index 714dc50ae9..d54430cad0 100644 --- a/src/libudev/libudev-util.c +++ b/src/libudev/libudev-util.c @@ -34,6 +34,7 @@ #include "libudev.h" #include "libudev-private.h" +#include "utf8.h" /** * SECTION:libudev-util @@ -306,129 +307,6 @@ void util_remove_trailing_chars(char *path, char c) path[--len] = '\0'; } -/* count of characters used to encode one unicode char */ -static int utf8_encoded_expected_len(const char *str) -{ - unsigned char c = (unsigned char)str[0]; - - if (c < 0x80) - return 1; - if ((c & 0xe0) == 0xc0) - return 2; - if ((c & 0xf0) == 0xe0) - return 3; - if ((c & 0xf8) == 0xf0) - return 4; - if ((c & 0xfc) == 0xf8) - return 5; - if ((c & 0xfe) == 0xfc) - return 6; - return 0; -} - -/* decode one unicode char */ -static int utf8_encoded_to_unichar(const char *str) -{ - int unichar; - int len; - int i; - - len = utf8_encoded_expected_len(str); - switch (len) { - case 1: - return (int)str[0]; - case 2: - unichar = str[0] & 0x1f; - break; - case 3: - unichar = (int)str[0] & 0x0f; - break; - case 4: - unichar = (int)str[0] & 0x07; - break; - case 5: - unichar = (int)str[0] & 0x03; - break; - case 6: - unichar = (int)str[0] & 0x01; - break; - default: - return -1; - } - - for (i = 1; i < len; i++) { - if (((int)str[i] & 0xc0) != 0x80) - return -1; - unichar <<= 6; - unichar |= (int)str[i] & 0x3f; - } - - return unichar; -} - -/* expected size used to encode one unicode char */ -static int utf8_unichar_to_encoded_len(int unichar) -{ - if (unichar < 0x80) - return 1; - if (unichar < 0x800) - return 2; - if (unichar < 0x10000) - return 3; - if (unichar < 0x200000) - return 4; - if (unichar < 0x4000000) - return 5; - return 6; -} - -/* check if unicode char has a valid numeric range */ -static int utf8_unichar_valid_range(int unichar) -{ - if (unichar > 0x10ffff) - return 0; - if ((unichar & 0xfffff800) == 0xd800) - return 0; - if ((unichar > 0xfdcf) && (unichar < 0xfdf0)) - return 0; - if ((unichar & 0xffff) == 0xffff) - return 0; - return 1; -} - -/* validate one encoded unicode char and return its length */ -static int utf8_encoded_valid_unichar(const char *str) -{ - int len; - int unichar; - int i; - - len = utf8_encoded_expected_len(str); - if (len == 0) - return -1; - - /* ascii is valid */ - if (len == 1) - return 1; - - /* check if expected encoded chars are available */ - for (i = 0; i < len; i++) - if ((str[i] & 0x80) != 0x80) - return -1; - - unichar = utf8_encoded_to_unichar(str); - - /* check if encoded length matches encoded value */ - if (utf8_unichar_to_encoded_len(unichar) != len) - return -1; - - /* check if value has valid range */ - if (!utf8_unichar_valid_range(unichar)) - return -1; - - return len; -} - int util_replace_whitespace(const char *str, char *to, size_t len) { size_t i, j; @@ -457,17 +335,6 @@ int util_replace_whitespace(const char *str, char *to, size_t len) return 0; } -static int is_whitelisted(char c, const char *white) -{ - if ((c >= '0' && c <= '9') || - (c >= 'A' && c <= 'Z') || - (c >= 'a' && c <= 'z') || - strchr("#+-.:=@_", c) != NULL || - (white != NULL && strchr(white, c) != NULL)) - return 1; - return 0; -} - /* allow chars in whitelist, plain ascii, hex-escaping and valid utf8 */ int util_replace_chars(char *str, const char *white) { @@ -477,7 +344,7 @@ int util_replace_chars(char *str, const char *white) while (str[i] != '\0') { int len; - if (is_whitelisted(str[i], white)) { + if (is_utf8_encoding_whitelisted(str[i], white)) { i++; continue; } @@ -525,39 +392,7 @@ int util_replace_chars(char *str, const char *white) **/ _public_ int udev_util_encode_string(const char *str, char *str_enc, size_t len) { - size_t i, j; - - if (str == NULL || str_enc == NULL) - return -1; - - for (i = 0, j = 0; str[i] != '\0'; i++) { - int seqlen; - - seqlen = utf8_encoded_valid_unichar(&str[i]); - if (seqlen > 1) { - if (len-j < (size_t)seqlen) - goto err; - memcpy(&str_enc[j], &str[i], seqlen); - j += seqlen; - i += (seqlen-1); - } else if (str[i] == '\\' || !is_whitelisted(str[i], NULL)) { - if (len-j < 4) - goto err; - sprintf(&str_enc[j], "\\x%02x", (unsigned char) str[i]); - j += 4; - } else { - if (len-j < 1) - goto err; - str_enc[j] = str[i]; - j++; - } - } - if (len-j < 1) - goto err; - str_enc[j] = '\0'; - return 0; -err: - return -1; + return udev_encode_string(str, str_enc, len); } /* diff --git a/src/shared/utf8.c b/src/shared/utf8.c index 655cc771d4..1a68394a53 100644 --- a/src/shared/utf8.c +++ b/src/shared/utf8.c @@ -317,3 +317,154 @@ char *utf16_to_utf8(const void *s, size_t length) { return r; } + +/* count of characters used to encode one unicode char */ +static int utf8_encoded_expected_len(const char *str) { + unsigned char c = (unsigned char)str[0]; + + if (c < 0x80) + return 1; + if ((c & 0xe0) == 0xc0) + return 2; + if ((c & 0xf0) == 0xe0) + return 3; + if ((c & 0xf8) == 0xf0) + return 4; + if ((c & 0xfc) == 0xf8) + return 5; + if ((c & 0xfe) == 0xfc) + return 6; + return 0; +} + +/* decode one unicode char */ +static int utf8_encoded_to_unichar(const char *str) { + int unichar; + int len; + int i; + + len = utf8_encoded_expected_len(str); + switch (len) { + case 1: + return (int)str[0]; + case 2: + unichar = str[0] & 0x1f; + break; + case 3: + unichar = (int)str[0] & 0x0f; + break; + case 4: + unichar = (int)str[0] & 0x07; + break; + case 5: + unichar = (int)str[0] & 0x03; + break; + case 6: + unichar = (int)str[0] & 0x01; + break; + default: + return -1; + } + + for (i = 1; i < len; i++) { + if (((int)str[i] & 0xc0) != 0x80) + return -1; + unichar <<= 6; + unichar |= (int)str[i] & 0x3f; + } + + return unichar; +} + +/* expected size used to encode one unicode char */ +static int utf8_unichar_to_encoded_len(int unichar) { + if (unichar < 0x80) + return 1; + if (unichar < 0x800) + return 2; + if (unichar < 0x10000) + return 3; + if (unichar < 0x200000) + return 4; + if (unichar < 0x4000000) + return 5; + return 6; +} + +/* validate one encoded unicode char and return its length */ +int utf8_encoded_valid_unichar(const char *str) { + int len; + int unichar; + int i; + + len = utf8_encoded_expected_len(str); + if (len == 0) + return -1; + + /* ascii is valid */ + if (len == 1) + return 1; + + /* check if expected encoded chars are available */ + for (i = 0; i < len; i++) + if ((str[i] & 0x80) != 0x80) + return -1; + + unichar = utf8_encoded_to_unichar(str); + + /* check if encoded length matches encoded value */ + if (utf8_unichar_to_encoded_len(unichar) != len) + return -1; + + /* check if value has valid range */ + if (!is_unicode_valid(unichar)) + return -1; + + return len; +} + +int is_utf8_encoding_whitelisted(char c, const char *white) { + if ((c >= '0' && c <= '9') || + (c >= 'A' && c <= 'Z') || + (c >= 'a' && c <= 'z') || + strchr("#+-.:=@_", c) != NULL || + (white != NULL && strchr(white, c) != NULL)) + return 1; + return 0; +} + +int udev_encode_string(const char *str, char *str_enc, size_t len) { + size_t i, j; + + if (str == NULL || str_enc == NULL) + return -1; + + for (i = 0, j = 0; str[i] != '\0'; i++) { + int seqlen; + + seqlen = utf8_encoded_valid_unichar(&str[i]); + if (seqlen > 1) { + if (len-j < (size_t)seqlen) + goto err; + memcpy(&str_enc[j], &str[i], seqlen); + j += seqlen; + i += (seqlen-1); + } else if (str[i] == '\\' || !is_utf8_encoding_whitelisted(str[i], NULL)) { + if (len-j < 4) + goto err; + sprintf(&str_enc[j], "\\x%02x", (unsigned char) str[i]); + j += 4; + } else { + if (len-j < 1) + goto err; + str_enc[j] = str[i]; + j++; + } + } + if (len-j < 1) + goto err; + str_enc[j] = '\0'; + return 0; +err: + return -1; +} diff --git a/src/shared/utf8.h b/src/shared/utf8.h index f805ea6b59..7a5608c9ee 100644 --- a/src/shared/utf8.h +++ b/src/shared/utf8.h @@ -34,3 +34,7 @@ char *utf8_filter(const char *s); char *ascii_filter(const char *s); char *utf16_to_utf8(const void *s, size_t length); + +int utf8_encoded_valid_unichar(const char *str); +int is_utf8_encoding_whitelisted(char c, const char *white); +int udev_encode_string(const char *str, char *str_enc, size_t len); diff --git a/src/test/test-utf8.c b/src/test/test-utf8.c new file mode 100644 index 0000000000..d2b9771f4b --- /dev/null +++ b/src/test/test-utf8.c @@ -0,0 +1,59 @@ +/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/ + +/*** + This file is part of systemd. + + Copyright 2013 Dave Reisner + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see <http://www.gnu.org/licenses/>. +***/ + + +#include "utf8.h" +#include "util.h" + +/* helpers for test_udev_encode_string */ +static char *do_encode_string(const char *in) { + size_t out_len = strlen(in) * 4; + char *out = malloc(out_len); + + assert_se(out); + assert_se(udev_encode_string(in, out, out_len) >= 0); + puts(out); + + return out; +} + +static bool expect_encoded_as(const char *in, const char *expected) { + _cleanup_free_ char *encoded = do_encode_string(in); + return streq(encoded, expected); +} + +static void test_udev_encode_string(void) { + assert_se(expect_encoded_as("systemd sucks", "systemd\\x20sucks")); + assert_se(expect_encoded_as("pinkiepie", "pinkiepie")); + assert_se(expect_encoded_as("valíd\\ųtf8", "valíd\\x5cųtf8")); + assert_se(expect_encoded_as("s/ash/ng", "s\\x2fash\\x2fng")); +} + +static void test_utf8_is_valid(void) { + assert_se(utf8_is_valid("ascii is valid unicode")); + assert_se(utf8_is_valid("\341\204\242")); + assert_se(!utf8_is_valid("\341\204")); +} + +int main(int argc, char *argv[]) { + test_utf8_is_valid(); + test_udev_encode_string(); +} |