diff options
| -rw-r--r-- | src/libudev/Makefile.am | 6 | ||||
| -rw-r--r-- | src/libudev/libudev-util.c | 171 | ||||
| -rw-r--r-- | src/libudev/utf8.c | 233 | ||||
| -rw-r--r-- | src/libudev/utf8.h | 26 | ||||
| -rw-r--r-- | test/Makefile.am | 12 | ||||
| -rw-r--r-- | test/test-utf8.c | 59 | 
6 files changed, 336 insertions, 171 deletions
| diff --git a/src/libudev/Makefile.am b/src/libudev/Makefile.am index 879cc90653..568c4884b4 100644 --- a/src/libudev/Makefile.am +++ b/src/libudev/Makefile.am @@ -43,7 +43,8 @@ libudev_la_SOURCES =\  	strbuf.c \  	strv.c \  	strxcpyx.c  \ -	util.c +	util.c \ +	utf8.c  noinst_HEADERS = \  	libudev-hwdb-def.h \ @@ -64,7 +65,8 @@ noinst_HEADERS = \  	strbuf.h \  	strv.h \  	strxcpyx.h \ -	util.h +	util.h \ +	utf8.h  include_HEADERS = \  	libudev.h diff --git a/src/libudev/libudev-util.c b/src/libudev/libudev-util.c index ae5e285ec0..b4452f60d4 100644 --- a/src/libudev/libudev-util.c +++ b/src/libudev/libudev-util.c @@ -37,6 +37,7 @@  #include "libudev.h"  #include "libudev-private.h" +#include "utf8.h"  /**   * SECTION:libudev-util @@ -314,129 +315,6 @@ void util_remove_trailing_chars(char *path, char c)                  path[--len] = '\0';  } -/* count of characters used to encode one unicode char */ -static int utf8_encoded_expected_len(const char *str) -{ -        unsigned char c = (unsigned char)str[0]; - -        if (c < 0x80) -                return 1; -        if ((c & 0xe0) == 0xc0) -                return 2; -        if ((c & 0xf0) == 0xe0) -                return 3; -        if ((c & 0xf8) == 0xf0) -                return 4; -        if ((c & 0xfc) == 0xf8) -                return 5; -        if ((c & 0xfe) == 0xfc) -                return 6; -        return 0; -} - -/* decode one unicode char */ -static int utf8_encoded_to_unichar(const char *str) -{ -        int unichar; -        int len; -        int i; - -        len = utf8_encoded_expected_len(str); -        switch (len) { -        case 1: -                return (int)str[0]; -        case 2: -                unichar = str[0] & 0x1f; -                break; -        case 3: -                unichar = (int)str[0] & 0x0f; -                break; -        case 4: -                unichar = (int)str[0] & 0x07; -                break; -        case 5: -                unichar = (int)str[0] & 0x03; -                break; -        case 6: -                unichar = (int)str[0] & 0x01; -                break; -        default: -                return -1; -        } - -        for (i = 1; i < len; i++) { -                if (((int)str[i] & 0xc0) != 0x80) -                        return -1; -                unichar <<= 6; -                unichar |= (int)str[i] & 0x3f; -        } - -        return unichar; -} - -/* expected size used to encode one unicode char */ -static int utf8_unichar_to_encoded_len(int unichar) -{ -        if (unichar < 0x80) -                return 1; -        if (unichar < 0x800) -                return 2; -        if (unichar < 0x10000) -                return 3; -        if (unichar < 0x200000) -                return 4; -        if (unichar < 0x4000000) -                return 5; -        return 6; -} - -/* check if unicode char has a valid numeric range */ -static int utf8_unichar_valid_range(int unichar) -{ -        if (unichar > 0x10ffff) -                return 0; -        if ((unichar & 0xfffff800) == 0xd800) -                return 0; -        if ((unichar > 0xfdcf) && (unichar < 0xfdf0)) -                return 0; -        if ((unichar & 0xffff) == 0xffff) -                return 0; -        return 1; -} - -/* validate one encoded unicode char and return its length */ -static int utf8_encoded_valid_unichar(const char *str) -{ -        int len; -        int unichar; -        int i; - -        len = utf8_encoded_expected_len(str); -        if (len == 0) -                return -1; - -        /* ascii is valid */ -        if (len == 1) -                return 1; - -        /* check if expected encoded chars are available */ -        for (i = 0; i < len; i++) -                if ((str[i] & 0x80) != 0x80) -                        return -1; - -        unichar = utf8_encoded_to_unichar(str); - -        /* check if encoded length matches encoded value */ -        if (utf8_unichar_to_encoded_len(unichar) != len) -                return -1; - -        /* check if value has valid range */ -        if (!utf8_unichar_valid_range(unichar)) -                return -1; - -        return len; -} -  int util_replace_whitespace(const char *str, char *to, size_t len)  {          size_t i, j; @@ -465,17 +343,6 @@ int util_replace_whitespace(const char *str, char *to, size_t len)          return 0;  } -static int is_whitelisted(char c, const char *white) -{ -        if ((c >= '0' && c <= '9') || -            (c >= 'A' && c <= 'Z') || -            (c >= 'a' && c <= 'z') || -            strchr("#+-.:=@_", c) != NULL || -            (white != NULL && strchr(white, c) != NULL)) -                return 1; -        return 0; -} -  /* allow chars in whitelist, plain ascii, hex-escaping and valid utf8 */  int util_replace_chars(char *str, const char *white)  { @@ -485,7 +352,7 @@ int util_replace_chars(char *str, const char *white)          while (str[i] != '\0') {                  int len; -                if (is_whitelisted(str[i], white)) { +                if (is_utf8_encoding_whitelisted(str[i], white)) {                          i++;                          continue;                  } @@ -533,39 +400,7 @@ int util_replace_chars(char *str, const char *white)   **/  _public_ int udev_util_encode_string(const char *str, char *str_enc, size_t len)  { -        size_t i, j; - -        if (str == NULL || str_enc == NULL) -                return -1; - -        for (i = 0, j = 0; str[i] != '\0'; i++) { -                int seqlen; - -                seqlen = utf8_encoded_valid_unichar(&str[i]); -                if (seqlen > 1) { -                        if (len-j < (size_t)seqlen) -                                goto err; -                        memcpy(&str_enc[j], &str[i], seqlen); -                        j += seqlen; -                        i += (seqlen-1); -                } else if (str[i] == '\\' || !is_whitelisted(str[i], NULL)) { -                        if (len-j < 4) -                                goto err; -                        sprintf(&str_enc[j], "\\x%02x", (unsigned char) str[i]); -                        j += 4; -                } else { -                        if (len-j < 1) -                                goto err; -                        str_enc[j] = str[i]; -                        j++; -                } -        } -        if (len-j < 1) -                goto err; -        str_enc[j] = '\0'; -        return 0; -err: -        return -1; +        return udev_encode_string(str, str_enc, len);  }  /* diff --git a/src/libudev/utf8.c b/src/libudev/utf8.c new file mode 100644 index 0000000000..c9e84b804c --- /dev/null +++ b/src/libudev/utf8.c @@ -0,0 +1,233 @@ +/*** +  This file is part of udev, forked from systemd. + +  Copyright 2008-2011 Kay Sievers +  Copyright 2012 Lennart Poettering + +  systemd is free software; you can redistribute it and/or modify it +  under the terms of the GNU Lesser General Public License as published by +  the Free Software Foundation; either version 2.1 of the License, or +  (at your option) any later version. + +  systemd is distributed in the hope that it will be useful, but +  WITHOUT ANY WARRANTY; without even the implied warranty of +  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +  Lesser General Public License for more details. + +  You should have received a copy of the GNU Lesser General Public License +  along with systemd; If not, see <http://www.gnu.org/licenses/>. +***/ + +/* Parts of this file are based on the GLIB utf8 validation functions. The + * original license text follows. */ + +/* gutf8.c - Operations on UTF-8 strings. + * + * Copyright (C) 1999 Tom Tromey + * Copyright (C) 2000 Red Hat, Inc. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA + */ + +#include <errno.h> +#include <stdlib.h> +#include <inttypes.h> +#include <string.h> +#include <stdbool.h> + +#include "utf8.h" +#include "util.h" + +static inline bool is_unicode_valid(uint32_t ch) { + +        if (ch >= 0x110000) /* End of unicode space */ +                return false; +        if ((ch & 0xFFFFF800) == 0xD800) /* Reserved area for UTF-16 */ +                return false; +        if ((ch >= 0xFDD0) && (ch <= 0xFDEF)) /* Reserved */ +                return false; +        if ((ch & 0xFFFE) == 0xFFFE) /* BOM (Byte Order Mark) */ +                return false; + +        return true; +} +/* count of characters used to encode one unicode char */ +static int utf8_encoded_expected_len(const char *str) { +        unsigned char c = (unsigned char)str[0]; + +        if (c < 0x80) +                return 1; +        if ((c & 0xe0) == 0xc0) +                return 2; +        if ((c & 0xf0) == 0xe0) +                return 3; +        if ((c & 0xf8) == 0xf0) +                return 4; +        if ((c & 0xfc) == 0xf8) +                return 5; +        if ((c & 0xfe) == 0xfc) +                return 6; +        return 0; +} + +/* decode one unicode char */ +int utf8_encoded_to_unichar(const char *str) { +        int unichar; +        int len; +        int i; + +        len = utf8_encoded_expected_len(str); +        switch (len) { +        case 1: +                return (int)str[0]; +        case 2: +                unichar = str[0] & 0x1f; +                break; +        case 3: +                unichar = (int)str[0] & 0x0f; +                break; +        case 4: +                unichar = (int)str[0] & 0x07; +                break; +        case 5: +                unichar = (int)str[0] & 0x03; +                break; +        case 6: +                unichar = (int)str[0] & 0x01; +                break; +        default: +                return -1; +        } + +        for (i = 1; i < len; i++) { +                if (((int)str[i] & 0xc0) != 0x80) +                        return -1; +                unichar <<= 6; +                unichar |= (int)str[i] & 0x3f; +        } + +        return unichar; +} + +const char *utf8_is_valid(const char *str) { +        const uint8_t *p; + +        assert(str); + +        for (p = (const uint8_t*) str; *p; ) { +                int len; + +                len = utf8_encoded_valid_unichar((const char *)p); + +                if (len < 0) +                        return NULL; + +                p += len; +        } + +        return str; +} +/* expected size used to encode one unicode char */ +static int utf8_unichar_to_encoded_len(int unichar) { +        if (unichar < 0x80) +                return 1; +        if (unichar < 0x800) +                return 2; +        if (unichar < 0x10000) +                return 3; +        if (unichar < 0x200000) +                return 4; +        if (unichar < 0x4000000) +                return 5; +        return 6; +} + +/* validate one encoded unicode char and return its length */ +int utf8_encoded_valid_unichar(const char *str) { +        int len; +        int unichar; +        int i; + +        len = utf8_encoded_expected_len(str); +        if (len == 0) +                return -1; + +        /* ascii is valid */ +        if (len == 1) +                return 1; + +        /* check if expected encoded chars are available */ +        for (i = 0; i < len; i++) +                if ((str[i] & 0x80) != 0x80) +                        return -1; + +        unichar = utf8_encoded_to_unichar(str); + +        /* check if encoded length matches encoded value */ +        if (utf8_unichar_to_encoded_len(unichar) != len) +                return -1; + +        /* check if value has valid range */ +        if (!is_unicode_valid(unichar)) +                return -1; + +        return len; +} + +int is_utf8_encoding_whitelisted(char c, const char *white) { +        if ((c >= '0' && c <= '9') || +            (c >= 'A' && c <= 'Z') || +            (c >= 'a' && c <= 'z') || +            strchr("#+-.:=@_", c) != NULL || +            (white != NULL && strchr(white, c) != NULL)) +                return 1; +        return 0; +} + +int udev_encode_string(const char *str, char *str_enc, size_t len) { +        size_t i, j; + +        if (str == NULL || str_enc == NULL) +                return -1; + +        for (i = 0, j = 0; str[i] != '\0'; i++) { +                int seqlen; + +                seqlen = utf8_encoded_valid_unichar(&str[i]); +                if (seqlen > 1) { +                        if (len-j < (size_t)seqlen) +                                goto err; +                        memcpy(&str_enc[j], &str[i], seqlen); +                        j += seqlen; +                        i += (seqlen-1); +                } else if (str[i] == '\\' || !is_utf8_encoding_whitelisted(str[i], NULL)) { +                        if (len-j < 4) +                                goto err; +                        sprintf(&str_enc[j], "\\x%02x", (unsigned char) str[i]); +                        j += 4; +                } else { +                        if (len-j < 1) +                                goto err; +                        str_enc[j] = str[i]; +                        j++; +                } +        } +        if (len-j < 1) +                goto err; +        str_enc[j] = '\0'; +        return 0; +err: +        return -1; +} diff --git a/src/libudev/utf8.h b/src/libudev/utf8.h new file mode 100644 index 0000000000..380036da18 --- /dev/null +++ b/src/libudev/utf8.h @@ -0,0 +1,26 @@ +/*** +  This file is part of eudev, forked from systemd + +  Copyright 2012 Lennart Poettering + +  systemd is free software; you can redistribute it and/or modify it +  under the terms of the GNU Lesser General Public License as published by +  the Free Software Foundation; either version 2.1 of the License, or +  (at your option) any later version. + +  systemd is distributed in the hope that it will be useful, but +  WITHOUT ANY WARRANTY; without even the implied warranty of +  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +  Lesser General Public License for more details. + +  You should have received a copy of the GNU Lesser General Public License +  along with systemd; If not, see <http://www.gnu.org/licenses/>. +***/ + +#include <stdbool.h> + +#include "macro.h" + +int utf8_encoded_valid_unichar(const char *str); +int is_utf8_encoding_whitelisted(char c, const char *white); +int udev_encode_string(const char *str, char *str_enc, size_t len); diff --git a/test/Makefile.am b/test/Makefile.am index eac141f121..a9c28c9316 100644 --- a/test/Makefile.am +++ b/test/Makefile.am @@ -8,7 +8,8 @@ AM_CPPFLAGS = \  noinst_PROGRAMS = \  	test-libudev \ -	test-udev +	test-udev \ +	test_utf8  test_libudev_SOURCES = \  	test-libudev.c @@ -30,6 +31,15 @@ test_udev_LDADD = \  	$(BLKID_LIBS) \  	$(SELINUX_LIBS) +test_utf8_SOURCES = \ +	test-utf8.c + +test_utf8_CFLAGS = \ +	$(AM_CFLAGS) + +test_utf8_LDADD = \ +	$(top_builddir)/src/libudev/libudev-private.la +  if HAVE_LIBKMOD  test_udev_LDADD += $(KMOD_LIBS)  endif diff --git a/test/test-utf8.c b/test/test-utf8.c new file mode 100644 index 0000000000..d2b9771f4b --- /dev/null +++ b/test/test-utf8.c @@ -0,0 +1,59 @@ +/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/ + +/*** +  This file is part of systemd. + +  Copyright 2013 Dave Reisner + +  systemd is free software; you can redistribute it and/or modify it +  under the terms of the GNU Lesser General Public License as published by +  the Free Software Foundation; either version 2.1 of the License, or +  (at your option) any later version. + +  systemd is distributed in the hope that it will be useful, but +  WITHOUT ANY WARRANTY; without even the implied warranty of +  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +  Lesser General Public License for more details. + +  You should have received a copy of the GNU Lesser General Public License +  along with systemd; If not, see <http://www.gnu.org/licenses/>. +***/ + + +#include "utf8.h" +#include "util.h" + +/* helpers for test_udev_encode_string */ +static char *do_encode_string(const char *in) { +        size_t out_len = strlen(in) * 4; +        char *out = malloc(out_len); + +        assert_se(out); +        assert_se(udev_encode_string(in, out, out_len) >= 0); +        puts(out); + +        return out; +} + +static bool expect_encoded_as(const char *in, const char *expected) { +        _cleanup_free_ char *encoded = do_encode_string(in); +        return streq(encoded, expected); +} + +static void test_udev_encode_string(void) { +        assert_se(expect_encoded_as("systemd sucks", "systemd\\x20sucks")); +        assert_se(expect_encoded_as("pinkiepie", "pinkiepie")); +        assert_se(expect_encoded_as("valíd\\ųtf8", "valíd\\x5cųtf8")); +        assert_se(expect_encoded_as("s/ash/ng", "s\\x2fash\\x2fng")); +} + +static void test_utf8_is_valid(void) { +        assert_se(utf8_is_valid("ascii is valid unicode")); +        assert_se(utf8_is_valid("\341\204\242")); +        assert_se(!utf8_is_valid("\341\204")); +} + +int main(int argc, char *argv[]) { +        test_utf8_is_valid(); +        test_udev_encode_string(); +} | 
