move utf8 functions from libudev-private.h to utf8.h

There's now some more obvious overlap amongst the two utf8 validation functions, but no more than there already was previously. This also adds some menial tests for anyone who wants to do more merging of these two in the future. Adopted for eudev: Anthony G. Basile <blueness@gentoo.org> Signed-off-by: Anthony G. Basile <blueness@gentoo.org>
author: Dave Reisner <dreisner@archlinux.org> 2014-01-09 12:20:25 -0500
committer: Anthony G. Basile <blueness@gentoo.org> 2014-01-09 12:21:20 -0500
commit: 7ed87c74dfb81761cbcefc10cd4f79394a1d36a3 (patch)
tree: d42454fd7146d18041770ace67e7d623bdc77830 /src/libudev
parent: 0a6fb50746ab00d650145cf17fc1a15f109bc386 (diff)
4 files changed, 266 insertions, 170 deletions
diff --git a/src/libudev/Makefile.am b/src/libudev/Makefile.am
index 879cc90653..568c4884b4 100644
--- a/src/libudev/Makefile.am
+++ b/src/libudev/Makefile.am
@@ -43,7 +43,8 @@ libudev_la_SOURCES =\
 	strbuf.c \
 	strv.c \
 	strxcpyx.c  \
-	util.c
+	util.c \
+	utf8.c
 
 noinst_HEADERS = \
 	libudev-hwdb-def.h \
@@ -64,7 +65,8 @@ noinst_HEADERS = \
 	strbuf.h \
 	strv.h \
 	strxcpyx.h \
-	util.h
+	util.h \
+	utf8.h
 
 include_HEADERS = \
 	libudev.h
diff --git a/src/libudev/libudev-util.c b/src/libudev/libudev-util.c
index ae5e285ec0..b4452f60d4 100644
--- a/src/libudev/libudev-util.c
+++ b/src/libudev/libudev-util.c
@@ -37,6 +37,7 @@
 
 #include "libudev.h"
 #include "libudev-private.h"
+#include "utf8.h"
 
 /**
  * SECTION:libudev-util
@@ -314,129 +315,6 @@ void util_remove_trailing_chars(char *path, char c)
                 path[--len] = '\0';
 }
 
-/* count of characters used to encode one unicode char */
-static int utf8_encoded_expected_len(const char *str)
-{
-        unsigned char c = (unsigned char)str[0];
-
-        if (c < 0x80)
-                return 1;
-        if ((c & 0xe0) == 0xc0)
-                return 2;
-        if ((c & 0xf0) == 0xe0)
-                return 3;
-        if ((c & 0xf8) == 0xf0)
-                return 4;
-        if ((c & 0xfc) == 0xf8)
-                return 5;
-        if ((c & 0xfe) == 0xfc)
-                return 6;
-        return 0;
-}
-
-/* decode one unicode char */
-static int utf8_encoded_to_unichar(const char *str)
-{
-        int unichar;
-        int len;
-        int i;
-
-        len = utf8_encoded_expected_len(str);
-        switch (len) {
-        case 1:
-                return (int)str[0];
-        case 2:
-                unichar = str[0] & 0x1f;
-                break;
-        case 3:
-                unichar = (int)str[0] & 0x0f;
-                break;
-        case 4:
-                unichar = (int)str[0] & 0x07;
-                break;
-        case 5:
-                unichar = (int)str[0] & 0x03;
-                break;
-        case 6:
-                unichar = (int)str[0] & 0x01;
-                break;
-        default:
-                return -1;
-        }
-
-        for (i = 1; i < len; i++) {
-                if (((int)str[i] & 0xc0) != 0x80)
-                        return -1;
-                unichar <<= 6;
-                unichar |= (int)str[i] & 0x3f;
-        }
-
-        return unichar;
-}
-
-/* expected size used to encode one unicode char */
-static int utf8_unichar_to_encoded_len(int unichar)
-{
-        if (unichar < 0x80)
-                return 1;
-        if (unichar < 0x800)
-                return 2;
-        if (unichar < 0x10000)
-                return 3;
-        if (unichar < 0x200000)
-                return 4;
-        if (unichar < 0x4000000)
-                return 5;
-        return 6;
-}
-
-/* check if unicode char has a valid numeric range */
-static int utf8_unichar_valid_range(int unichar)
-{
-        if (unichar > 0x10ffff)
-                return 0;
-        if ((unichar & 0xfffff800) == 0xd800)
-                return 0;
-        if ((unichar > 0xfdcf) && (unichar < 0xfdf0))
-                return 0;
-        if ((unichar & 0xffff) == 0xffff)
-                return 0;
-        return 1;
-}
-
-/* validate one encoded unicode char and return its length */
-static int utf8_encoded_valid_unichar(const char *str)
-{
-        int len;
-        int unichar;
-        int i;
-
-        len = utf8_encoded_expected_len(str);
-        if (len == 0)
-                return -1;
-
-        /* ascii is valid */
-        if (len == 1)
-                return 1;
-
-        /* check if expected encoded chars are available */
-        for (i = 0; i < len; i++)
-                if ((str[i] & 0x80) != 0x80)
-                        return -1;
-
-        unichar = utf8_encoded_to_unichar(str);
-
-        /* check if encoded length matches encoded value */
-        if (utf8_unichar_to_encoded_len(unichar) != len)
-                return -1;
-
-        /* check if value has valid range */
-        if (!utf8_unichar_valid_range(unichar))
-                return -1;
-
-        return len;
-}
-
 int util_replace_whitespace(const char *str, char *to, size_t len)
 {
         size_t i, j;
@@ -465,17 +343,6 @@ int util_replace_whitespace(const char *str, char *to, size_t len)
         return 0;
 }
 
-static int is_whitelisted(char c, const char *white)
-{
-        if ((c >= '0' && c <= '9') ||
-            (c >= 'A' && c <= 'Z') ||
-            (c >= 'a' && c <= 'z') ||
-            strchr("#+-.:=@_", c) != NULL ||
-            (white != NULL && strchr(white, c) != NULL))
-                return 1;
-        return 0;
-}
-
 /* allow chars in whitelist, plain ascii, hex-escaping and valid utf8 */
 int util_replace_chars(char *str, const char *white)
 {
@@ -485,7 +352,7 @@ int util_replace_chars(char *str, const char *white)
         while (str[i] != '\0') {
                 int len;
 
-                if (is_whitelisted(str[i], white)) {
+                if (is_utf8_encoding_whitelisted(str[i], white)) {
                         i++;
                         continue;
                 }
@@ -533,39 +400,7 @@ int util_replace_chars(char *str, const char *white)
  **/
 _public_ int udev_util_encode_string(const char *str, char *str_enc, size_t len)
 {
-        size_t i, j;
-
-        if (str == NULL || str_enc == NULL)
-                return -1;
-
-        for (i = 0, j = 0; str[i] != '\0'; i++) {
-                int seqlen;
-
-                seqlen = utf8_encoded_valid_unichar(&str[i]);
-                if (seqlen > 1) {
-                        if (len-j < (size_t)seqlen)
-                                goto err;
-                        memcpy(&str_enc[j], &str[i], seqlen);
-                        j += seqlen;
-                        i += (seqlen-1);
-                } else if (str[i] == '\\' || !is_whitelisted(str[i], NULL)) {
-                        if (len-j < 4)
-                                goto err;
-                        sprintf(&str_enc[j], "\\x%02x", (unsigned char) str[i]);
-                        j += 4;
-                } else {
-                        if (len-j < 1)
-                                goto err;
-                        str_enc[j] = str[i];
-                        j++;
-                }
-        }
-        if (len-j < 1)
-                goto err;
-        str_enc[j] = '\0';
-        return 0;
-err:
-        return -1;
+        return udev_encode_string(str, str_enc, len);
 }
 
 /*
diff --git a/src/libudev/utf8.c b/src/libudev/utf8.c
new file mode 100644
index 0000000000..c9e84b804c
--- /dev/null
+++ b/src/libudev/utf8.c
@@ -0,0 +1,233 @@
+/***
+  This file is part of udev, forked from systemd.
+
+  Copyright 2008-2011 Kay Sievers
+  Copyright 2012 Lennart Poettering
+
+  systemd is free software; you can redistribute it and/or modify it
+  under the terms of the GNU Lesser General Public License as published by
+  the Free Software Foundation; either version 2.1 of the License, or
+  (at your option) any later version.
+
+  systemd is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public License
+  along with systemd; If not, see <http://www.gnu.org/licenses/>.
+***/
+
+/* Parts of this file are based on the GLIB utf8 validation functions. The
+ * original license text follows. */
+
+/* gutf8.c - Operations on UTF-8 strings.
+ *
+ * Copyright (C) 1999 Tom Tromey
+ * Copyright (C) 2000 Red Hat, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#include <errno.h>
+#include <stdlib.h>
+#include <inttypes.h>
+#include <string.h>
+#include <stdbool.h>
+
+#include "utf8.h"
+#include "util.h"
+
+static inline bool is_unicode_valid(uint32_t ch) {
+
+        if (ch >= 0x110000) /* End of unicode space */
+                return false;
+        if ((ch & 0xFFFFF800) == 0xD800) /* Reserved area for UTF-16 */
+                return false;
+        if ((ch >= 0xFDD0) && (ch <= 0xFDEF)) /* Reserved */
+                return false;
+        if ((ch & 0xFFFE) == 0xFFFE) /* BOM (Byte Order Mark) */
+                return false;
+
+        return true;
+}
+/* count of characters used to encode one unicode char */
+static int utf8_encoded_expected_len(const char *str) {
+        unsigned char c = (unsigned char)str[0];
+
+        if (c < 0x80)
+                return 1;
+        if ((c & 0xe0) == 0xc0)
+                return 2;
+        if ((c & 0xf0) == 0xe0)
+                return 3;
+        if ((c & 0xf8) == 0xf0)
+                return 4;
+        if ((c & 0xfc) == 0xf8)
+                return 5;
+        if ((c & 0xfe) == 0xfc)
+                return 6;
+        return 0;
+}
+
+/* decode one unicode char */
+int utf8_encoded_to_unichar(const char *str) {
+        int unichar;
+        int len;
+        int i;
+
+        len = utf8_encoded_expected_len(str);
+        switch (len) {
+        case 1:
+                return (int)str[0];
+        case 2:
+                unichar = str[0] & 0x1f;
+                break;
+        case 3:
+                unichar = (int)str[0] & 0x0f;
+                break;
+        case 4:
+                unichar = (int)str[0] & 0x07;
+                break;
+        case 5:
+                unichar = (int)str[0] & 0x03;
+                break;
+        case 6:
+                unichar = (int)str[0] & 0x01;
+                break;
+        default:
+                return -1;
+        }
+
+        for (i = 1; i < len; i++) {
+                if (((int)str[i] & 0xc0) != 0x80)
+                        return -1;
+                unichar <<= 6;
+                unichar |= (int)str[i] & 0x3f;
+        }
+
+        return unichar;
+}
+
+const char *utf8_is_valid(const char *str) {
+        const uint8_t *p;
+
+        assert(str);
+
+        for (p = (const uint8_t*) str; *p; ) {
+                int len;
+
+                len = utf8_encoded_valid_unichar((const char *)p);
+
+                if (len < 0)
+                        return NULL;
+
+                p += len;
+        }
+
+        return str;
+}
+/* expected size used to encode one unicode char */
+static int utf8_unichar_to_encoded_len(int unichar) {
+        if (unichar < 0x80)
+                return 1;
+        if (unichar < 0x800)
+                return 2;
+        if (unichar < 0x10000)
+                return 3;
+        if (unichar < 0x200000)
+                return 4;
+        if (unichar < 0x4000000)
+                return 5;
+        return 6;
+}
+
+/* validate one encoded unicode char and return its length */
+int utf8_encoded_valid_unichar(const char *str) {
+        int len;
+        int unichar;
+        int i;
+
+        len = utf8_encoded_expected_len(str);
+        if (len == 0)
+                return -1;
+
+        /* ascii is valid */
+        if (len == 1)
+                return 1;
+
+        /* check if expected encoded chars are available */
+        for (i = 0; i < len; i++)
+                if ((str[i] & 0x80) != 0x80)
+                        return -1;
+
+        unichar = utf8_encoded_to_unichar(str);
+
+        /* check if encoded length matches encoded value */
+        if (utf8_unichar_to_encoded_len(unichar) != len)
+                return -1;
+
+        /* check if value has valid range */
+        if (!is_unicode_valid(unichar))
+                return -1;
+
+        return len;
+}
+
+int is_utf8_encoding_whitelisted(char c, const char *white) {
+        if ((c >= '0' && c <= '9') ||
+            (c >= 'A' && c <= 'Z') ||
+            (c >= 'a' && c <= 'z') ||
+            strchr("#+-.:=@_", c) != NULL ||
+            (white != NULL && strchr(white, c) != NULL))
+                return 1;
+        return 0;
+}
+
+int udev_encode_string(const char *str, char *str_enc, size_t len) {
+        size_t i, j;
+
+        if (str == NULL || str_enc == NULL)
+                return -1;
+
+        for (i = 0, j = 0; str[i] != '\0'; i++) {
+                int seqlen;
+
+                seqlen = utf8_encoded_valid_unichar(&str[i]);
+                if (seqlen > 1) {
+                        if (len-j < (size_t)seqlen)
+                                goto err;
+                        memcpy(&str_enc[j], &str[i], seqlen);
+                        j += seqlen;
+                        i += (seqlen-1);
+                } else if (str[i] == '\\' || !is_utf8_encoding_whitelisted(str[i], NULL)) {
+                        if (len-j < 4)
+                                goto err;
+                        sprintf(&str_enc[j], "\\x%02x", (unsigned char) str[i]);
+                        j += 4;
+                } else {
+                        if (len-j < 1)
+                                goto err;
+                        str_enc[j] = str[i];
+                        j++;
+                }
+        }
+        if (len-j < 1)
+                goto err;
+        str_enc[j] = '\0';
+        return 0;
+err:
+        return -1;
+}
diff --git a/src/libudev/utf8.h b/src/libudev/utf8.h
new file mode 100644
index 0000000000..380036da18
--- /dev/null
+++ b/src/libudev/utf8.h
@@ -0,0 +1,26 @@
+/***
+  This file is part of eudev, forked from systemd
+
+  Copyright 2012 Lennart Poettering
+
+  systemd is free software; you can redistribute it and/or modify it
+  under the terms of the GNU Lesser General Public License as published by
+  the Free Software Foundation; either version 2.1 of the License, or
+  (at your option) any later version.
+
+  systemd is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public License
+  along with systemd; If not, see <http://www.gnu.org/licenses/>.
+***/
+
+#include <stdbool.h>
+
+#include "macro.h"
+
+int utf8_encoded_valid_unichar(const char *str);
+int is_utf8_encoding_whitelisted(char c, const char *white);
+int udev_encode_string(const char *str, char *str_enc, size_t len);
author	Dave Reisner <dreisner@archlinux.org>	2014-01-09 12:20:25 -0500
committer	Anthony G. Basile <blueness@gentoo.org>	2014-01-09 12:21:20 -0500
commit	7ed87c74dfb81761cbcefc10cd4f79394a1d36a3 (patch)
tree	d42454fd7146d18041770ace67e7d623bdc77830 /src/libudev
parent	0a6fb50746ab00d650145cf17fc1a15f109bc386 (diff)