summaryrefslogtreecommitdiff
path: root/src/shared/utf8.c
diff options
context:
space:
mode:
authorDave Reisner <dreisner@archlinux.org>2013-09-17 15:39:09 -0400
committerDave Reisner <dreisner@archlinux.org>2013-09-17 16:31:32 -0400
commit02a36bc9a1769c744f141f127898dbe076dbfd96 (patch)
treed6ef940458ee7a79e8a03ddba80735fd76512a1d /src/shared/utf8.c
parentbf24e638afb32ebc49b063c007888410fb9d19bd (diff)
move utf8 functions from libudev-private.h to utf8.h
There's now some more obvious overlap amongst the two utf8 validation functions, but no more than there already was previously. This also adds some menial tests for anyone who wants to do more merging of these two in the future.
Diffstat (limited to 'src/shared/utf8.c')
-rw-r--r--src/shared/utf8.c151
1 files changed, 151 insertions, 0 deletions
diff --git a/src/shared/utf8.c b/src/shared/utf8.c
index 655cc771d4..1a68394a53 100644
--- a/src/shared/utf8.c
+++ b/src/shared/utf8.c
@@ -317,3 +317,154 @@ char *utf16_to_utf8(const void *s, size_t length) {
return r;
}
+
+/* count of characters used to encode one unicode char */
+static int utf8_encoded_expected_len(const char *str) {
+ unsigned char c = (unsigned char)str[0];
+
+ if (c < 0x80)
+ return 1;
+ if ((c & 0xe0) == 0xc0)
+ return 2;
+ if ((c & 0xf0) == 0xe0)
+ return 3;
+ if ((c & 0xf8) == 0xf0)
+ return 4;
+ if ((c & 0xfc) == 0xf8)
+ return 5;
+ if ((c & 0xfe) == 0xfc)
+ return 6;
+ return 0;
+}
+
+/* decode one unicode char */
+static int utf8_encoded_to_unichar(const char *str) {
+ int unichar;
+ int len;
+ int i;
+
+ len = utf8_encoded_expected_len(str);
+ switch (len) {
+ case 1:
+ return (int)str[0];
+ case 2:
+ unichar = str[0] & 0x1f;
+ break;
+ case 3:
+ unichar = (int)str[0] & 0x0f;
+ break;
+ case 4:
+ unichar = (int)str[0] & 0x07;
+ break;
+ case 5:
+ unichar = (int)str[0] & 0x03;
+ break;
+ case 6:
+ unichar = (int)str[0] & 0x01;
+ break;
+ default:
+ return -1;
+ }
+
+ for (i = 1; i < len; i++) {
+ if (((int)str[i] & 0xc0) != 0x80)
+ return -1;
+ unichar <<= 6;
+ unichar |= (int)str[i] & 0x3f;
+ }
+
+ return unichar;
+}
+
+/* expected size used to encode one unicode char */
+static int utf8_unichar_to_encoded_len(int unichar) {
+ if (unichar < 0x80)
+ return 1;
+ if (unichar < 0x800)
+ return 2;
+ if (unichar < 0x10000)
+ return 3;
+ if (unichar < 0x200000)
+ return 4;
+ if (unichar < 0x4000000)
+ return 5;
+ return 6;
+}
+
+/* validate one encoded unicode char and return its length */
+int utf8_encoded_valid_unichar(const char *str) {
+ int len;
+ int unichar;
+ int i;
+
+ len = utf8_encoded_expected_len(str);
+ if (len == 0)
+ return -1;
+
+ /* ascii is valid */
+ if (len == 1)
+ return 1;
+
+ /* check if expected encoded chars are available */
+ for (i = 0; i < len; i++)
+ if ((str[i] & 0x80) != 0x80)
+ return -1;
+
+ unichar = utf8_encoded_to_unichar(str);
+
+ /* check if encoded length matches encoded value */
+ if (utf8_unichar_to_encoded_len(unichar) != len)
+ return -1;
+
+ /* check if value has valid range */
+ if (!is_unicode_valid(unichar))
+ return -1;
+
+ return len;
+}
+
+int is_utf8_encoding_whitelisted(char c, const char *white) {
+ if ((c >= '0' && c <= '9') ||
+ (c >= 'A' && c <= 'Z') ||
+ (c >= 'a' && c <= 'z') ||
+ strchr("#+-.:=@_", c) != NULL ||
+ (white != NULL && strchr(white, c) != NULL))
+ return 1;
+ return 0;
+}
+
+int udev_encode_string(const char *str, char *str_enc, size_t len) {
+ size_t i, j;
+
+ if (str == NULL || str_enc == NULL)
+ return -1;
+
+ for (i = 0, j = 0; str[i] != '\0'; i++) {
+ int seqlen;
+
+ seqlen = utf8_encoded_valid_unichar(&str[i]);
+ if (seqlen > 1) {
+ if (len-j < (size_t)seqlen)
+ goto err;
+ memcpy(&str_enc[j], &str[i], seqlen);
+ j += seqlen;
+ i += (seqlen-1);
+ } else if (str[i] == '\\' || !is_utf8_encoding_whitelisted(str[i], NULL)) {
+ if (len-j < 4)
+ goto err;
+ sprintf(&str_enc[j], "\\x%02x", (unsigned char) str[i]);
+ j += 4;
+ } else {
+ if (len-j < 1)
+ goto err;
+ str_enc[j] = str[i];
+ j++;
+ }
+ }
+ if (len-j < 1)
+ goto err;
+ str_enc[j] = '\0';
+ return 0;
+err:
+ return -1;
+}