summary | refs | log | tree | commit | diff
path: root/src/shared/utf8.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/shared/utf8.c')
-rw-r--r-- src/shared/utf8.c 71
1 files changed, 71 insertions, 0 deletions
diff --git a/src/shared/utf8.c b/src/shared/utf8.c
index 13f0521e8c..a6f5b3f9e5 100644
--- a/src/shared/utf8.c
+++ b/src/shared/utf8.c
@@ -78,6 +78,77 @@ static inline void merge_continuation_char(uint32_t *u_ch, uint8_t ch) {
*u_ch |= ch & 0x3f;
}
+/* Reports whether ch is a control character: anything in the C0 range
+ * except TAB, DEL itself, or anything in the C1 range. */
+static bool is_unicode_control(uint32_t ch) {
+
+        /* TAB lives in the C0 range, but it is more or less harmless and
+         * commonly used, so it is never reported as a control character. */
+        if (ch == '\t')
+                return false;
+
+        /* C0 range: everything from 0 up to ' ' - 1. */
+        if (ch < ' ')
+                return true;
+
+        /* DEL is 0x7F; the C1 range follows directly, DEL + 1 up to 0x9F. */
+        return ch >= 0x7F && ch <= 0x9F;
+}
+
+/*
+ * Checks that the first `length` bytes of `str` are well-formed UTF-8 and
+ * contain no control characters (see is_unicode_control(); TAB is allowed,
+ * while NUL and newline are control characters and cause a rejection).
+ *
+ * Returns `str` (with the const qualifier cast away) when every decoded
+ * code point passes, and NULL on the first malformed, truncated or overlong
+ * sequence or the first control character. An empty buffer (length == 0)
+ * is accepted. Bytes at or beyond `str + length` are never read.
+ *
+ * NOTE(review): surrogates and values above 0x10FFFF are not rejected
+ * here - confirm whether callers rely on that.
+ */
+char* utf8_is_printable_n(const char* str, size_t length) {
+        const uint8_t *p;
+
+        assert(str);
+
+        /* Invariant at the top of each iteration: `length` counts the bytes
+         * still available starting at (and including) *p. */
+        for (p = (const uint8_t*) str; length; p++, length--) {
+                uint32_t val;           /* code point being decoded */
+                uint32_t min;           /* smallest code point that needs this many bytes */
+                unsigned remaining;     /* continuation bytes still expected */
+
+                if (*p < 128) {
+                        /* Plain ASCII, single byte. */
+                        val = *p;
+                        min = 0;
+                        remaining = 0;
+                } else if ((*p & 0xe0) == 0xc0) { /* 110xxxxx two-char seq. */
+                        /* Keep all five payload bits; the overlong leads
+                         * 0xC0/0xC1 are caught by the `val < min` check. */
+                        val = (uint32_t) (*p & 0x1f);
+                        min = 128;
+                        remaining = 1;
+                } else if ((*p & 0xf0) == 0xe0) { /* 1110xxxx three-char seq. */
+                        val = (uint32_t) (*p & 0x0f);
+                        min = (1 << 11);
+                        remaining = 2;
+                } else if ((*p & 0xf8) == 0xf0) { /* 11110xxx four-char seq. */
+                        val = (uint32_t) (*p & 0x07);
+                        min = (1 << 16);
+                        remaining = 3;
+                } else
+                        /* Stray continuation byte or invalid lead byte. */
+                        return NULL;
+
+                for (; remaining > 0; remaining--) {
+                        p++;
+                        length--;
+
+                        /* Check the remaining length before dereferencing:
+                         * a sequence cut off by the end of the buffer is an
+                         * error and must not cause a read past the end. */
+                        if (!length || !is_continuation_char(*p))
+                                return NULL;
+
+                        merge_continuation_char(&val, *p);
+                }
+
+                /* Reject overlong encodings. */
+                if (val < min)
+                        return NULL;
+
+                if (is_unicode_control(val))
+                        return NULL;
+        }
+
+        return (char*) str;
+}
+
static char* utf8_validate(const char *str, char *output) {
uint32_t val = 0;
uint32_t min = 0;