util, utf8: make ellipsize take multi-byte characters into account

rename old versions to ascii_* Do not take into account zerowidth characters, but do consider double-wide characters. Import needed utf8 helper code from glib. v3: rebase ontop of utf8 restructuring work [zj: tweak the algorithm a bit, move new code to separate file]
author: Shawn Landden <shawn@churchofgit.com> 2013-09-20 18:37:33 -0700
committer: Zbigniew Jędrzejewski-Szmek <zbyszek@in.waw.pl> 2013-10-13 17:56:54 -0400
commit: f405e86de361ec305dc2b8634efeaa23dc144053 (patch)
tree: b3fe37e4a143a5d2dee14eb34d96f96cb03b84fc /src/shared/gunicode.c
parent: 14a9283eb38a93ec384c322ccbe06352c86a25f8 (diff)
1 files changed, 108 insertions, 0 deletions
diff --git a/src/shared/gunicode.c b/src/shared/gunicode.c
new file mode 100644
index 0000000000..f2d4608340
--- /dev/null
+++ b/src/shared/gunicode.c
@@ -0,0 +1,108 @@
+/* gunicode.c - Unicode manipulation functions
+ *
+ *  Copyright (C) 1999, 2000 Tom Tromey
+ *  Copyright 2000, 2005 Red Hat, Inc.
+ */
+
+#include "gunicode.h"
+
+#define unichar uint32_t
+
+/**
+ * g_utf8_prev_char:
+ * @p: a pointer to a position within a UTF-8 encoded string
+ *
+ * Finds the previous UTF-8 character in the string before @p.
+ *
+ * @p does not have to be at the beginning of a UTF-8 character. No check
+ * is made to see if the character found is actually valid other than
+ * it starts with an appropriate byte. If @p might be the first
+ * character of the string, you must use g_utf8_find_prev_char() instead.
+ *
+ * Return value: a pointer to the found character.
+ **/
+char *
+utf8_prev_char (const char *p)
+{
+  while (1)
+    {
+      p--;
+      if ((*p & 0xc0) != 0x80)
+        return (char *)p;
+    }
+}
+
+struct Interval
+{
+  unichar start, end;
+};
+
+static int
+interval_compare (const void *key, const void *elt)
+{
+  unichar c = (unichar) (long) (key);
+  struct Interval *interval = (struct Interval *)elt;
+
+  if (c < interval->start)
+    return -1;
+  if (c > interval->end)
+    return +1;
+
+  return 0;
+}
+
+/*
+ * NOTE:
+ *
+ * The tables for g_unichar_iswide() and g_unichar_iswide_cjk() are
+ * generated from the Unicode Character Database's file
+ * extracted/DerivedEastAsianWidth.txt using the gen-iswide-table.py
+ * in this way:
+ *
+ *   ./gen-iswide-table.py < path/to/ucd/extracted/DerivedEastAsianWidth.txt | fmt
+ *
+ * Last update for Unicode 6.0.
+ */
+
+/**
+ * g_unichar_iswide:
+ * @c: a Unicode character
+ *
+ * Determines if a character is typically rendered in a double-width
+ * cell.
+ *
+ * Return value: %TRUE if the character is wide
+ **/
+bool
+unichar_iswide (unichar c)
+{
+  /* See NOTE earlier for how to update this table. */
+  static const struct Interval wide[] = {
+    {0x1100, 0x115F}, {0x2329, 0x232A}, {0x2E80, 0x2E99}, {0x2E9B, 0x2EF3},
+    {0x2F00, 0x2FD5}, {0x2FF0, 0x2FFB}, {0x3000, 0x303E}, {0x3041, 0x3096},
+    {0x3099, 0x30FF}, {0x3105, 0x312D}, {0x3131, 0x318E}, {0x3190, 0x31BA},
+    {0x31C0, 0x31E3}, {0x31F0, 0x321E}, {0x3220, 0x3247}, {0x3250, 0x32FE},
+    {0x3300, 0x4DBF}, {0x4E00, 0xA48C}, {0xA490, 0xA4C6}, {0xA960, 0xA97C},
+    {0xAC00, 0xD7A3}, {0xF900, 0xFAFF}, {0xFE10, 0xFE19}, {0xFE30, 0xFE52},
+    {0xFE54, 0xFE66}, {0xFE68, 0xFE6B}, {0xFF01, 0xFF60}, {0xFFE0, 0xFFE6},
+    {0x1B000, 0x1B001}, {0x1F200, 0x1F202}, {0x1F210, 0x1F23A}, {0x1F240,
+    0x1F248}, {0x1F250, 0x1F251}, {0x20000, 0x2FFFD}, {0x30000, 0x3FFFD}
+  };
+
+  if (bsearch ((void *)(uintptr_t)c, wide, (sizeof (wide) / sizeof ((wide)[0])), sizeof wide[0],
+               interval_compare))
+    return true;
+
+  return false;
+}
+
+const char utf8_skip_data[256] = {
+  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
+};
author	Shawn Landden <shawn@churchofgit.com>	2013-09-20 18:37:33 -0700
committer	Zbigniew Jędrzejewski-Szmek <zbyszek@in.waw.pl>	2013-10-13 17:56:54 -0400
commit	f405e86de361ec305dc2b8634efeaa23dc144053 (patch)
tree	b3fe37e4a143a5d2dee14eb34d96f96cb03b84fc /src/shared/gunicode.c
parent	14a9283eb38a93ec384c322ccbe06352c86a25f8 (diff)