From b19d2d4b3a4176869c6ba164f7a67ed3b11146ca Mon Sep 17 00:00:00 2001
From: Ben Pfaff <blp@cs.stanford.edu>
Date: Thu, 18 Nov 2010 21:25:39 -0800
Subject: [PATCH] str: Add some functions for handling UTF-8.

---
 Smake             |  2 ++
 src/libpspp/str.c | 76 +++++++++++++++++++++++++++++++++++++++++++++++
 src/libpspp/str.h |  8 +++++
 3 files changed, 86 insertions(+)

diff --git a/Smake b/Smake
index 7855ce1e98..6e6b70235a 100644
--- a/Smake
+++ b/Smake
@@ -71,8 +71,10 @@ GNULIB_MODULES = \
 	unilbrk/ulc-width-linebreaks \
 	unistd \
 	unistr/u8-cpy \
+	unistr/u8-mbtouc \
 	unistr/u8-strlen \
 	unistr/u8-strncat \
+	unitypes \
 	unlocked-io \
 	vasprintf-posix \
 	version-etc \
diff --git a/src/libpspp/str.c b/src/libpspp/str.c
index cd2363a002..7b677225b9 100644
--- a/src/libpspp/str.c
+++ b/src/libpspp/str.c
@@ -22,6 +22,7 @@
 #include <errno.h>
 #include <stdint.h>
 #include <stdlib.h>
+#include <unistr.h>
 
 #include "libpspp/cast.h"
 #include "libpspp/message.h"
@@ -740,6 +741,81 @@ ss_xstrdup (struct substring ss)
   s[ss.length] = '\0';
   return s;
 }
+/* UTF-8. */
+
+/* Decodes the UTF-8 sequence at the front of S and returns its Unicode code
+   point (0 to 0x10ffff), or UINT32_MAX if S is empty. */
+ucs4_t
+ss_first_mb (struct substring s)
+{
+  const size_t start = 0;
+  return ss_at_mb (s, start);
+}
+
+/* Returns the length in bytes of the UTF-8 character at the front of S: a
+   value from 1 to 4, or 0 if S is empty. */
+int
+ss_first_mblen (struct substring s)
+{
+  const size_t start = 0;
+  return ss_at_mblen (s, start);
+}
+
+/* Removes the UTF-8 character at the front of S by advancing S past it, and
+   returns the code point removed (in the range 0 to 0x10ffff).  If S is
+   empty, leaves it unchanged and returns UINT32_MAX. */
+ucs4_t
+ss_get_mb (struct substring *s)
+{
+  ucs4_t code_point;
+  int n_bytes;
+
+  if (s->length == 0)
+    return UINT32_MAX;
+
+  n_bytes = u8_mbtouc (&code_point, CHAR_CAST (const uint8_t *, s->string),
+                       s->length);
+  /* Step S over exactly the bytes that were decoded. */
+  s->string += n_bytes;
+  s->length -= n_bytes;
+  return code_point;
+}
+
+/* Decodes the UTF-8 sequence that begins OFS bytes into S and returns its
+   code point, a value in the range 0 to 0x10ffff.  If OFS is at or beyond
+   the end of S, returns UINT32_MAX instead.
+
+   (An OFS that lands in the middle of a UTF-8 sequence, rather than at its
+   start, yields 0xfffd.)  */
+ucs4_t
+ss_at_mb (struct substring s, size_t ofs)
+{
+  const uint8_t *start;
+  ucs4_t code_point;
+
+  if (ofs >= s.length)
+    return UINT32_MAX;
+
+  start = CHAR_CAST (const uint8_t *, s.string + ofs);
+  u8_mbtouc (&code_point, start, s.length - ofs);
+  return code_point;
+}
+
+/* Returns the length in bytes of the UTF-8 sequence that begins OFS bytes
+   into S: a value from 1 to 4, or 0 if OFS is at or beyond the end of S. */
+int
+ss_at_mblen (struct substring s, size_t ofs)
+{
+  const uint8_t *start;
+  ucs4_t unused;
+
+  if (ofs >= s.length)
+    return 0;
+
+  start = CHAR_CAST (const uint8_t *, s.string + ofs);
+  /* Only the byte count is wanted; the decoded code point is discarded. */
+  return u8_mbtouc (&unused, start, s.length - ofs);
+}
 
 /* Initializes ST as an empty string. */
 void
diff --git a/src/libpspp/str.h b/src/libpspp/str.h
index ecf9e6eb3f..ddfd2f82b7 100644
--- a/src/libpspp/str.h
+++ b/src/libpspp/str.h
@@ -23,6 +23,7 @@
 #include <stdint.h>
 #include <stdio.h>
 #include <string.h>
+#include <unitypes.h>
 
 #include "compiler.h"
 #include "memcasecmp.h"
@@ -127,6 +128,13 @@ int ss_equals (struct substring, struct substring);
 int ss_equals_case (struct substring, struct substring);
 size_t ss_pointer_to_position (struct substring, const char *);
 char *ss_xstrdup (struct substring);
+
+/* UTF-8: decode or measure the character at the start of, or OFS bytes into, a substring. */
+ucs4_t ss_first_mb (struct substring);
+int ss_first_mblen (struct substring);
+ucs4_t ss_get_mb (struct substring *);
+ucs4_t ss_at_mb (struct substring, size_t ofs);
+int ss_at_mblen (struct substring, size_t ofs);
 
 /* Variable length strings. */
 
-- 
2.30.2