From b19d2d4b3a4176869c6ba164f7a67ed3b11146ca Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Thu, 18 Nov 2010 21:25:39 -0800 Subject: [PATCH] str: Add some functions for handling UTF-8. --- Smake | 2 ++ src/libpspp/str.c | 76 +++++++++++++++++++++++++++++++++++++++++++++++ src/libpspp/str.h | 8 +++++ 3 files changed, 86 insertions(+) diff --git a/Smake b/Smake index 7855ce1e98..6e6b70235a 100644 --- a/Smake +++ b/Smake @@ -71,8 +71,10 @@ GNULIB_MODULES = \ unilbrk/ulc-width-linebreaks \ unistd \ unistr/u8-cpy \ + unistr/u8-mbtouc \ unistr/u8-strlen \ unistr/u8-strncat \ + unitypes \ unlocked-io \ vasprintf-posix \ version-etc \ diff --git a/src/libpspp/str.c b/src/libpspp/str.c index cd2363a002..7b677225b9 100644 --- a/src/libpspp/str.c +++ b/src/libpspp/str.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "libpspp/cast.h" #include "libpspp/message.h" @@ -740,6 +741,81 @@ ss_xstrdup (struct substring ss) s[ss.length] = '\0'; return s; } +/* UTF-8. */ + +/* Returns the character represented by the UTF-8 sequence at the start of S. + The return value is either a Unicode code point in the range 0 to 0x10ffff, + or UINT32_MAX if S is empty. + + This is a convenience wrapper that calls ss_at_mb with an offset of 0. */ +ucs4_t +ss_first_mb (struct substring s) +{ + return ss_at_mb (s, 0); +} + +/* Returns the number of bytes in the UTF-8 character at the beginning of S. + + The return value is 0 if S is empty, otherwise between 1 and 4. + + This is a convenience wrapper that calls ss_at_mblen with an offset of 0. */ +int +ss_first_mblen (struct substring s) +{ + return ss_at_mblen (s, 0); +} + +/* Advances S past the UTF-8 character at its beginning. Returns the Unicode + code point that was skipped (in the range 0 to 0x10ffff), or UINT32_MAX if S + was not modified because it was initially empty. 
*/ +ucs4_t +ss_get_mb (struct substring *s) +{ + if (s->length > 0) + { + ucs4_t uc; + int n; + /* N is the byte count returned by u8_mbtouc; S is advanced past that many bytes. */ + n = u8_mbtouc (&uc, CHAR_CAST (const uint8_t *, s->string), s->length); + s->string += n; + s->length -= n; + return uc; + } + else + return UINT32_MAX; +} + +/* Returns the character represented by the UTF-8 sequence starting OFS bytes + into S. The return value is either a Unicode code point in the range 0 to + 0x10ffff, or UINT32_MAX if OFS is past the last byte in S. + + (Returns 0xfffd if OFS points into the middle, not the beginning, of a UTF-8 + sequence.) + + The value returned by u8_mbtouc is discarded here; ss_at_mblen returns it + instead of the code point. */ +ucs4_t +ss_at_mb (struct substring s, size_t ofs) +{ + if (s.length > ofs) + { + ucs4_t uc; + u8_mbtouc (&uc, CHAR_CAST (const uint8_t *, s.string + ofs), + s.length - ofs); + return uc; + } + else + return UINT32_MAX; +} + +/* Returns the number of bytes occupied by the UTF-8 sequence starting OFS + bytes into S. The return value is 0 if OFS is past the last byte in S, + otherwise between 1 and 4. + + The code point that u8_mbtouc stores in UC is discarded; only the value it + returns is passed back to the caller. */ +int +ss_at_mblen (struct substring s, size_t ofs) +{ + if (s.length > ofs) + { + ucs4_t uc; + return u8_mbtouc (&uc, CHAR_CAST (const uint8_t *, s.string + ofs), + s.length - ofs); + } + else + return 0; +} /* Initializes ST as an empty string. */ void diff --git a/src/libpspp/str.h b/src/libpspp/str.h index ecf9e6eb3f..ddfd2f82b7 100644 --- a/src/libpspp/str.h +++ b/src/libpspp/str.h @@ -23,6 +23,7 @@ #include #include #include +#include #include "compiler.h" #include "memcasecmp.h" @@ -127,6 +128,13 @@ int ss_equals (struct substring, struct substring); int ss_equals_case (struct substring, struct substring); size_t ss_pointer_to_position (struct substring, const char *); char *ss_xstrdup (struct substring); + +/* UTF-8. */ +ucs4_t ss_first_mb (struct substring); +int ss_first_mblen (struct substring); +ucs4_t ss_get_mb (struct substring *); +ucs4_t ss_at_mb (struct substring, size_t ofs); +int ss_at_mblen (struct substring, size_t ofs); /* Variable length strings. */ -- 2.30.2