#include <string.h>
#include <unicase.h>
#include <unigbrk.h>
+#include <uniwidth.h>
#include "libpspp/assertion.h"
#include "libpspp/compiler.h"
#include "libpspp/hmapx.h"
#include "libpspp/hash-functions.h"
+#include "libpspp/misc.h"
#include "libpspp/pool.h"
#include "libpspp/str.h"
#include "libpspp/version.h"
return prefix_len + tail_len;
}
+/* Returns the number of display columns that would be occupied by the LENGTH
+   bytes of UTF-8 starting at S.
+
+   Each character other than a tab adds the width that uc_width() reports for
+   it; characters for which uc_width() reports a zero or negative width add
+   nothing.  A tab sets the column count to ROUND_UP (columns + 1, 8), that
+   is, the next 8-column tab stop (assuming ROUND_UP rounds its first
+   argument up to a multiple of its second). */
+size_t
+utf8_count_columns (const char *s_, size_t length)
+{
+  const uint8_t *s = CHAR_CAST (const uint8_t *, s_);
+
+  size_t columns = 0;
+
+  /* OFS has the same type as LENGTH so that the loop condition does not mix
+     signed and unsigned operands and the offset cannot overflow an int on
+     very long inputs. */
+  for (size_t ofs = 0; ofs < length; )
+    {
+      ucs4_t uc;
+      ofs += u8_mbtouc (&uc, s + ofs, length - ofs);
+      if (uc != '\t')
+        {
+          int width = uc_width (uc, "UTF-8");
+          if (width > 0)
+            columns += width;
+        }
+      else
+        columns = ROUND_UP (columns + 1, 8);
+    }
+  return columns;
+}
+
+/* Returns the byte offset in LENGTH-byte UTF-8 string S that is N_COLUMNS
+   display columns into the string.
+
+   Characters are consumed whole until the running column count reaches or
+   exceeds N_COLUMNS or the string is exhausted, so the returned offset always
+   falls on a character boundary and may lie past a wide character or tab
+   that straddles column N_COLUMNS.  Column widths are accumulated the same
+   way as in utf8_count_columns(): uc_width() for ordinary characters
+   (ignoring non-positive widths) and ROUND_UP (columns + 1, 8) for a tab. */
+size_t
+utf8_columns_to_bytes (const char *s_, size_t length, size_t n_columns)
+{
+  const uint8_t *s = CHAR_CAST (const uint8_t *, s_);
+
+  size_t columns = 0;
+
+  /* OFS is a size_t, like LENGTH and the return type, so that the comparison
+     below is not signed-versus-unsigned and the offset cannot overflow an
+     int on very long inputs. */
+  size_t ofs;
+  for (ofs = 0; ofs < length && columns < n_columns; )
+    {
+      ucs4_t uc;
+      ofs += u8_mbtouc (&uc, s + ofs, length - ofs);
+      if (uc != '\t')
+        {
+          int width = uc_width (uc, "UTF-8");
+          if (width > 0)
+            columns += width;
+        }
+      else
+        columns = ROUND_UP (columns + 1, 8);
+    }
+  return ofs;
+}
+
/* Returns an allocated, null-terminated string, owned by the caller,
containing as many characters[*] from the beginning of S that would fit
within MAX_LEN bytes if the returned string were to be re-encoded in
unsigned int
utf8_hash_case_string (const char *s, unsigned int basis)
{
- return utf8_hash_case_bytes (s, strlen (s), basis);
+ return utf8_hash_case_substring (ss_cstr (s), basis); /* Hash S by handing
+    ss_cstr (s) and BASIS to the substring variant. */
+}
+
+/* Computes a hash of the UTF-8 text in substring S, starting from BASIS, in
+   which lowercase and uppercase letters are treated as equal.  The work is
+   done by utf8_hash_case_bytes() on the substring's bytes. */
+unsigned int
+utf8_hash_case_substring (struct substring s, unsigned int basis)
+{
+  const char *bytes = s.string;
+  size_t n_bytes = s.length;
+
+  return utf8_hash_case_bytes (bytes, n_bytes, basis);
}
/* Compares UTF-8 strings A and B case-insensitively.
int
utf8_strcasecmp (const char *a, const char *b)
{
- return utf8_strncasecmp (a, strlen (a), b, strlen (b));
+ return utf8_sscasecmp (ss_cstr (a), ss_cstr (b)); /* Compare A and B by
+    handing ss_cstr (a) and ss_cstr (b) to the substring variant. */
+}
+
+/* Compares UTF-8 substrings A and B case-insensitively and returns the
+   result of utf8_strncasecmp() applied to their bytes and lengths. */
+int
+utf8_sscasecmp (struct substring a, struct substring b)
+{
+  const char *a_bytes = a.string;
+  size_t a_len = a.length;
+  const char *b_bytes = b.string;
+  size_t b_len = b.length;
+
+  return utf8_strncasecmp (a_bytes, a_len, b_bytes, b_len);
}
/* Compares UTF-8 strings A (with length AN) and B (with length BN)